about | summary | refs | log | tree | commit | diff
path: root/APEX_1.4/common/include/ApexCudaDefs.h
diff options
context:
space:
mode:
author: git perforce import user <a@b> 2016-10-25 12:29:14 -0600
committer: Sheikh Dawood Abdul Ajees <Sheikh Dawood Abdul Ajees> 2016-10-25 18:56:37 -0500
commit: 3dfe2108cfab31ba3ee5527e217d0d8e99a51162 (patch)
tree: fa6485c169e50d7415a651bf838f5bcd0fd3bfbd /APEX_1.4/common/include/ApexCudaDefs.h
download: physx-3.4-3dfe2108cfab31ba3ee5527e217d0d8e99a51162.tar.xz
          physx-3.4-3dfe2108cfab31ba3ee5527e217d0d8e99a51162.zip
Initial commit:
PhysX 3.4.0 Update @ 21294896 APEX 1.4.0 Update @ 21275617 [CL 21300167]
Diffstat (limited to 'APEX_1.4/common/include/ApexCudaDefs.h')
-rw-r--r-- APEX_1.4/common/include/ApexCudaDefs.h 310
1 files changed, 310 insertions, 0 deletions
diff --git a/APEX_1.4/common/include/ApexCudaDefs.h b/APEX_1.4/common/include/ApexCudaDefs.h
new file mode 100644
index 00000000..6a089265
--- /dev/null
+++ b/APEX_1.4/common/include/ApexCudaDefs.h
@@ -0,0 +1,310 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
+
+#ifndef APEX_CUDA_DEFS_H
+#define APEX_CUDA_DEFS_H
+
+#include <cuda.h>
+
+// 64 KB; presumably the capacity of CUDA constant memory in bytes (confirm against users of this constant).
+const unsigned int MAX_CONST_MEM_SIZE = 65536;
+
+// Alignment granularities used with the macros below; presumably in bytes, for
+// general device memory and for texture-bound memory respectively.
+const unsigned int APEX_CUDA_MEM_ALIGNMENT = 256;
+const unsigned int APEX_CUDA_TEX_MEM_ALIGNMENT = 512;
+
+// Presumably the number of shared-memory banks the kernels assume.
+const unsigned int MAX_SMEM_BANKS = 32;
+
+
+// Rounds 'value' up to the next multiple of 'alignment'.
+//
+// Deliberately written as divide/multiply rather than "& ~((alignment)-1)":
+// when 'alignment' is a 32-bit unsigned and 'value' is 64-bit, the 32-bit mask
+// is zero-extended before the AND and wipes the upper 32 bits of the result.
+// For the power-of-two alignments used in this file the result is the same as
+// the mask form; with a compile-time constant alignment the division is
+// expected to be folded by the compiler.
+//
+// 'value' is evaluated exactly once; 'alignment' is evaluated several times and
+// must be a non-zero, side-effect-free expression.
+#define APEX_CUDA_ALIGN_UP(value, alignment) ((((value) + ((alignment) - 1)) / (alignment)) * (alignment))
+
+// Rounds a count of 32-bit elements up to a multiple of
+// (APEX_CUDA_MEM_ALIGNMENT >> 2) elements, i.e. APEX_CUDA_MEM_ALIGNMENT / 4.
+#define APEX_CUDA_MEM_ALIGN_UP_32BIT(count) APEX_CUDA_ALIGN_UP(count, APEX_CUDA_MEM_ALIGNMENT >> 2)
+
+// Warp size expressed as a power of two: WARP_SIZE == 1 << LOG2_WARP_SIZE == 32.
+const unsigned int LOG2_WARP_SIZE = 5;
+const unsigned int WARP_SIZE = (1U << LOG2_WARP_SIZE);
+
+// If you would like to make this value larger than 32 for future GPUs,
+// you will need to fix some kernels (such as reduce and scan) to support more than 32 warps per block!
+const unsigned int MAX_WARPS_PER_BLOCK = 32;
+// MAX_WARPS_PER_BLOCK * WARP_SIZE == 1024.
+const unsigned int MAX_THREADS_PER_BLOCK = (MAX_WARPS_PER_BLOCK << LOG2_WARP_SIZE);
+
+// Upper limit on the number of blocks for "bound" kernels (see the note below); the consumers are outside this file.
+const unsigned int MAX_BOUND_BLOCKS = 64;
+
+// Uncomment this line to force bound kernels to use a fixed number of CTAs.
+//#define APEX_CUDA_FORCED_BLOCKS 60
+
+
+namespace nvidia
+{
+namespace apex
+{
+
+// Bit flags wrapped in a struct so the enumerators are scoped as
+// ApexCudaMemFlags::<name>. IN and OUT occupy separate bits and IN_OUT is
+// their union. Presumably they describe the direction in which a memory
+// argument is accessed; confirm against the code that consumes them.
+struct ApexCudaMemFlags
+{
+    enum Enum
+    {
+        UNUSED = 0,
+        IN     = 1 << 0,
+        OUT    = 1 << 1,
+        IN_OUT = IN | OUT
+    };
+};
+
+#ifndef __CUDACC__
+
+// Host-side wrapper around a CUDA driver-API array handle (CUarray).
+//
+// The wrapper caches the array's CUDA_ARRAY3D_DESCRIPTOR and the size in bytes
+// of one element (bytes per channel * number of channels). It either owns the
+// handle (created through create(), or passed to assign() with
+// bTakeOwnership == true) and destroys it in release() / the destructor, or it
+// only refers to a handle that is destroyed elsewhere.
+//
+// Dimensionality is taken from the cached descriptor: Height == 0 is handled as
+// a 1D array and Depth == 0 as a 2D array.
+//
+// While no array is held, the cached descriptor is all zeros, so the getters
+// return 0 and the copy methods do nothing.
+class ApexCudaArray : public UserAllocated
+{
+    PX_NOCOPY(ApexCudaArray)
+
+    // Recomputes mElemSize from mDesc: bytes per channel for the format,
+    // multiplied by the channel count. An unrecognised format asserts and
+    // yields an element size of 0.
+    void init()
+    {
+        switch (mDesc.Format)
+        {
+        case CU_AD_FORMAT_UNSIGNED_INT8:
+        case CU_AD_FORMAT_SIGNED_INT8:
+            mElemSize = 1;
+            break;
+        case CU_AD_FORMAT_UNSIGNED_INT16:
+        case CU_AD_FORMAT_SIGNED_INT16:
+        case CU_AD_FORMAT_HALF:
+            mElemSize = 2;
+            break;
+        case CU_AD_FORMAT_UNSIGNED_INT32:
+        case CU_AD_FORMAT_SIGNED_INT32:
+        case CU_AD_FORMAT_FLOAT:
+            mElemSize = 4;
+            break;
+        default:
+            PX_ALWAYS_ASSERT();
+            mElemSize = 0;
+            break;
+        }
+        mElemSize *= mDesc.NumChannels;
+    }
+
+    // Zeroes the cached descriptor so that getters and the Width > 0 guards in
+    // the copy methods behave predictably while no array is held.
+    void resetDesc()
+    {
+        CUDA_ARRAY3D_DESCRIPTOR emptyDesc = {};
+        mDesc = emptyDesc;
+    }
+
+public:
+    // Starts out empty: no array, no ownership, zeroed descriptor.
+    ApexCudaArray() : mCuArray(NULL), mHasOwnership(false), mElemSize(0)
+    {
+        resetDesc();
+    }
+
+    // Releases the held array (destroying it only when owned).
+    ~ApexCudaArray() { release(); }
+
+    // Drops whatever is currently held and wraps an existing array instead.
+    // The descriptor is queried from the driver.
+    //   cuArray        - array handle to wrap
+    //   bTakeOwnership - when true, release() will destroy the array
+    void assign(CUarray cuArray, bool bTakeOwnership)
+    {
+        release();
+
+        mCuArray = cuArray;
+        mHasOwnership = bTakeOwnership;
+        CUT_SAFE_CALL(cuArray3DGetDescriptor(&mDesc, mCuArray));
+        init();
+    }
+
+    // Creates an owned array matching 'desc'. If the wrapper already owns an
+    // array whose cached descriptor is equal field by field, that array is
+    // kept and nothing is allocated.
+    void create(CUDA_ARRAY3D_DESCRIPTOR desc)
+    {
+        if (mCuArray != NULL && mHasOwnership &&
+            mDesc.Width == desc.Width && mDesc.Height == desc.Height && mDesc.Depth == desc.Depth &&
+            mDesc.Format == desc.Format && mDesc.NumChannels == desc.NumChannels && mDesc.Flags == desc.Flags)
+        {
+            return;
+        }
+        release();
+
+        // Allocate CUDA 3d array in device memory
+        mDesc = desc;
+        CUT_SAFE_CALL(cuArray3DCreate(&mCuArray, &mDesc));
+        mHasOwnership = true;
+        init();
+    }
+
+    // Convenience overload that fills in a descriptor and forwards to
+    // create(desc). 'surfUsage' selects the CUDA_ARRAY3D_SURFACE_LDST flag;
+    // height/depth of 0 request a lower-dimensional array.
+    void create(CUarray_format format, unsigned int numChannels, unsigned int width, unsigned int height, unsigned int depth = 0, bool surfUsage = false)
+    {
+        CUDA_ARRAY3D_DESCRIPTOR desc;
+        desc.Format = format;
+        desc.NumChannels = numChannels;
+        desc.Width = width;
+        desc.Height = height;
+        desc.Depth = depth;
+        desc.Flags = surfUsage ? CUDA_ARRAY3D_SURFACE_LDST : 0u;
+
+        create(desc);
+    }
+
+    // Forgets the held array, destroying it first when it is owned, and
+    // returns the wrapper to the empty state (including a zeroed descriptor).
+    // Does nothing when no array is held.
+    void release()
+    {
+        if (mCuArray != NULL)
+        {
+            if (mHasOwnership)
+            {
+                CUT_SAFE_CALL(cuArrayDestroy(mCuArray));
+            }
+            mCuArray = NULL;
+            mHasOwnership = false;
+            mElemSize = 0;
+            resetDesc();
+        }
+    }
+
+    // Enqueues an asynchronous array -> host copy on 'stream', starting at the
+    // array origin.
+    //   dstHost    - destination host buffer
+    //   dstPitch   - destination row pitch in bytes; 0 means tightly packed
+    //   dstHeight  - destination slice height in rows (3D only); 0 means the copied height
+    //   copyWidth  - elements per row to copy; 0 means the full array width
+    //   copyHeight - rows to copy (2D/3D); 0 means the full array height
+    //   copyDepth  - slices to copy (3D); 0 means the full array depth
+    // Does nothing when the cached width is 0.
+    void copyToHost(CUstream stream, void* dstHost, size_t dstPitch = 0, size_t dstHeight = 0,
+                    size_t copyWidth = 0, size_t copyHeight = 0, size_t copyDepth = 0)
+    {
+        if (mDesc.Width == 0)
+        {
+            return;
+        }
+
+        const size_t widthInBytes = size_t(copyWidth ? copyWidth : mDesc.Width) * mElemSize;
+
+        if (mDesc.Height == 0)
+        {
+            //1D
+            CUT_SAFE_CALL(cuMemcpyAtoHAsync(dstHost, mCuArray, 0, widthInBytes, stream));
+        }
+        else if (mDesc.Depth == 0)
+        {
+            //2D
+            // Zero-initialise so offsets and every field not set below are well defined.
+            CUDA_MEMCPY2D copyDesc = {};
+            copyDesc.WidthInBytes = widthInBytes;
+            copyDesc.Height = copyHeight ? copyHeight : mDesc.Height;
+
+            copyDesc.srcMemoryType = CU_MEMORYTYPE_ARRAY;
+            copyDesc.srcArray = mCuArray;
+
+            copyDesc.dstMemoryType = CU_MEMORYTYPE_HOST;
+            copyDesc.dstHost = dstHost;
+            copyDesc.dstPitch = (dstPitch > 0) ? dstPitch : copyDesc.WidthInBytes;
+            CUT_SAFE_CALL(cuMemcpy2DAsync(&copyDesc, stream));
+        }
+        else
+        {
+            //3D
+            // Zero-initialise: offsets and LODs must be 0 here, and the
+            // structure's reserved fields must not hold garbage.
+            CUDA_MEMCPY3D copyDesc = {};
+            copyDesc.WidthInBytes = widthInBytes;
+            copyDesc.Height = copyHeight ? copyHeight : mDesc.Height;
+            copyDesc.Depth = copyDepth ? copyDepth : mDesc.Depth;
+
+            copyDesc.srcMemoryType = CU_MEMORYTYPE_ARRAY;
+            copyDesc.srcArray = mCuArray;
+
+            copyDesc.dstMemoryType = CU_MEMORYTYPE_HOST;
+            copyDesc.dstHost = dstHost;
+            copyDesc.dstPitch = (dstPitch > 0) ? dstPitch : copyDesc.WidthInBytes;
+            copyDesc.dstHeight = (dstHeight > 0) ? dstHeight : copyDesc.Height;
+            CUT_SAFE_CALL(cuMemcpy3DAsync(&copyDesc, stream));
+        }
+    }
+
+    // Enqueues an asynchronous host -> array copy of the whole array on 'stream'.
+    //   srcHost   - source host buffer
+    //   srcPitch  - source row pitch in bytes; 0 means tightly packed
+    //   srcHeight - source slice height in rows (3D only); 0 means the array height
+    // Does nothing when the cached width is 0.
+    void copyFromHost(CUstream stream, const void* srcHost, size_t srcPitch = 0, size_t srcHeight = 0)
+    {
+        if (mDesc.Width == 0)
+        {
+            return;
+        }
+
+        const size_t widthInBytes = size_t(mDesc.Width) * mElemSize;
+
+        if (mDesc.Height == 0)
+        {
+            //1D
+            CUT_SAFE_CALL(cuMemcpyHtoAAsync(mCuArray, 0, srcHost, widthInBytes, stream));
+        }
+        else if (mDesc.Depth == 0)
+        {
+            //2D
+            // Zero-initialise so offsets and every field not set below are well defined.
+            CUDA_MEMCPY2D copyDesc = {};
+            copyDesc.WidthInBytes = widthInBytes;
+            copyDesc.Height = mDesc.Height;
+
+            copyDesc.srcMemoryType = CU_MEMORYTYPE_HOST;
+            copyDesc.srcHost = srcHost;
+            copyDesc.srcPitch = (srcPitch > 0) ? srcPitch : copyDesc.WidthInBytes;
+
+            copyDesc.dstMemoryType = CU_MEMORYTYPE_ARRAY;
+            copyDesc.dstArray = mCuArray;
+
+            CUT_SAFE_CALL(cuMemcpy2DAsync(&copyDesc, stream));
+        }
+        else
+        {
+            //3D
+            // Zero-initialise: offsets and LODs must be 0 here, and the
+            // structure's reserved fields must not hold garbage.
+            CUDA_MEMCPY3D copyDesc = {};
+            copyDesc.WidthInBytes = widthInBytes;
+            copyDesc.Height = mDesc.Height;
+            copyDesc.Depth = mDesc.Depth;
+
+            copyDesc.srcMemoryType = CU_MEMORYTYPE_HOST;
+            copyDesc.srcHost = srcHost;
+            copyDesc.srcPitch = (srcPitch > 0) ? srcPitch : copyDesc.WidthInBytes;
+            copyDesc.srcHeight = (srcHeight > 0) ? srcHeight : copyDesc.Height;
+
+            copyDesc.dstMemoryType = CU_MEMORYTYPE_ARRAY;
+            copyDesc.dstArray = mCuArray;
+
+            CUT_SAFE_CALL(cuMemcpy3DAsync(&copyDesc, stream));
+        }
+    }
+
+    // Enqueues an asynchronous array -> array copy of the whole array into
+    // 'dstArray' on 'stream'. The destination is not checked here; it is
+    // assumed to be at least as large and of a compatible format.
+    // Does nothing when the cached width is 0.
+    void copyToArray(CUstream stream, CUarray dstArray)
+    {
+        if (mDesc.Width == 0)
+        {
+            return;
+        }
+
+        //copy array to array
+        // Zero-initialise: offsets and LODs must be 0 here, and the
+        // structure's reserved fields must not hold garbage.
+        CUDA_MEMCPY3D desc = {};
+        desc.srcMemoryType = CU_MEMORYTYPE_ARRAY;
+        desc.srcArray = mCuArray;
+
+        desc.dstMemoryType = CU_MEMORYTYPE_ARRAY;
+        desc.dstArray = dstArray;
+
+        desc.WidthInBytes = size_t(mDesc.Width) * mElemSize;
+        // The descriptor stores 0 for dimensions a 1D/2D array does not have;
+        // use 1 there so the 3D copy region is not empty.
+        desc.Height = (mDesc.Height > 0) ? mDesc.Height : 1;
+        desc.Depth = (mDesc.Depth > 0) ? mDesc.Depth : 1;
+        CUT_SAFE_CALL(cuMemcpy3DAsync(&desc, stream));
+    }
+
+    // Raw array handle; NULL when no array is held.
+    PX_INLINE CUarray getCuArray() const
+    {
+        return mCuArray;
+    }
+    // True while an array handle is held.
+    PX_INLINE bool isValid() const
+    {
+        return (mCuArray != NULL);
+    }
+
+    // Cached descriptor fields (all 0 while no array is held).
+    PX_INLINE unsigned int getWidth() const { return (unsigned int)mDesc.Width; }
+    PX_INLINE unsigned int getHeight() const { return (unsigned int)mDesc.Height; }
+    PX_INLINE unsigned int getDepth() const { return (unsigned int)mDesc.Depth; }
+    PX_INLINE CUarray_format getFormat() const { return mDesc.Format; }
+    PX_INLINE unsigned int getNumChannels() const { return mDesc.NumChannels; }
+
+    // True when release() will destroy the held array.
+    PX_INLINE bool hasOwnership() const { return mHasOwnership; }
+
+    PX_INLINE const CUDA_ARRAY3D_DESCRIPTOR& getDesc() const { return mDesc; }
+
+    // Total size in bytes: width * element size, times height and depth when
+    // those dimensions are present (non-zero).
+    PX_INLINE size_t getByteSize() const
+    {
+        size_t size = mDesc.Width * mElemSize;
+        if (mDesc.Height > 0) size *= mDesc.Height;
+        if (mDesc.Depth > 0) size *= mDesc.Depth;
+        return size;
+    }
+
+private:
+    CUarray mCuArray;               // held array handle, NULL when empty
+    bool mHasOwnership;             // whether release() destroys mCuArray
+    unsigned int mElemSize;         // bytes per element (all channels), see init()
+    CUDA_ARRAY3D_DESCRIPTOR mDesc;  // cached descriptor of mCuArray, zeroed when empty
+};
+
+#endif //__CUDACC__
+
+}
+} // end namespace nvidia::apex
+
+#endif //APEX_CUDA_DEFS_H