diff options
| author | git perforce import user <a@b> | 2016-10-25 12:29:14 -0600 |
|---|---|---|
| committer | Sheikh Dawood Abdul Ajees <Sheikh Dawood Abdul Ajees> | 2016-10-25 18:56:37 -0500 |
| commit | 3dfe2108cfab31ba3ee5527e217d0d8e99a51162 (patch) | |
| tree | fa6485c169e50d7415a651bf838f5bcd0fd3bfbd /APEX_1.4/common/include/ApexCudaDefs.h | |
| download | physx-3.4-3dfe2108cfab31ba3ee5527e217d0d8e99a51162.tar.xz physx-3.4-3dfe2108cfab31ba3ee5527e217d0d8e99a51162.zip | |
Initial commit:
PhysX 3.4.0 Update @ 21294896
APEX 1.4.0 Update @ 21275617
[CL 21300167]
Diffstat (limited to 'APEX_1.4/common/include/ApexCudaDefs.h')
| -rw-r--r-- | APEX_1.4/common/include/ApexCudaDefs.h | 310 |
1 files changed, 310 insertions, 0 deletions
diff --git a/APEX_1.4/common/include/ApexCudaDefs.h b/APEX_1.4/common/include/ApexCudaDefs.h new file mode 100644 index 00000000..6a089265 --- /dev/null +++ b/APEX_1.4/common/include/ApexCudaDefs.h @@ -0,0 +1,310 @@ +/* + * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. + */ + + +#ifndef APEX_CUDA_DEFS_H +#define APEX_CUDA_DEFS_H + +#include <cuda.h> + +const unsigned int MAX_CONST_MEM_SIZE = 65536; + +const unsigned int APEX_CUDA_MEM_ALIGNMENT = 256; +const unsigned int APEX_CUDA_TEX_MEM_ALIGNMENT = 512; + +const unsigned int MAX_SMEM_BANKS = 32; + + +#define APEX_CUDA_ALIGN_UP(value, alignment) (((value) + (alignment)-1) & ~((alignment)-1)) +#define APEX_CUDA_MEM_ALIGN_UP_32BIT(count) APEX_CUDA_ALIGN_UP(count, APEX_CUDA_MEM_ALIGNMENT >> 2) + +const unsigned int LOG2_WARP_SIZE = 5; +const unsigned int WARP_SIZE = (1U << LOG2_WARP_SIZE); + +//if you would like to make this value larger than 32 for future GPUs, +//then you'll need to fix some kernels (like reduce and scan) to support more than 32 warps per block!!! +const unsigned int MAX_WARPS_PER_BLOCK = 32; +const unsigned int MAX_THREADS_PER_BLOCK = (MAX_WARPS_PER_BLOCK << LOG2_WARP_SIZE); + +const unsigned int MAX_BOUND_BLOCKS = 64; + +//uncomment this line to force bound kernels to use defined number of CTAs +//#define APEX_CUDA_FORCED_BLOCKS 60 + + +namespace nvidia +{ +namespace apex +{ + +struct ApexCudaMemFlags +{ + enum Enum + { + UNUSED = 0, + IN = 0x01, + OUT = 0x02, + IN_OUT = IN | OUT + }; +}; + +#ifndef __CUDACC__ + +class ApexCudaArray : public UserAllocated +{ + PX_NOCOPY(ApexCudaArray) + + void init() + { + switch (mDesc.Format) + { + case CU_AD_FORMAT_UNSIGNED_INT8: + case CU_AD_FORMAT_SIGNED_INT8: + mElemSize = 1; + break; + case CU_AD_FORMAT_UNSIGNED_INT16: + case CU_AD_FORMAT_SIGNED_INT16: + case CU_AD_FORMAT_HALF: + mElemSize = 2; + break; + case CU_AD_FORMAT_UNSIGNED_INT32: + case CU_AD_FORMAT_SIGNED_INT32: + case CU_AD_FORMAT_FLOAT: + mElemSize = 4; + break; + default: + PX_ALWAYS_ASSERT(); + mElemSize = 0; + break; + }; + mElemSize *= mDesc.NumChannels; + } + +public: + ApexCudaArray() : mCuArray(NULL), mHasOwnership(false), mElemSize(0) {} + ~ApexCudaArray() { release(); } + + void assign(CUarray cuArray, bool bTakeOwnership) + { + release(); + + mCuArray = cuArray; + mHasOwnership = bTakeOwnership; + CUT_SAFE_CALL(cuArray3DGetDescriptor(&mDesc, mCuArray)); + init(); + } + + void create(CUDA_ARRAY3D_DESCRIPTOR desc) + { + if (mCuArray != NULL && mHasOwnership && + mDesc.Width == desc.Width && mDesc.Height == desc.Height && mDesc.Depth == desc.Depth && + mDesc.Format == desc.Format && mDesc.NumChannels == desc.NumChannels && mDesc.Flags == desc.Flags) + { + return; + } + release(); + + // Allocate CUDA 3d array in device memory + mDesc = desc; + CUT_SAFE_CALL(cuArray3DCreate(&mCuArray, &mDesc)); + mHasOwnership = true; + init(); + } + + void create(CUarray_format format, unsigned int numChannels, unsigned int width, unsigned int height, unsigned int depth = 0, bool surfUsage = false) + { + CUDA_ARRAY3D_DESCRIPTOR desc; + desc.Format = format; + desc.NumChannels = numChannels; + desc.Width = width; + desc.Height = height; + desc.Depth = depth; + desc.Flags = surfUsage ? CUDA_ARRAY3D_SURFACE_LDST : 0u; + + create(desc); + } + + void release() + { + if (mCuArray != NULL) + { + if (mHasOwnership) + { + CUT_SAFE_CALL(cuArrayDestroy(mCuArray)); + } + mCuArray = NULL; + mHasOwnership = false; + mElemSize = 0; + } + } + + void copyToHost(CUstream stream, void* dstHost, size_t dstPitch = 0, size_t dstHeight = 0, + size_t copyWidth = 0, size_t copyHeight = 0, size_t copyDepth = 0) + { + if (mDesc.Width > 0) + { + if (mDesc.Height > 0) + { + if (mDesc.Depth > 0) + { + //3D + CUDA_MEMCPY3D copyDesc; + copyDesc.WidthInBytes = size_t(copyWidth ? copyWidth : mDesc.Width) * mElemSize; + copyDesc.Height = copyHeight ? copyHeight : mDesc.Height; + copyDesc.Depth = copyDepth ? copyDepth : mDesc.Depth; + + copyDesc.srcXInBytes = copyDesc.srcY = copyDesc.srcZ = copyDesc.srcLOD = 0; + copyDesc.srcMemoryType = CU_MEMORYTYPE_ARRAY; + copyDesc.srcArray = mCuArray; + + copyDesc.dstXInBytes = copyDesc.dstY = copyDesc.dstZ = copyDesc.dstLOD = 0; + copyDesc.dstMemoryType = CU_MEMORYTYPE_HOST; + copyDesc.dstHost = dstHost; + copyDesc.dstPitch = (dstPitch > 0) ? dstPitch : copyDesc.WidthInBytes; + copyDesc.dstHeight = (dstHeight > 0) ? dstHeight : copyDesc.Height; + CUT_SAFE_CALL(cuMemcpy3DAsync(©Desc, stream)); + } + else + { + //2D + CUDA_MEMCPY2D copyDesc; + copyDesc.WidthInBytes = size_t(copyWidth ? copyWidth : mDesc.Width) * mElemSize; + copyDesc.Height = copyHeight ? copyHeight : mDesc.Height; + + copyDesc.srcXInBytes = copyDesc.srcY = 0; + copyDesc.srcMemoryType = CU_MEMORYTYPE_ARRAY; + copyDesc.srcArray = mCuArray; + + copyDesc.dstXInBytes = copyDesc.dstY = 0; + copyDesc.dstMemoryType = CU_MEMORYTYPE_HOST; + copyDesc.dstHost = dstHost; + copyDesc.dstPitch = (dstPitch > 0) ? dstPitch : copyDesc.WidthInBytes; + CUT_SAFE_CALL(cuMemcpy2DAsync(©Desc, stream)); + } + } + else + { + //1D + CUT_SAFE_CALL(cuMemcpyAtoHAsync(dstHost, mCuArray, 0, size_t(copyWidth ? copyWidth : mDesc.Width) * mElemSize, stream)); + } + } + } + + void copyFromHost(CUstream stream, const void* srcHost, size_t srcPitch = 0, size_t srcHeight = 0) + { + if (mDesc.Width > 0) + { + if (mDesc.Height > 0) + { + if (mDesc.Depth > 0) + { + //3D + CUDA_MEMCPY3D copyDesc; + copyDesc.WidthInBytes = size_t(mDesc.Width) * mElemSize; + copyDesc.Height = mDesc.Height; + copyDesc.Depth = mDesc.Depth; + + copyDesc.srcXInBytes = copyDesc.srcY = copyDesc.srcZ = copyDesc.srcLOD = 0; + copyDesc.srcMemoryType = CU_MEMORYTYPE_HOST; + copyDesc.srcHost = srcHost; + copyDesc.srcPitch = (srcPitch > 0) ? srcPitch : copyDesc.WidthInBytes; + copyDesc.srcHeight = (srcHeight > 0) ? srcHeight : copyDesc.Height; + + copyDesc.dstXInBytes = copyDesc.dstY = copyDesc.dstZ = copyDesc.dstLOD = 0; + copyDesc.dstMemoryType = CU_MEMORYTYPE_ARRAY; + copyDesc.dstArray = mCuArray; + + CUT_SAFE_CALL(cuMemcpy3DAsync(©Desc, stream)); + } + else + { + //2D + CUDA_MEMCPY2D copyDesc; + copyDesc.WidthInBytes = size_t(mDesc.Width) * mElemSize; + copyDesc.Height = mDesc.Height; + + copyDesc.srcXInBytes = copyDesc.srcY = 0; + copyDesc.srcMemoryType = CU_MEMORYTYPE_HOST; + copyDesc.srcHost = srcHost; + copyDesc.srcPitch = (srcPitch > 0) ? srcPitch : copyDesc.WidthInBytes; + + copyDesc.dstXInBytes = copyDesc.dstY = 0; + copyDesc.dstMemoryType = CU_MEMORYTYPE_ARRAY; + copyDesc.dstArray = mCuArray; + + CUT_SAFE_CALL(cuMemcpy2DAsync(©Desc, stream)); + } + } + else + { + //1D + CUT_SAFE_CALL(cuMemcpyHtoAAsync(mCuArray, 0, srcHost, size_t(mDesc.Width) * mElemSize, stream)); + } + } + } + + void copyToArray(CUstream stream, CUarray dstArray) + { + //copy array to array + CUDA_MEMCPY3D desc; + desc.srcXInBytes = desc.srcY = desc.srcZ = desc.srcLOD = 0; + desc.srcMemoryType = CU_MEMORYTYPE_ARRAY; + desc.srcArray = mCuArray; + + desc.dstXInBytes = desc.dstY = desc.dstZ = desc.dstLOD = 0; + desc.dstMemoryType = CU_MEMORYTYPE_ARRAY; + desc.dstArray = dstArray; + + desc.WidthInBytes = size_t(mDesc.Width) * mElemSize; + desc.Height = mDesc.Height; + desc.Depth = mDesc.Depth; + CUT_SAFE_CALL(cuMemcpy3DAsync(&desc, stream)); + } + + PX_INLINE CUarray getCuArray() const + { + return mCuArray; + } + PX_INLINE bool isValid() const + { + return (mCuArray != NULL); + } + + PX_INLINE unsigned int getWidth() const { return (unsigned int)mDesc.Width; } + PX_INLINE unsigned int getHeight() const { return (unsigned int)mDesc.Height; } + PX_INLINE unsigned int getDepth() const { return (unsigned int)mDesc.Depth; } + PX_INLINE CUarray_format getFormat() const { return mDesc.Format; } + PX_INLINE unsigned int getNumChannels() const { return mDesc.NumChannels; } + + PX_INLINE bool hasOwnership() const { return mHasOwnership; } + + PX_INLINE const CUDA_ARRAY3D_DESCRIPTOR& getDesc() const { return mDesc; } + + PX_INLINE size_t getByteSize() const + { + size_t size = mDesc.Width * mElemSize; + if (mDesc.Height > 0) size *= mDesc.Height; + if (mDesc.Depth > 0) size *= mDesc.Depth; + return size; + } + +private: + CUarray mCuArray; + bool mHasOwnership; + unsigned int mElemSize; + CUDA_ARRAY3D_DESCRIPTOR mDesc; +}; + +#endif //__CUDACC__ + +} +} // end namespace nvidia::apex + +#endif //APEX_CUDA_DEFS_H |