Initial commit:

PhysX 3.4.0 Update @ 21294896 APEX 1.4.0 Update @ 21275617 [CL 21300167]
author: git perforce import user <a@b> 2016-10-25 12:29:14 -0600
committer: Sheikh Dawood Abdul Ajees <Sheikh Dawood Abdul Ajees> 2016-10-25 18:56:37 -0500
commit: 3dfe2108cfab31ba3ee5527e217d0d8e99a51162 (patch)
tree: fa6485c169e50d7415a651bf838f5bcd0fd3bfbd /APEX_1.4/common/include/ApexCudaDefs.h
download: physx-3.4-3dfe2108cfab31ba3ee5527e217d0d8e99a51162.tar.xz
physx-3.4-3dfe2108cfab31ba3ee5527e217d0d8e99a51162.zip
1 files changed, 310 insertions, 0 deletions
diff --git a/APEX_1.4/common/include/ApexCudaDefs.h b/APEX_1.4/common/include/ApexCudaDefs.h
new file mode 100644
index 00000000..6a089265
--- /dev/null
+++ b/APEX_1.4/common/include/ApexCudaDefs.h
@@ -0,0 +1,310 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto.  Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
+
+#ifndef APEX_CUDA_DEFS_H
+#define APEX_CUDA_DEFS_H
+
+#include <cuda.h>
+
+// Upper bound on constant memory usage (presumably bytes, i.e. 64 KB -- confirm at the usage sites).
+const unsigned int MAX_CONST_MEM_SIZE = 65536;
+
+// Alignment values used for CUDA memory and for texture-bound memory
+// (presumably in bytes -- APEX_CUDA_MEM_ALIGN_UP_32BIT below divides the former by 4).
+const unsigned int APEX_CUDA_MEM_ALIGNMENT = 256;
+const unsigned int APEX_CUDA_TEX_MEM_ALIGNMENT = 512;
+
+// Maximum number of shared memory banks assumed by the kernels.
+const unsigned int MAX_SMEM_BANKS = 32;
+
+
+// Rounds 'value' up to the next multiple of 'alignment'.
+// 'alignment' must be a power of two; it is evaluated twice, 'value' exactly once.
+//
+// The mask is built in the common arithmetic type of both arguments:
+// "0 ? (value) : x" never evaluates 'value', it only converts x to that common type.
+// Without this, an unsigned int alignment (such as APEX_CUDA_MEM_ALIGNMENT) combined with
+// a 64-bit value would give a zero-extended 32-bit mask and clear the upper half of the result.
+#define APEX_CUDA_ALIGN_UP(value, alignment) (((value) + ((alignment) - 1)) & ~(0 ? (value) : ((alignment) - 1)))
+
+// Rounds 'count' up to a multiple of APEX_CUDA_MEM_ALIGNMENT / 4 (= 64).
+// Presumably 'count' is a number of 32-bit elements, so the byte size becomes
+// a multiple of APEX_CUDA_MEM_ALIGNMENT.
+#define APEX_CUDA_MEM_ALIGN_UP_32BIT(count) APEX_CUDA_ALIGN_UP(count, APEX_CUDA_MEM_ALIGNMENT >> 2)
+
+// Warp size and its base-2 logarithm (1 << 5 == 32 threads).
+const unsigned int LOG2_WARP_SIZE = 5;
+const unsigned int WARP_SIZE = (1U << LOG2_WARP_SIZE);
+
+//if you would like to make this value larger than 32 for future GPUs,
+//then you'll need to fix some kernels (like reduce and scan) to support more than 32 warps per block!!!
+const unsigned int MAX_WARPS_PER_BLOCK = 32;
+// MAX_WARPS_PER_BLOCK * WARP_SIZE == 1024 threads.
+const unsigned int MAX_THREADS_PER_BLOCK = (MAX_WARPS_PER_BLOCK << LOG2_WARP_SIZE);
+
+// Upper limit on the number of blocks (CTAs) used by bound kernels.
+const unsigned int MAX_BOUND_BLOCKS = 64;
+
+//uncomment this line to force bound kernels to use defined number of CTAs
+//#define APEX_CUDA_FORCED_BLOCKS 60
+
+
+namespace nvidia
+{
+namespace apex
+{
+
+// Bit flags describing a memory direction: IN and OUT occupy separate bits and
+// IN_OUT is their combination. Presumably IN/OUT mean "read by" / "written by"
+// a kernel -- confirm at the usage sites.
+struct ApexCudaMemFlags
+{
+	enum Enum
+	{
+		UNUSED = 0,         // neither bit set
+		IN     = 1 << 0,    // 0x01
+		OUT    = 1 << 1,    // 0x02
+		IN_OUT = IN | OUT   // 0x03
+	};
+};
+
+#ifndef __CUDACC__
+
+// Host-side wrapper around a CUDA driver API array handle (CUarray) together
+// with the CUDA_ARRAY3D_DESCRIPTOR that describes it.
+//
+// The wrapper either owns the handle (arrays made by create(), or handed to
+// assign() with bTakeOwnership == true) and destroys it in release(), or it
+// merely references a handle that is destroyed elsewhere.
+//
+// The host copy helpers pick the dimensionality from the descriptor:
+//   Width > 0, Height == 0             -> 1D
+//   Width > 0, Height > 0, Depth == 0  -> 2D
+//   Width > 0, Height > 0, Depth > 0   -> 3D
+// All copies are issued through the *Async driver calls on the given stream;
+// none of the methods synchronizes the stream.
+class ApexCudaArray : public UserAllocated
+{
+	PX_NOCOPY(ApexCudaArray)
+
+	// Recomputes mElemSize (bytes per element = bytes per channel * NumChannels)
+	// from mDesc. Unknown formats assert and yield an element size of 0.
+	void init()
+	{
+		switch (mDesc.Format)
+		{
+		case CU_AD_FORMAT_UNSIGNED_INT8:
+		case CU_AD_FORMAT_SIGNED_INT8:
+			mElemSize = 1;
+			break;
+		case CU_AD_FORMAT_UNSIGNED_INT16:
+		case CU_AD_FORMAT_SIGNED_INT16:
+		case CU_AD_FORMAT_HALF:
+			mElemSize = 2;
+			break;
+		case CU_AD_FORMAT_UNSIGNED_INT32:
+		case CU_AD_FORMAT_SIGNED_INT32:
+		case CU_AD_FORMAT_FLOAT:
+			mElemSize = 4;
+			break;
+		default:
+			PX_ALWAYS_ASSERT();
+			mElemSize = 0;
+			break;
+		}
+		mElemSize *= mDesc.NumChannels;
+	}
+
+public:
+	// Creates an empty (invalid) wrapper. mDesc is value-initialized so that the
+	// accessors and the copy helpers see an all-zero descriptor instead of
+	// indeterminate memory.
+	ApexCudaArray() : mCuArray(NULL), mHasOwnership(false), mElemSize(0), mDesc() {}
+	~ApexCudaArray() { release(); }
+
+	// Releases the current array (if any) and wraps cuArray instead. The
+	// descriptor is queried from the driver. When bTakeOwnership is true the
+	// handle is destroyed by release() / the destructor.
+	void assign(CUarray cuArray, bool bTakeOwnership)
+	{
+		release();
+
+		mCuArray = cuArray;
+		mHasOwnership = bTakeOwnership;
+		CUT_SAFE_CALL(cuArray3DGetDescriptor(&mDesc, mCuArray));
+		init();
+	}
+
+	// Makes sure this object owns an array matching desc. An already owned array
+	// with an identical descriptor is kept as is; otherwise the current array is
+	// released and a new one is created.
+	void create(CUDA_ARRAY3D_DESCRIPTOR desc)
+	{
+		const bool canReuse =
+			mCuArray != NULL && mHasOwnership &&
+			mDesc.Width == desc.Width && mDesc.Height == desc.Height && mDesc.Depth == desc.Depth &&
+			mDesc.Format == desc.Format && mDesc.NumChannels == desc.NumChannels && mDesc.Flags == desc.Flags;
+		if (canReuse)
+		{
+			return;
+		}
+		release();
+
+		// Allocate CUDA 3d array in device memory
+		mDesc = desc;
+		CUT_SAFE_CALL(cuArray3DCreate(&mCuArray, &mDesc));
+		mHasOwnership = true;
+		init();
+	}
+
+	// Convenience overload that fills a descriptor from the individual values.
+	// surfUsage == true sets the CUDA_ARRAY3D_SURFACE_LDST flag.
+	void create(CUarray_format format, unsigned int numChannels, unsigned int width, unsigned int height, unsigned int depth = 0, bool surfUsage = false)
+	{
+		CUDA_ARRAY3D_DESCRIPTOR desc = CUDA_ARRAY3D_DESCRIPTOR();
+		desc.Format = format;
+		desc.NumChannels = numChannels;
+		desc.Width = width;
+		desc.Height = height;
+		desc.Depth = depth;
+		desc.Flags = surfUsage ? CUDA_ARRAY3D_SURFACE_LDST : 0u;
+
+		create(desc);
+	}
+
+	// Destroys the array if it is owned and returns the object to the same state
+	// as a default-constructed one (NULL handle, no ownership, zero element size,
+	// all-zero descriptor).
+	void release()
+	{
+		if (mCuArray != NULL && mHasOwnership)
+		{
+			CUT_SAFE_CALL(cuArrayDestroy(mCuArray));
+		}
+		mCuArray = NULL;
+		mHasOwnership = false;
+		mElemSize = 0;
+		// Drop the stale dimensions too, so getWidth()/getDesc() and the copy
+		// helpers do not act on the descriptor of an array that is gone.
+		mDesc = CUDA_ARRAY3D_DESCRIPTOR();
+	}
+
+	// Enqueues a copy from the array to host memory on 'stream'.
+	//   dstHost    - destination host pointer
+	//   dstPitch   - destination row pitch in bytes (2D/3D); 0 = copy width in bytes
+	//   dstHeight  - destination slice height in rows (3D only); 0 = copy height
+	//   copyWidth  - elements per row to copy; 0 = full array width
+	//   copyHeight - rows to copy (2D/3D); 0 = full array height
+	//   copyDepth  - slices to copy (3D only); 0 = full array depth
+	// Does nothing when the descriptor width is 0.
+	void copyToHost(CUstream stream, void* dstHost, size_t dstPitch = 0, size_t dstHeight = 0, 
+		size_t copyWidth = 0, size_t copyHeight = 0, size_t copyDepth = 0)
+	{
+		if (mDesc.Width == 0)
+		{
+			return;
+		}
+		const size_t widthInBytes = size_t(copyWidth ? copyWidth : mDesc.Width) * mElemSize;
+
+		if (mDesc.Height == 0)
+		{
+			//1D
+			CUT_SAFE_CALL(cuMemcpyAtoHAsync(dstHost, mCuArray, 0, widthInBytes, stream));
+		}
+		else if (mDesc.Depth == 0)
+		{
+			//2D
+			// Value-initialized: all offsets and every field not set below are 0/NULL.
+			CUDA_MEMCPY2D copyDesc = CUDA_MEMCPY2D();
+			copyDesc.WidthInBytes = widthInBytes;
+			copyDesc.Height = copyHeight ? copyHeight : mDesc.Height;
+
+			copyDesc.srcMemoryType = CU_MEMORYTYPE_ARRAY;
+			copyDesc.srcArray = mCuArray;
+
+			copyDesc.dstMemoryType = CU_MEMORYTYPE_HOST;
+			copyDesc.dstHost = dstHost;
+			copyDesc.dstPitch = (dstPitch > 0) ? dstPitch : copyDesc.WidthInBytes;
+			CUT_SAFE_CALL(cuMemcpy2DAsync(&copyDesc, stream));
+		}
+		else
+		{
+			//3D
+			// Value-initialized: offsets, LODs and the reserved0/reserved1 members
+			// (which cuda.h requires to be NULL) are all zero.
+			CUDA_MEMCPY3D copyDesc = CUDA_MEMCPY3D();
+			copyDesc.WidthInBytes = widthInBytes;
+			copyDesc.Height = copyHeight ? copyHeight : mDesc.Height;
+			copyDesc.Depth = copyDepth ? copyDepth : mDesc.Depth;
+
+			copyDesc.srcMemoryType = CU_MEMORYTYPE_ARRAY;
+			copyDesc.srcArray = mCuArray;
+
+			copyDesc.dstMemoryType = CU_MEMORYTYPE_HOST;
+			copyDesc.dstHost = dstHost;
+			copyDesc.dstPitch = (dstPitch > 0) ? dstPitch : copyDesc.WidthInBytes;
+			copyDesc.dstHeight = (dstHeight > 0) ? dstHeight : copyDesc.Height;
+			CUT_SAFE_CALL(cuMemcpy3DAsync(&copyDesc, stream));
+		}
+	}
+
+	// Enqueues a copy of the full array extent from host memory on 'stream'.
+	//   srcHost   - source host pointer
+	//   srcPitch  - source row pitch in bytes (2D/3D); 0 = array width in bytes
+	//   srcHeight - source slice height in rows (3D only); 0 = array height
+	// Does nothing when the descriptor width is 0.
+	void copyFromHost(CUstream stream, const void* srcHost, size_t srcPitch = 0, size_t srcHeight = 0)
+	{
+		if (mDesc.Width == 0)
+		{
+			return;
+		}
+		const size_t widthInBytes = size_t(mDesc.Width) * mElemSize;
+
+		if (mDesc.Height == 0)
+		{
+			//1D
+			CUT_SAFE_CALL(cuMemcpyHtoAAsync(mCuArray, 0, srcHost, widthInBytes, stream));
+		}
+		else if (mDesc.Depth == 0)
+		{
+			//2D
+			// Value-initialized: all offsets and every field not set below are 0/NULL.
+			CUDA_MEMCPY2D copyDesc = CUDA_MEMCPY2D();
+			copyDesc.WidthInBytes = widthInBytes;
+			copyDesc.Height = mDesc.Height;
+
+			copyDesc.srcMemoryType = CU_MEMORYTYPE_HOST;
+			copyDesc.srcHost = srcHost;
+			copyDesc.srcPitch = (srcPitch > 0) ? srcPitch : copyDesc.WidthInBytes;
+
+			copyDesc.dstMemoryType = CU_MEMORYTYPE_ARRAY;
+			copyDesc.dstArray = mCuArray;
+
+			CUT_SAFE_CALL(cuMemcpy2DAsync(&copyDesc, stream));
+		}
+		else
+		{
+			//3D
+			// Value-initialized: offsets, LODs and the reserved0/reserved1 members
+			// (which cuda.h requires to be NULL) are all zero.
+			CUDA_MEMCPY3D copyDesc = CUDA_MEMCPY3D();
+			copyDesc.WidthInBytes = widthInBytes;
+			copyDesc.Height = mDesc.Height;
+			copyDesc.Depth = mDesc.Depth;
+
+			copyDesc.srcMemoryType = CU_MEMORYTYPE_HOST;
+			copyDesc.srcHost = srcHost;
+			copyDesc.srcPitch = (srcPitch > 0) ? srcPitch : copyDesc.WidthInBytes;
+			copyDesc.srcHeight = (srcHeight > 0) ? srcHeight : copyDesc.Height;
+
+			copyDesc.dstMemoryType = CU_MEMORYTYPE_ARRAY;
+			copyDesc.dstArray = mCuArray;
+
+			CUT_SAFE_CALL(cuMemcpy3DAsync(&copyDesc, stream));
+		}
+	}
+
+	// Enqueues an array-to-array copy of this array's extent into dstArray on 'stream'.
+	// NOTE(review): Height and Depth are taken verbatim from the descriptor, so they
+	// are 0 for 1D/2D arrays -- confirm against the cuMemcpy3D documentation whether
+	// such a request copies anything, or whether those extents must be at least 1.
+	void copyToArray(CUstream stream, CUarray dstArray)
+	{
+		//copy array to array
+		// Value-initialized: offsets, LODs and the reserved0/reserved1 members
+		// (which cuda.h requires to be NULL) are all zero.
+		CUDA_MEMCPY3D desc = CUDA_MEMCPY3D();
+		desc.srcMemoryType = CU_MEMORYTYPE_ARRAY;
+		desc.srcArray = mCuArray;
+
+		desc.dstMemoryType = CU_MEMORYTYPE_ARRAY;
+		desc.dstArray = dstArray;
+
+		desc.WidthInBytes = size_t(mDesc.Width) * mElemSize;
+		desc.Height = mDesc.Height;
+		desc.Depth = mDesc.Depth;
+		CUT_SAFE_CALL(cuMemcpy3DAsync(&desc, stream));
+	}
+
+	// Raw driver handle; NULL when no array is held.
+	PX_INLINE CUarray getCuArray() const
+	{
+		return mCuArray;
+	}
+	// True when a (non-NULL) array handle is held.
+	PX_INLINE bool isValid() const
+	{
+		return (mCuArray != NULL);
+	}
+
+	// Descriptor accessors; the dimensions are narrowed to unsigned int.
+	PX_INLINE unsigned int	getWidth() const  { return (unsigned int)mDesc.Width; }
+	PX_INLINE unsigned int	getHeight() const { return (unsigned int)mDesc.Height; }
+	PX_INLINE unsigned int	getDepth() const  { return (unsigned int)mDesc.Depth; }
+	PX_INLINE CUarray_format	getFormat() const { return mDesc.Format; }
+	PX_INLINE unsigned int	getNumChannels() const { return mDesc.NumChannels; }
+
+	PX_INLINE bool			hasOwnership() const { return mHasOwnership; }
+
+	PX_INLINE const CUDA_ARRAY3D_DESCRIPTOR& getDesc() const { return mDesc; }
+
+	// Size of the array contents in bytes: Width * element size, multiplied by
+	// Height and Depth when those are non-zero. Returns 0 for an empty wrapper.
+	PX_INLINE size_t		getByteSize() const
+	{
+		size_t size = mDesc.Width * mElemSize;
+		if (mDesc.Height > 0) size *= mDesc.Height;
+		if (mDesc.Depth > 0) size *= mDesc.Depth;
+		return size;
+	}
+
+private:
+	CUarray					mCuArray;		// wrapped handle, NULL when empty
+	bool					mHasOwnership;	// true if release() must destroy mCuArray
+	unsigned int			mElemSize;		// bytes per element (all channels), see init()
+	CUDA_ARRAY3D_DESCRIPTOR	mDesc;			// descriptor of mCuArray, all-zero when empty
+};
+
+#endif //__CUDACC__
+
+}
+} // end namespace nvidia::apex
+
+#endif //APEX_CUDA_DEFS_H
author	git perforce import user <a@b>	2016-10-25 12:29:14 -0600
committer	Sheikh Dawood Abdul Ajees <Sheikh Dawood Abdul Ajees>	2016-10-25 18:56:37 -0500
commit	3dfe2108cfab31ba3ee5527e217d0d8e99a51162 (patch)
tree	fa6485c169e50d7415a651bf838f5bcd0fd3bfbd /APEX_1.4/common/include/ApexCudaDefs.h
download	physx-3.4-3dfe2108cfab31ba3ee5527e217d0d8e99a51162.tar.xz physx-3.4-3dfe2108cfab31ba3ee5527e217d0d8e99a51162.zip