Initial commit:

PhysX 3.4.0 Update @ 21294896 APEX 1.4.0 Update @ 21275617 [CL 21300167]
author: git perforce import user <a@b> 2016-10-25 12:29:14 -0600
committer: Sheikh Dawood Abdul Ajees <Sheikh Dawood Abdul Ajees> 2016-10-25 18:56:37 -0500
commit: 3dfe2108cfab31ba3ee5527e217d0d8e99a51162 (patch)
tree: fa6485c169e50d7415a651bf838f5bcd0fd3bfbd /APEX_1.4/common/include/ApexCudaWrapper.h
download: physx-3.4-3dfe2108cfab31ba3ee5527e217d0d8e99a51162.tar.xz
physx-3.4-3dfe2108cfab31ba3ee5527e217d0d8e99a51162.zip
1 files changed, 1232 insertions, 0 deletions
diff --git a/APEX_1.4/common/include/ApexCudaWrapper.h b/APEX_1.4/common/include/ApexCudaWrapper.h
new file mode 100644
index 00000000..c455fbaf
--- /dev/null
+++ b/APEX_1.4/common/include/ApexCudaWrapper.h
@@ -0,0 +1,1232 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto.  Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
+
+#ifndef __APEX_CUDA_WRAPPER_H__
+#define __APEX_CUDA_WRAPPER_H__
+
+#include <cuda.h>
+#include "ApexCutil.h"
+#include "vector_types.h"
+#include "ApexMirroredArray.h"
+#include "InplaceStorage.h"
+#include "PsMutex.h"
+#include "ApexCudaTest.h"
+#include "ApexCudaProfile.h"
+#include "ApexCudaDefs.h"
+
+namespace nvidia
+{
+namespace apex
+{
+
+struct DimGrid
+{
+	uint32_t x, y;
+
+	DimGrid() {}
+	DimGrid(uint32_t x, uint32_t y = 1)
+	{
+		this->x = x;
+		this->y = y;
+	}
+};
+struct DimBlock
+{
+	uint32_t x, y, z;
+
+	DimBlock() {}
+	DimBlock(uint32_t x, uint32_t y = 1, uint32_t z = 1)
+	{
+		this->x = x;
+		this->y = y;
+		this->z = z;
+	}
+};
+
+struct ApexKernelConfig
+{
+	uint32_t fixedSharedMemDWords;
+	uint32_t sharedMemDWordsPerWarp;
+	DimBlock blockDim;
+	uint32_t minWarpsPerBlock;
+	uint32_t maxGridSize;
+
+	ApexKernelConfig() { fixedSharedMemDWords = sharedMemDWordsPerWarp = 0; blockDim = DimBlock(0, 0, 0); minWarpsPerBlock = 1; maxGridSize = MAX_BOUND_BLOCKS; }
+	ApexKernelConfig(uint32_t fixedSharedMemDWords, uint32_t sharedMemDWordsPerWarp, int fixedWarpsPerBlock = 0, uint32_t minWarpsPerBlock = 1, uint32_t maxGridSize = MAX_BOUND_BLOCKS)
+	{
+		this->fixedSharedMemDWords = fixedSharedMemDWords;
+		this->sharedMemDWordsPerWarp = sharedMemDWordsPerWarp;
+		this->blockDim = DimBlock(fixedWarpsPerBlock * WARP_SIZE);
+		this->minWarpsPerBlock = minWarpsPerBlock;
+		this->maxGridSize = maxGridSize;
+	}
+	ApexKernelConfig(uint32_t fixedSharedMemDWords, uint32_t sharedMemDWordsPerWarp, const DimBlock& blockDim)
+	{
+		this->fixedSharedMemDWords = fixedSharedMemDWords;
+		this->sharedMemDWordsPerWarp = sharedMemDWordsPerWarp;
+		this->blockDim = blockDim;
+		this->minWarpsPerBlock = 1;
+		this->maxGridSize = MAX_BOUND_BLOCKS;
+	}
+};
+
+struct ApexCudaMemRefBase
+{
+	typedef ApexCudaMemFlags::Enum Intent;
+
+	const void*	ptr;
+	size_t		size;	//size in bytes
+	int32_t		offset;	//data offset for ptr
+	Intent		intent;
+	
+	ApexCudaMemRefBase(const void* ptr, size_t byteSize, int32_t offset, Intent intent)
+		: ptr(ptr), size(byteSize), offset(offset), intent(intent) {}
+	virtual ~ApexCudaMemRefBase() {}
+};
+
+template <class T>
+struct ApexCudaMemRef : public ApexCudaMemRefBase
+{
+	ApexCudaMemRef(T* ptr, size_t byteSize, Intent intent = ApexCudaMemFlags::IN_OUT) 
+		: ApexCudaMemRefBase(ptr, byteSize, 0, intent) {}
+
+	ApexCudaMemRef(T* ptr, size_t byteSize, int32_t offset, Intent intent) 
+		: ApexCudaMemRefBase(ptr, byteSize, offset, intent) {}
+
+	inline T* getPtr() const
+	{
+		return (T*)ptr;
+	}
+
+	virtual ~ApexCudaMemRef() {}
+};
+
+template <class T>
+inline ApexCudaMemRef<T> createApexCudaMemRef(T* ptr, size_t size, ApexCudaMemRefBase::Intent intent = ApexCudaMemFlags::IN_OUT)
+{
+	return ApexCudaMemRef<T>(ptr, sizeof(T) * size, intent);
+}
+
+template <class T>
+inline ApexCudaMemRef<T> createApexCudaMemRef(T* ptr, size_t size, int32_t offset, ApexCudaMemRefBase::Intent intent)
+{
+	return ApexCudaMemRef<T>(ptr, sizeof(T) * size, sizeof(T) * offset, intent);
+}
+
+template <class T>
+inline ApexCudaMemRef<T> createApexCudaMemRef(const ApexMirroredArray<T>& ma, ApexCudaMemRefBase::Intent intent = ApexCudaMemFlags::IN_OUT)
+{
+	return ApexCudaMemRef<T>(ma.getGpuPtr(), ma.getByteSize(), intent);
+}
+
+template <class T>
+inline ApexCudaMemRef<T> createApexCudaMemRef(const ApexMirroredArray<T>& ma, size_t size, ApexCudaMemRefBase::Intent intent = ApexCudaMemFlags::IN_OUT)
+{
+	return ApexCudaMemRef<T>(ma.getGpuPtr(), sizeof(T) * size, intent);
+}
+
+template <class T>
+inline ApexCudaMemRef<T> createApexCudaMemRef(const ApexMirroredArray<T>& ma, size_t size, int32_t offset, ApexCudaMemRefBase::Intent intent = ApexCudaMemFlags::IN_OUT)
+{
+	return ApexCudaMemRef<T>(ma.getGpuPtr(), sizeof(T) * size, sizeof(T) * offset, intent);
+}
+
+#ifndef ALIGN_OFFSET
+#define ALIGN_OFFSET(offset, alignment) (offset) = ((offset) + (alignment) - 1) & ~((alignment) - 1)
+#endif
+
+#define CUDA_MAX_PARAM_SIZE		256
+
+
+class ApexCudaTestKernelContext;
+
+
+class ApexCudaConstStorage;
+
+class ApexCudaModule
+{
+public:
+	ApexCudaModule()
+		: mCuModule(0), mStorage(0)
+	{
+	}
+
+	PX_INLINE void init(const void* image)
+	{
+		if (mCuModule == 0)
+		{
+			CUT_SAFE_CALL(cuModuleLoadDataEx(&mCuModule, image, 0, NULL, NULL));
+		}
+	}
+	PX_INLINE void release()
+	{
+		if (mCuModule != 0)
+		{
+			CUT_SAFE_CALL(cuModuleUnload(mCuModule));
+			mCuModule = 0;
+		}
+	}
+
+	PX_INLINE bool isValid() const
+	{
+		return (mCuModule != 0);
+	}
+
+	PX_INLINE CUmodule getCuModule() const
+	{
+		return mCuModule;
+	}
+
+	PX_INLINE ApexCudaConstStorage* getStorage() const
+	{
+		return mStorage;
+	}
+
+private:
+	CUmodule mCuModule;
+	ApexCudaConstStorage* mStorage;
+
+	friend class ApexCudaConstStorage;
+};
+
+class ApexCudaObjManager;
+
+class ApexCudaObj
+{
+	friend class ApexCudaObjManager;
+	ApexCudaObj* mObjListNext;
+
+protected:
+	const char* mName;
+	ApexCudaModule* mCudaModule;
+	ApexCudaObjManager* mManager;
+
+	ApexCudaObj(const char* name) : mObjListNext(0), mName(name), mCudaModule(NULL), mManager(NULL) {}
+	virtual ~ApexCudaObj() {}
+
+	PX_INLINE void init(ApexCudaObjManager* manager, ApexCudaModule* cudaModule);
+
+public:
+	const char* getName() const
+	{
+		return mName;
+	}
+	const ApexCudaModule* getCudaModule() const
+	{
+		return mCudaModule;
+	}
+
+	enum ApexCudaObjType
+	{
+		UNKNOWN,
+		FUNCTION,
+		TEXTURE,
+		CONST_STORAGE,
+		SURFACE
+	};
+	virtual ApexCudaObjType getType()
+	{
+		return UNKNOWN;
+	}
+
+	PX_INLINE ApexCudaObj* next()
+	{
+		return mObjListNext;
+	}
+	virtual void release() = 0;	
+	virtual void formContext(ApexCudaTestKernelContext*) = 0;
+};
+
+struct ApexCudaDeviceTraits
+{
+	uint32_t mMaxSharedMemPerBlock;
+	uint32_t mMaxSharedMemPerSM;
+	uint32_t mMaxRegistersPerSM;
+	uint32_t mMaxThreadsPerSM;
+
+	uint32_t mBlocksPerSM;
+	uint32_t mBlocksPerSM_2D;
+	uint32_t mBlocksPerSM_3D;
+	uint32_t mMaxBlocksPerGrid;
+};
+
+class ApexCudaObjManager
+{
+	ApexCudaObj* mObjListHead;
+
+	Module*					mNxModule;
+	ApexCudaTestManager*		mCudaTestManager;	
+	PxGpuDispatcher*			mGpuDispatcher;
+
+	ApexCudaDeviceTraits		mDeviceTraits;
+
+protected:
+	friend class ApexCudaFunc;
+	ApexCudaProfileSession*		mCudaProfileSession;
+
+public:
+	ApexCudaObjManager() : mObjListHead(0), mNxModule(0), mCudaTestManager(0), mGpuDispatcher(0), mCudaProfileSession(0) {}
+
+	void init(Module*	nxModule, ApexCudaTestManager* cudaTestManager, PxGpuDispatcher*	gpuDispatcher)
+	{
+		mNxModule = nxModule;
+		mCudaTestManager = cudaTestManager;
+		mGpuDispatcher = gpuDispatcher;
+
+		//get device traits
+		CUdevice device;
+		CUT_SAFE_CALL(cuCtxGetDevice(&device));
+		CUT_SAFE_CALL(cuDeviceGetAttribute((int*)&mDeviceTraits.mMaxSharedMemPerBlock, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, device));
+		CUT_SAFE_CALL(cuDeviceGetAttribute((int*)&mDeviceTraits.mMaxSharedMemPerSM, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR, device));
+		CUT_SAFE_CALL(cuDeviceGetAttribute((int*)&mDeviceTraits.mMaxRegistersPerSM, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR, device));
+		CUT_SAFE_CALL(cuDeviceGetAttribute((int*)&mDeviceTraits.mMaxThreadsPerSM, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, device));
+
+#ifdef APEX_CUDA_FORCED_BLOCKS
+		mDeviceTraits.mBlocksPerSM = (APEX_CUDA_FORCED_BLOCKS > 32) ? 2u : 1u;
+		mDeviceTraits.mMaxBlocksPerGrid = APEX_CUDA_FORCED_BLOCKS;
+#else
+		int computeMajor;
+		int smCount;
+		CUT_SAFE_CALL(cuDeviceGetAttribute(&smCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device));
+		CUT_SAFE_CALL(cuDeviceGetAttribute(&computeMajor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device));
+
+		mDeviceTraits.mBlocksPerSM = 2;//(computeMajor >= 5) ? 2u : 1u;
+		mDeviceTraits.mMaxBlocksPerGrid = uint32_t(smCount) * mDeviceTraits.mBlocksPerSM;
+#endif
+		mDeviceTraits.mBlocksPerSM_2D = 4;
+		mDeviceTraits.mBlocksPerSM_3D = 4;
+	}
+
+	PX_INLINE const ApexCudaDeviceTraits& getDeviceTraits() const
+	{
+		return mDeviceTraits;
+	}
+
+	PX_INLINE void addToObjList(ApexCudaObj* obj)
+	{
+		obj->mObjListNext = mObjListHead;
+		mObjListHead = obj;
+	}
+
+	PX_INLINE ApexCudaObj* getObjListHead()
+	{
+		return mObjListHead;
+	}
+
+	void releaseAll()
+	{
+		for (ApexCudaObj* obj = mObjListHead; obj != 0; obj = obj->mObjListNext)
+		{
+			obj->release();
+		}
+	}
+
+	PX_INLINE Module* getModule() const
+	{
+		return mNxModule;
+	}
+	PX_INLINE ApexCudaTestManager* getCudaTestManager() const
+	{
+		return mCudaTestManager;
+	}
+	PX_INLINE PxGpuDispatcher* getGpuDispatcher() const
+	{
+		return mGpuDispatcher;
+	}
+
+public:
+	virtual void onBeforeLaunchApexCudaFunc(const ApexCudaFunc& func, CUstream stream) = 0;
+	virtual void onAfterLaunchApexCudaFunc(const ApexCudaFunc& func, CUstream stream) = 0;
+
+};
+
+PX_INLINE void ApexCudaObj::init(ApexCudaObjManager* manager, ApexCudaModule* cudaModule)
+{
+	mManager = manager;
+	mManager->addToObjList(this);
+	mCudaModule = cudaModule;
+}
+
+
+class ApexCudaTexRef : public ApexCudaObj
+{
+public:
+	void init(ApexCudaObjManager* manager, CUtexref texRef, ApexCudaModule* cudaModule, CUarray_format format, int numChannels, int dim, int flags)
+	{
+		ApexCudaObj::init(manager, cudaModule);
+
+		mTexRef = texRef;
+		mDim = dim;
+		mFormat = format;
+		mNumChannels = numChannels;
+		mFlags = flags;
+		mIsBinded = false;
+
+		CUT_SAFE_CALL(cuTexRefSetFilterMode(mTexRef, mFilterMode));
+
+		for (int d = 0; d < dim; ++d)
+		{
+			CUT_SAFE_CALL(cuTexRefSetAddressMode(mTexRef, d, CU_TR_ADDRESS_MODE_CLAMP));
+		}
+	}
+
+	ApexCudaTexRef(const char* name, CUfilter_mode filterMode = CU_TR_FILTER_MODE_POINT)
+		: ApexCudaObj(name), mTexRef(0), mFilterMode(filterMode)
+	{
+	}
+
+	void setNormalizedCoords()
+	{
+		mFlags |= CU_TRSF_NORMALIZED_COORDINATES;
+	}
+
+	void bindTo(const void* ptr, size_t bytes, size_t* retByteOffset = 0)
+	{
+		CUT_SAFE_CALL(cuTexRefSetFormat(mTexRef, mFormat, mNumChannels));
+		CUT_SAFE_CALL(cuTexRefSetFlags(mTexRef, (uint32_t)mFlags));
+
+		size_t byteOffset;
+		CUT_SAFE_CALL(cuTexRefSetAddress(&byteOffset, mTexRef, CUT_TODEVICE(ptr), static_cast<unsigned int>(bytes)));
+
+		if (retByteOffset != 0)
+		{
+			*retByteOffset = byteOffset;
+		}
+		else
+		{
+			PX_ASSERT(byteOffset == 0);
+		}
+
+		mBindedSize = bytes;
+		mBindedPtr = ptr;
+		mBindedArray = NULL;
+		mIsBinded = true;
+	}
+
+	template <typename T>
+	void bindTo(ApexMirroredArray<T>& mem, size_t* retByteOffset = 0)
+	{
+		bindTo(mem.getGpuPtr(), mem.getByteSize(), retByteOffset);
+	}
+
+	template <typename T>
+	void bindTo(ApexMirroredArray<T>& mem, size_t size, size_t* retByteOffset = 0)
+	{
+		bindTo(mem.getGpuPtr(), sizeof(T) * size, retByteOffset);
+	}
+
+	void bindTo(CUarray cuArray)
+	{
+		CUT_SAFE_CALL(cuTexRefSetFlags(mTexRef, (uint32_t)mFlags));
+
+		CUT_SAFE_CALL(cuTexRefSetArray(mTexRef, cuArray, CU_TRSA_OVERRIDE_FORMAT));
+
+		mBindedSize = 0;
+		mBindedPtr = NULL;
+		mBindedArray = cuArray;
+		mIsBinded = true;
+	}
+
+	void bindTo(const ApexCudaArray& cudaArray)
+	{
+		bindTo(cudaArray.getCuArray());
+	}
+
+	void unbind()
+	{
+		size_t byteOffset;
+		CUT_SAFE_CALL(cuTexRefSetAddress(&byteOffset, mTexRef, CUdeviceptr(0), 0));
+		mIsBinded = false;
+	}
+	
+	virtual ApexCudaObjType getType()
+	{
+		return TEXTURE;
+	}
+
+	virtual void release() {}
+	
+	virtual void formContext(ApexCudaTestKernelContext* context) 
+	{
+		if (mIsBinded)
+		{
+			context->addTexRef(mName, mBindedPtr, mBindedSize, mBindedArray);
+		}
+	}
+
+private:
+	CUtexref	mTexRef;
+	CUfilter_mode mFilterMode;
+
+	CUarray_format mFormat;
+	int			mNumChannels;
+	int			mDim;
+	int			mFlags;
+	
+	bool		mIsBinded;
+	size_t		mBindedSize;
+	const void*	mBindedPtr;
+	CUarray		mBindedArray;
+};
+
+
+class ApexCudaSurfRef : public ApexCudaObj
+{
+public:
+	void init(ApexCudaObjManager* manager, CUsurfref surfRef, ApexCudaModule* cudaModule)
+	{
+		ApexCudaObj::init(manager, cudaModule);
+
+		mSurfRef = surfRef;
+
+		mIsBinded = false;
+	}
+
+	ApexCudaSurfRef(const char* name) : ApexCudaObj(name), mSurfRef(0)
+	{
+	}
+
+	void bindTo(CUarray cuArray, ApexCudaMemFlags::Enum flags)
+	{
+		CUDA_ARRAY3D_DESCRIPTOR desc;
+		CUT_SAFE_CALL(cuArray3DGetDescriptor(&desc, cuArray));
+
+		CUT_SAFE_CALL(cuSurfRefSetArray(mSurfRef, cuArray, 0));
+
+		mIsBinded = true;
+		mBindedArray = cuArray;
+		mBindedFlags = flags;
+	}
+
+	void bindTo(const ApexCudaArray& cudaArray, ApexCudaMemFlags::Enum flags)
+	{
+		bindTo(cudaArray.getCuArray(), flags);
+	}
+
+	void unbind()
+	{
+		mIsBinded = false;
+	}
+
+	virtual ApexCudaObjType getType()
+	{
+		return SURFACE;
+	}
+
+	virtual void release() {}
+
+	virtual void formContext(ApexCudaTestKernelContext* context)
+	{
+		if (mIsBinded)
+		{
+			context->addSurfRef(mName, mBindedArray, mBindedFlags);
+		}
+	}
+
+private:
+	CUsurfref   mSurfRef;
+
+	bool		mIsBinded;
+	CUarray		mBindedArray;
+	ApexCudaMemFlags::Enum mBindedFlags;
+};
+
+class ApexCudaTexRefScopeBind
+{
+private:
+	ApexCudaTexRefScopeBind& operator=(const ApexCudaTexRefScopeBind&);
+	ApexCudaTexRef& mTexRef;
+
+public:
+	ApexCudaTexRefScopeBind(ApexCudaTexRef& texRef, void* ptr, size_t bytes, size_t* retByteOffset = 0)
+		: mTexRef(texRef)
+	{
+		mTexRef.bindTo(ptr, bytes, retByteOffset);
+	}
+	template <typename T>
+	ApexCudaTexRefScopeBind(ApexCudaTexRef& texRef, ApexMirroredArray<T>& mem, size_t* retByteOffset = 0)
+		: mTexRef(texRef)
+	{
+		mTexRef.bindTo(mem, retByteOffset);
+	}
+	template <typename T>
+	ApexCudaTexRefScopeBind(ApexCudaTexRef& texRef, ApexMirroredArray<T>& mem, size_t size, size_t* retByteOffset = 0)
+		: mTexRef(texRef)
+	{
+		mTexRef.bindTo(mem, size, retByteOffset);
+	}
+	ApexCudaTexRefScopeBind(ApexCudaTexRef& texRef, const ApexCudaArray& cudaArray)
+		: mTexRef(texRef)
+	{
+		mTexRef.bindTo(cudaArray);
+	}
+	~ApexCudaTexRefScopeBind()
+	{
+		mTexRef.unbind();
+	}
+};
+
+#define APEX_CUDA_TEXTURE_SCOPE_BIND(texRef, mem) ApexCudaTexRefScopeBind texRefScopeBind_##texRef (CUDA_OBJ(texRef), mem);
+#define APEX_CUDA_TEXTURE_SCOPE_BIND_SIZE(texRef, mem, size) ApexCudaTexRefScopeBind texRefScopeBind_##texRef (CUDA_OBJ(texRef), mem, size);
+#define APEX_CUDA_TEXTURE_SCOPE_BIND_PTR(texRef, ptr, count) ApexCudaTexRefScopeBind texRefScopeBind_##texRef (CUDA_OBJ(texRef), ptr, sizeof(*ptr) * count);
+#define APEX_CUDA_TEXTURE_BIND(texRef, mem) CUDA_OBJ(texRef).bindTo(mem);
+#define APEX_CUDA_TEXTURE_BIND_PTR(texRef, ptr, count) CUDA_OBJ(texRef).bindTo(ptr, sizeof(*ptr) * count);
+#define APEX_CUDA_TEXTURE_UNBIND(texRef) CUDA_OBJ(texRef).unbind();
+
+
+class ApexCudaSurfRefScopeBind
+{
+private:
+	ApexCudaSurfRefScopeBind& operator=(const ApexCudaSurfRefScopeBind&);
+	ApexCudaSurfRef& mSurfRef;
+
+public:
+	ApexCudaSurfRefScopeBind(ApexCudaSurfRef& surfRef, ApexCudaArray& cudaArray, ApexCudaMemFlags::Enum flags)
+		: mSurfRef(surfRef)
+	{
+		mSurfRef.bindTo(cudaArray, flags);
+	}
+	ApexCudaSurfRefScopeBind(ApexCudaSurfRef& surfRef, CUarray cuArray, ApexCudaMemFlags::Enum flags)
+		: mSurfRef(surfRef)
+	{
+		mSurfRef.bindTo(cuArray, flags);
+	}
+	~ApexCudaSurfRefScopeBind()
+	{
+		mSurfRef.unbind();
+	}
+};
+
+#define APEX_CUDA_SURFACE_SCOPE_BIND(surfRef, mem, flags) ApexCudaSurfRefScopeBind surfRefScopeBind_##surfRef (CUDA_OBJ(surfRef), mem, flags);
+#define APEX_CUDA_SURFACE_BIND(surfRef, mem, flags) CUDA_OBJ(surfRef).bindTo(mem, flags);
+#define APEX_CUDA_SURFACE_UNBIND(surfRef) CUDA_OBJ(surfRef).unbind();
+
+
+class ApexCudaVar : public ApexCudaObj
+{
+public:
+	size_t getSize() const
+	{
+		return mSize;
+	}
+
+	void init(ApexCudaObjManager* manager, ApexCudaModule* cudaModule, CUdeviceptr devPtr, size_t size, PxCudaContextManager* ctx)
+	{
+		ApexCudaObj::init(manager, cudaModule);
+
+		mDevPtr = devPtr;
+		mSize = size;
+		init(manager, ctx);
+	}
+
+	virtual void release() {}
+	virtual void formContext(ApexCudaTestKernelContext*) {}
+
+protected:
+	virtual void init(ApexCudaObjManager* , PxCudaContextManager*) = 0;
+
+	ApexCudaVar(const char* name) : ApexCudaObj(name), mDevPtr(0), mSize(0)
+	{
+	}
+
+protected:
+	CUdeviceptr		mDevPtr;
+	size_t			mSize;
+};
+
+
+class ApexCudaConstStorage : public ApexCudaVar, public InplaceStorage
+{
+public:
+	ApexCudaConstStorage(const char* nameVar, const char* nameTexRef)
+		: ApexCudaVar(nameVar), mCudaTexRef(nameTexRef), mStoreInTexture(false)
+	{
+		mStorageSize = 0;
+		mStoragePtr = 0;
+
+		mHostBuffer = 0;
+		mDeviceBuffer = 0;
+	}
+
+	virtual ApexCudaObjType getType()
+	{
+		return CONST_STORAGE;
+	}
+
+	virtual void formContext(ApexCudaTestKernelContext* context) 
+	{
+		if (!mStoreInTexture && mHostBuffer != 0)
+		{
+			PX_ASSERT(mHostBuffer->getSize() >= ApexCudaVar::getSize());
+			void* hostPtr = reinterpret_cast<void*>(mHostBuffer->getPtr());
+			context->addConstMem(mName, hostPtr, ApexCudaVar::getSize());
+		}
+	}
+
+	virtual void init(ApexCudaObjManager* manager, PxCudaContextManager* ctx)
+	{
+		PX_ASSERT(mCudaModule != 0);
+		PX_ASSERT(mCudaModule->mStorage == 0);
+		mCudaModule->mStorage = this;
+
+		CUtexref cuTexRef;
+		CUT_SAFE_CALL(cuModuleGetTexRef(&cuTexRef, mCudaModule->getCuModule(), mCudaTexRef.getName()));
+
+		mCudaTexRef.init(manager, cuTexRef, mCudaModule, CU_AD_FORMAT_SIGNED_INT32, 1, 1, CU_TRSF_READ_AS_INTEGER);
+
+		//prealloc. host buffer for Apex Cuda Test framework
+		reallocHostBuffer(ctx, ApexCudaVar::getSize());
+	}
+
+	virtual void release()
+	{
+		InplaceStorage::release();
+
+		if (mDeviceBuffer != 0)
+		{
+			mDeviceBuffer->free();
+			mDeviceBuffer = 0;
+		}
+		if (mHostBuffer != 0)
+		{
+			mHostBuffer->free();
+			mHostBuffer = 0;
+		}
+
+		if (mStoragePtr != 0)
+		{
+			getAllocator().deallocate(mStoragePtr);
+			mStoragePtr = 0;
+			mStorageSize = 0;
+		}
+	}
+
+	bool copyToDevice(PxCudaContextManager* ctx, CUstream stream)
+	{
+		if (mStoragePtr == 0)
+		{
+			return false;
+		}
+
+		bool result = false;
+
+		InplaceStorage* storage = static_cast<InplaceStorage*>(this);
+		mMutex.lock();
+		if (storage->isChanged())
+		{
+			if (!reallocHostBuffer(ctx, mStorageSize))
+			{
+				return false;
+			}
+
+			CUdeviceptr copyDevPtr = 0;
+			if (mStoreInTexture)
+			{
+				if (mDeviceBuffer == 0)
+				{
+					mDeviceBuffer = ctx->getMemoryManager()->alloc(
+						PxCudaBufferType(PxCudaBufferMemorySpace::T_GPU, PxCudaBufferFlags::F_READ_WRITE),
+						mStorageSize);
+					if (mDeviceBuffer == 0)
+					{
+						APEX_INTERNAL_ERROR("ApexCudaConstStorage failed to allocate GPU Memory!");
+						return false;
+					}
+				}
+				else if (mDeviceBuffer->getSize() < mStorageSize)
+				{
+					mDeviceBuffer->realloc(mStorageSize);
+				}
+				copyDevPtr = mDeviceBuffer->getPtr();
+			}
+			else
+			{
+				if (mDeviceBuffer != 0)
+				{
+					mDeviceBuffer->free();
+					mDeviceBuffer = 0;
+				}
+				copyDevPtr = mDevPtr;
+			}
+
+			uint8_t* hostPtr = reinterpret_cast<uint8_t*>(mHostBuffer->getPtr());
+
+			size_t size = storage->mapTo(hostPtr);
+			// padding up to the next dword
+			size = (size + 7) & ~7;
+			if (size > mStorageSize) size = mStorageSize;
+
+			CUT_SAFE_CALL(cuMemcpyHtoDAsync(copyDevPtr, hostPtr, size, stream));
+
+			storage->setUnchanged();
+			result = true;
+		}
+		mMutex.unlock();
+
+		return result;
+	}
+
+	PX_INLINE bool getStoreInTexture() const
+	{
+		return mStoreInTexture;
+	}
+
+	PX_INLINE void onBeforeLaunch()
+	{
+		if (mStoreInTexture)
+		{
+			mCudaTexRef.bindTo( mDeviceBuffer ? reinterpret_cast<void*>(mDeviceBuffer->getPtr()) : 0, mStorageSize );
+		}
+	}
+
+	PX_INLINE void onAfterLaunch()
+	{
+		if (mStoreInTexture)
+		{
+			mCudaTexRef.unbind();
+		}
+	}
+
+protected:
+	bool reallocHostBuffer(PxCudaContextManager* ctx, size_t size)
+	{
+		if (mHostBuffer == 0)
+		{
+			mHostBuffer = ctx->getMemoryManager()->alloc(
+				PxCudaBufferType(PxCudaBufferMemorySpace::T_PINNED_HOST, PxCudaBufferFlags::F_READ_WRITE),
+				size);
+			if (mHostBuffer == 0)
+			{
+				APEX_INTERNAL_ERROR("ApexCudaConstStorage failed to allocate Pinned Host Memory!");
+				return false;
+			}
+		}
+		else if (mHostBuffer->getSize() < size)
+		{
+			mHostBuffer->realloc(size);
+		}
+		return true;
+	}
+
+	virtual uint8_t* storageResizeBuffer(uint32_t newSize)
+	{
+		if (!mStoreInTexture && newSize > ApexCudaVar::getSize())
+		{
+#if 0
+			APEX_INTERNAL_ERROR("Out of CUDA constant memory");
+			PX_ALWAYS_ASSERT();
+			return 0;
+#else
+			//switch to texture
+			mStoreInTexture = true;
+#endif
+		}
+		else if (mStoreInTexture && newSize <= ApexCudaVar::getSize())
+		{
+			//switch back to const mem.
+			mStoreInTexture = false;
+		}
+
+		const uint32_t PageSize = 4096;
+		size_t allocSize = mStoreInTexture ? (newSize + (PageSize - 1)) & ~(PageSize - 1) : ApexCudaVar::getSize();
+
+		if (allocSize > mStorageSize)
+		{
+			uint8_t* allocStoragePtr = static_cast<uint8_t*>(getAllocator().allocate(allocSize, "ApexCudaConstStorage", __FILE__, __LINE__));
+			if (allocStoragePtr == 0)
+			{
+				APEX_INTERNAL_ERROR("ApexCudaConstStorage failed to allocate memory!");
+				return 0;
+			}
+			if (mStoragePtr != 0)
+			{
+				memcpy(allocStoragePtr, mStoragePtr, mStorageSize);
+				getAllocator().deallocate(mStoragePtr);
+			}
+			mStorageSize = allocSize;
+			mStoragePtr = allocStoragePtr;
+		}
+		return mStoragePtr;
+	}
+
+	virtual void storageLock()
+	{
+		mMutex.lock();
+	}
+	virtual void storageUnlock()
+	{
+		mMutex.unlock();
+	}
+
+private:
+	bool                    mStoreInTexture;
+	ApexCudaTexRef          mCudaTexRef;
+
+	size_t                  mStorageSize;
+	uint8_t*            mStoragePtr;
+
+	PxCudaBuffer*    mHostBuffer;
+	PxCudaBuffer*    mDeviceBuffer;
+
+	nvidia::Mutex            mMutex;
+
+	friend class ApexCudaTestKernelContextReader;
+};
+
+typedef InplaceStorageGroup ApexCudaConstMemGroup;
+
+#define APEX_CUDA_CONST_MEM_GROUP_SCOPE(group) INPLACE_STORAGE_GROUP_SCOPE(group)
+
+
+
+struct ApexCudaFuncParams
+{
+	int		mOffset;
+	char	mParams[CUDA_MAX_PARAM_SIZE];
+
+	ApexCudaFuncParams() : mOffset(0) {}
+
+
+};
+
+class ApexCudaFunc : public ApexCudaObj
+{
+public:
+	PX_INLINE bool testNameMatch(const char* name) const
+	{
+		if (const char* name$ = strrchr(name, '$'))
+		{
+			if (const char* name_ = strrchr(name, '_'))
+			{
+				return (nvidia::strncmp(name, mName, (uint32_t)(name_ - name)) == 0);
+			}
+		}
+		return (nvidia::strcmp(name, mName) == 0);
+	}
+
+	void init(ApexCudaObjManager* manager, const char* name, CUfunction cuFunc, ApexCudaModule* cudaModule)
+	{
+		int funcInstIndex = 0;
+		if (const char* name$ = strrchr(name, '$'))
+		{
+			funcInstIndex = atoi(name$ + 1);
+		}
+		if (funcInstIndex >= MAX_INST_COUNT)
+		{
+			PX_ALWAYS_ASSERT();
+			return;
+		}
+
+		if (mFuncInstCount == 0)
+		{
+			ApexCudaObj::init(manager, cudaModule);
+		}
+
+		PxCudaContextManager* ctx = mManager->mGpuDispatcher->getCudaContextManager();
+		{
+			int funcMaxThreadsPerBlock;
+			cuFuncGetAttribute(&funcMaxThreadsPerBlock, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuFunc);
+
+			int funcNumRegsPerThread;
+			cuFuncGetAttribute(&funcNumRegsPerThread, CU_FUNC_ATTRIBUTE_NUM_REGS, cuFunc);
+
+			int funcSharedMemSize;
+			cuFuncGetAttribute(&funcSharedMemSize, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, cuFunc);
+			const int sharedMemGranularity = (ctx->supportsArchSM20() ? 128 : 512) - 1;
+			funcSharedMemSize = (funcSharedMemSize + sharedMemGranularity) & ~sharedMemGranularity;
+
+			FuncInstData& fid = mFuncInstData[funcInstIndex];
+			fid.mName = name;
+			fid.mCuFunc = cuFunc;
+			fid.mMaxThreadsPerBlock = (uint32_t)funcMaxThreadsPerBlock;
+
+			fid.mNumRegsPerThread = (uint32_t)funcNumRegsPerThread;
+			fid.mStaticSharedSize = (uint32_t)funcSharedMemSize;
+			PX_ASSERT(fid.mStaticSharedSize <= mManager->getDeviceTraits().mMaxSharedMemPerBlock);
+
+			fid.mWarpsPerBlock = 0;
+			fid.mDynamicShared = 0;
+		}
+
+		init(ctx, funcInstIndex);
+		mFuncInstCount = PxMax(mFuncInstCount, uint32_t(funcInstIndex) + 1);
+	}
+
+	virtual ApexCudaObjType getType()
+	{
+		return FUNCTION;
+	}
+	virtual void release() {}
+
+	virtual void formContext(ApexCudaTestKernelContext*) {}
+
+	/** This function force cuda stream syncronization that may slowdown application
+	 */
+	PX_INLINE void setProfileSession(ApexCudaProfileSession* cudaProfileSession)
+	{
+		mManager->mCudaProfileSession = cudaProfileSession;
+		mProfileId = cudaProfileSession ? cudaProfileSession->getProfileId(mName, mManager->mNxModule->getName()) : 0;
+	}
+
+	PX_INLINE uint32_t getProfileId() const
+	{
+		return mProfileId;
+	}
+
+protected:
+	static const int MAX_INST_COUNT = 2;
+
+	struct FuncInstData
+	{
+		const char* mName;
+		CUfunction  mCuFunc;
+
+		uint32_t		mMaxThreadsPerBlock;
+		uint32_t		mNumRegsPerThread;
+		uint32_t		mStaticSharedSize;
+
+		uint32_t		mWarpsPerBlock;
+		uint32_t		mDynamicShared;
+	};
+
+	uint32_t			mFuncInstCount;
+	FuncInstData	mFuncInstData[MAX_INST_COUNT];
+
+	uint32_t			mProfileId;
+	ApexCudaTestKernelContext*	mCTContext;
+
+	ApexCudaFunc(const char* name) 
+		: ApexCudaObj(name), mFuncInstCount(0), mProfileId(0), mCTContext(0)
+	{
+	}
+	virtual void init(PxCudaContextManager* , int /*funcInstIndex*/) {}
+
+	bool isValid() const
+	{
+		return (mFuncInstCount != 0) && (mCudaModule != 0);
+	}
+
+	const FuncInstData& getFuncInstData() const
+	{
+		PX_ASSERT(isValid());
+
+		ApexCudaConstStorage* storage = mCudaModule->getStorage();
+		if (storage != 0 && mFuncInstCount > 1)
+		{
+			PX_ASSERT(mFuncInstCount == 2);
+			return mFuncInstData[ storage->getStoreInTexture() ? 1 : 0 ];
+		}
+		else
+		{
+			PX_ASSERT(mFuncInstCount == 1);
+			return mFuncInstData[0];
+		}
+	}
+
+	PX_INLINE void onBeforeLaunch(CUstream stream)
+	{
+		if (ApexCudaConstStorage* storage = mCudaModule->getStorage())
+		{
+			storage->onBeforeLaunch();
+		}
+
+		mManager->onBeforeLaunchApexCudaFunc(*this, stream);
+	}
+	PX_INLINE void onAfterLaunch(CUstream stream)
+	{
+		mManager->onAfterLaunchApexCudaFunc(*this, stream);
+
+		if (ApexCudaConstStorage* storage = mCudaModule->getStorage())
+		{
+			storage->onAfterLaunch();
+		}
+	}
+
+	template <typename T>
+	void setParam(ApexCudaFuncParams& params, T* ptr)
+	{
+		ALIGN_OFFSET(params.mOffset, (int)__alignof(ptr));
+		PX_ASSERT(params.mOffset + sizeof(ptr) <= CUDA_MAX_PARAM_SIZE);
+		memcpy(params.mParams + params.mOffset, &ptr, sizeof(ptr));
+		params.mOffset += sizeof(ptr);
+		mCTContext = NULL;	// context can't catch pointers, use instead ApexCudaMemRef
+	}
+
+	template <typename T>
+	void setParam(ApexCudaFuncParams& params, const ApexCudaMemRef<T>& memRef)
+	{
+		T* ptr = memRef.getPtr();
+		ALIGN_OFFSET(params.mOffset, (int)__alignof(ptr));
+		PX_ASSERT(params.mOffset + sizeof(ptr) <= CUDA_MAX_PARAM_SIZE);
+		memcpy(params.mParams + params.mOffset, &ptr, sizeof(ptr));
+		params.mOffset += sizeof(ptr);
+	}
+
+	template <typename T>
+	void setParam(ApexCudaFuncParams& params, const T& val)
+	{
+		ALIGN_OFFSET(params.mOffset, (int)__alignof(val));
+		PX_ASSERT(params.mOffset + sizeof(val) <= CUDA_MAX_PARAM_SIZE);
+		memcpy(params.mParams + params.mOffset, (void*)&val, sizeof(val));
+		params.mOffset += sizeof(val);
+	}
+
+	void resolveContext()
+	{
+		mCTContext->startObjList();
+		ApexCudaObj* obj = mManager->getObjListHead();
+		while(obj)
+		{
+			if ((CUmodule)obj->getCudaModule()->getCuModule() == mCudaModule->getCuModule())
+			{
+				obj->formContext(mCTContext);
+			}
+			obj = obj->next();
+		}
+		mCTContext->finishObjList();
+	}
+
+	template <typename T>
+	void copyParam(const char* name, const ApexCudaMemRef<T>& memRef)
+	{
+		mCTContext->addParam(name, __alignof(void*), memRef.ptr, memRef.size, memRef.intent, memRef.offset);
+	}
+
+	template <typename T>
+	void copyParam(const char* name, const T& val)
+	{
+		mCTContext->addParam(name, __alignof(val), (void*)&val, sizeof(val));
+	}
+
+private:
+	template <typename T>
+	void copyParam(const char* name, const ApexCudaMemRef<T>& memRef, uint32_t fpType)
+	{
+		mCTContext->addParam(name, __alignof(void*), memRef.ptr, memRef.size, memRef.intent, memRef.offset, fpType);
+	}
+	void setParam(ApexCudaFuncParams& params, unsigned align, unsigned size, void* ptr)
+	{
+		ALIGN_OFFSET(params.mOffset, (int)align);
+		PX_ASSERT(params.mOffset + size <= CUDA_MAX_PARAM_SIZE);
+		memcpy(params.mParams + params.mOffset, ptr, (uint32_t)size);
+		params.mOffset += size;
+	}	
+	friend class ApexCudaTestKernelContextReader;
+};
+
+template <>
+inline void ApexCudaFunc::copyParam<float>(const char* name, const ApexCudaMemRef<float>& memRef)
+{
+	copyParam(name, memRef, 4);
+}
+
+template <>
+inline void ApexCudaFunc::copyParam<float2>(const char* name, const ApexCudaMemRef<float2>& memRef)
+{
+	copyParam(name, memRef, 4);
+}
+
+template <>
+inline void ApexCudaFunc::copyParam<float3>(const char* name, const ApexCudaMemRef<float3>& memRef)
+{
+	copyParam(name, memRef, 4);
+}
+
+template <>
+inline void ApexCudaFunc::copyParam<float4>(const char* name, const ApexCudaMemRef<float4>& memRef)
+{
+	copyParam(name, memRef, 4);
+}
+
+template <>
+inline void ApexCudaFunc::copyParam<double>(const char* name, const ApexCudaMemRef<double>& memRef)
+{
+	copyParam(name, memRef, 8);
+}
+
+
+class ApexCudaTimer
+{
+public:
+	ApexCudaTimer()
+		:	mIsStarted(false)
+		,	mIsFinished(false)
+		,	mStart(NULL)
+		,	mFinish(NULL)
+	{
+	}
+	~ApexCudaTimer()
+	{
+		if (mStart != NULL)
+		{
+			CUT_SAFE_CALL(cuEventDestroy(mStart));
+		}
+		if (mFinish != NULL)
+		{
+			CUT_SAFE_CALL(cuEventDestroy(mFinish));
+		}
+	}
+	void init()
+	{
+		if (mStart == NULL)
+		{
+			CUT_SAFE_CALL(cuEventCreate(&mStart, CU_EVENT_DEFAULT));
+		}
+		if (mFinish == NULL)
+		{
+			CUT_SAFE_CALL(cuEventCreate(&mFinish, CU_EVENT_DEFAULT));
+		}
+	}
+
+	void onStart(CUstream stream)
+	{
+		if (mStart != NULL)
+		{
+			mIsStarted = true;
+			CUT_SAFE_CALL(cuEventRecord(mStart, stream));
+		}
+	}
+	void onFinish(CUstream stream)
+	{
+		if (mFinish != NULL && mIsStarted)
+		{
+			mIsFinished = true;
+			CUT_SAFE_CALL(cuEventRecord(mFinish, stream));
+		}
+	}
+
+	float getElapsedTime()
+	{
+		if (mIsStarted && mIsFinished)
+		{
+			mIsStarted = false;
+			mIsFinished = false;
+			CUT_SAFE_CALL(cuEventSynchronize(mStart));
+			CUT_SAFE_CALL(cuEventSynchronize(mFinish));
+			float time;
+			CUT_SAFE_CALL(cuEventElapsedTime(&time, mStart, mFinish));
+			return time;
+		}
+		else
+		{
+			return 0.0f;
+		}
+	}
+private:
+	CUevent mStart, mFinish;
+	bool mIsStarted;
+	bool mIsFinished;
+};
+
+}
+} // end namespace nvidia::apex
+
+#endif //__APEX_CUDA_WRAPPER_H__
author	git perforce import user <a@b>	2016-10-25 12:29:14 -0600
committer	Sheikh Dawood Abdul Ajees <Sheikh Dawood Abdul Ajees>	2016-10-25 18:56:37 -0500
commit	3dfe2108cfab31ba3ee5527e217d0d8e99a51162 (patch)
tree	fa6485c169e50d7415a651bf838f5bcd0fd3bfbd /APEX_1.4/common/include/ApexCudaWrapper.h
download	physx-3.4-3dfe2108cfab31ba3ee5527e217d0d8e99a51162.tar.xz physx-3.4-3dfe2108cfab31ba3ee5527e217d0d8e99a51162.zip