diff options
| author | git perforce import user <a@b> | 2016-10-25 12:29:14 -0600 |
|---|---|---|
| committer | Sheikh Dawood Abdul Ajees <Sheikh Dawood Abdul Ajees> | 2016-10-25 18:56:37 -0500 |
| commit | 3dfe2108cfab31ba3ee5527e217d0d8e99a51162 (patch) | |
| tree | fa6485c169e50d7415a651bf838f5bcd0fd3bfbd /APEX_1.4/common/include/ApexCudaWrapper.h | |
| download | physx-3.4-3dfe2108cfab31ba3ee5527e217d0d8e99a51162.tar.xz physx-3.4-3dfe2108cfab31ba3ee5527e217d0d8e99a51162.zip | |
Initial commit:
PhysX 3.4.0 Update @ 21294896
APEX 1.4.0 Update @ 21275617
[CL 21300167]
Diffstat (limited to 'APEX_1.4/common/include/ApexCudaWrapper.h')
| -rw-r--r-- | APEX_1.4/common/include/ApexCudaWrapper.h | 1232 |
1 files changed, 1232 insertions, 0 deletions
diff --git a/APEX_1.4/common/include/ApexCudaWrapper.h b/APEX_1.4/common/include/ApexCudaWrapper.h new file mode 100644 index 00000000..c455fbaf --- /dev/null +++ b/APEX_1.4/common/include/ApexCudaWrapper.h @@ -0,0 +1,1232 @@ +/* + * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. + */ + + +#ifndef __APEX_CUDA_WRAPPER_H__ +#define __APEX_CUDA_WRAPPER_H__ + +#include <cuda.h> +#include "ApexCutil.h" +#include "vector_types.h" +#include "ApexMirroredArray.h" +#include "InplaceStorage.h" +#include "PsMutex.h" +#include "ApexCudaTest.h" +#include "ApexCudaProfile.h" +#include "ApexCudaDefs.h" + +namespace nvidia +{ +namespace apex +{ + +struct DimGrid +{ + uint32_t x, y; + + DimGrid() {} + DimGrid(uint32_t x, uint32_t y = 1) + { + this->x = x; + this->y = y; + } +}; +struct DimBlock +{ + uint32_t x, y, z; + + DimBlock() {} + DimBlock(uint32_t x, uint32_t y = 1, uint32_t z = 1) + { + this->x = x; + this->y = y; + this->z = z; + } +}; + +struct ApexKernelConfig +{ + uint32_t fixedSharedMemDWords; + uint32_t sharedMemDWordsPerWarp; + DimBlock blockDim; + uint32_t minWarpsPerBlock; + uint32_t maxGridSize; + + ApexKernelConfig() { fixedSharedMemDWords = sharedMemDWordsPerWarp = 0; blockDim = DimBlock(0, 0, 0); minWarpsPerBlock = 1; maxGridSize = MAX_BOUND_BLOCKS; } + ApexKernelConfig(uint32_t fixedSharedMemDWords, uint32_t sharedMemDWordsPerWarp, int fixedWarpsPerBlock = 0, uint32_t minWarpsPerBlock = 1, uint32_t maxGridSize = MAX_BOUND_BLOCKS) + { + this->fixedSharedMemDWords = fixedSharedMemDWords; + this->sharedMemDWordsPerWarp = sharedMemDWordsPerWarp; + this->blockDim = DimBlock(fixedWarpsPerBlock * WARP_SIZE); + this->minWarpsPerBlock = minWarpsPerBlock; + this->maxGridSize = maxGridSize; + } + ApexKernelConfig(uint32_t fixedSharedMemDWords, uint32_t sharedMemDWordsPerWarp, const DimBlock& blockDim) + { + this->fixedSharedMemDWords = fixedSharedMemDWords; + this->sharedMemDWordsPerWarp = sharedMemDWordsPerWarp; + this->blockDim = blockDim; + this->minWarpsPerBlock = 1; + this->maxGridSize = MAX_BOUND_BLOCKS; + } +}; + +struct ApexCudaMemRefBase +{ + typedef ApexCudaMemFlags::Enum Intent; + + const void* ptr; + size_t size; //size in bytes + int32_t offset; //data offset for ptr + Intent intent; + + ApexCudaMemRefBase(const void* ptr, size_t byteSize, int32_t offset, Intent intent) + : ptr(ptr), size(byteSize), offset(offset), intent(intent) {} + virtual ~ApexCudaMemRefBase() {} +}; + +template <class T> +struct ApexCudaMemRef : public ApexCudaMemRefBase +{ + ApexCudaMemRef(T* ptr, size_t byteSize, Intent intent = ApexCudaMemFlags::IN_OUT) + : ApexCudaMemRefBase(ptr, byteSize, 0, intent) {} + + ApexCudaMemRef(T* ptr, size_t byteSize, int32_t offset, Intent intent) + : ApexCudaMemRefBase(ptr, byteSize, offset, intent) {} + + inline T* getPtr() const + { + return (T*)ptr; + } + + virtual ~ApexCudaMemRef() {} +}; + +template <class T> +inline ApexCudaMemRef<T> createApexCudaMemRef(T* ptr, size_t size, ApexCudaMemRefBase::Intent intent = ApexCudaMemFlags::IN_OUT) +{ + return ApexCudaMemRef<T>(ptr, sizeof(T) * size, intent); +} + +template <class T> +inline ApexCudaMemRef<T> createApexCudaMemRef(T* ptr, size_t size, int32_t offset, ApexCudaMemRefBase::Intent intent) +{ + return ApexCudaMemRef<T>(ptr, sizeof(T) * size, sizeof(T) * offset, intent); +} + +template <class T> +inline ApexCudaMemRef<T> createApexCudaMemRef(const ApexMirroredArray<T>& ma, ApexCudaMemRefBase::Intent intent = ApexCudaMemFlags::IN_OUT) +{ + return ApexCudaMemRef<T>(ma.getGpuPtr(), ma.getByteSize(), intent); +} + +template <class T> +inline ApexCudaMemRef<T> createApexCudaMemRef(const ApexMirroredArray<T>& ma, size_t size, ApexCudaMemRefBase::Intent intent = ApexCudaMemFlags::IN_OUT) +{ + return ApexCudaMemRef<T>(ma.getGpuPtr(), sizeof(T) * size, intent); +} + +template <class T> +inline ApexCudaMemRef<T> createApexCudaMemRef(const ApexMirroredArray<T>& ma, size_t size, int32_t offset, ApexCudaMemRefBase::Intent intent = ApexCudaMemFlags::IN_OUT) +{ + return ApexCudaMemRef<T>(ma.getGpuPtr(), sizeof(T) * size, sizeof(T) * offset, intent); +} + +#ifndef ALIGN_OFFSET +#define ALIGN_OFFSET(offset, alignment) (offset) = ((offset) + (alignment) - 1) & ~((alignment) - 1) +#endif + +#define CUDA_MAX_PARAM_SIZE 256 + + +class ApexCudaTestKernelContext; + + +class ApexCudaConstStorage; + +class ApexCudaModule +{ +public: + ApexCudaModule() + : mCuModule(0), mStorage(0) + { + } + + PX_INLINE void init(const void* image) + { + if (mCuModule == 0) + { + CUT_SAFE_CALL(cuModuleLoadDataEx(&mCuModule, image, 0, NULL, NULL)); + } + } + PX_INLINE void release() + { + if (mCuModule != 0) + { + CUT_SAFE_CALL(cuModuleUnload(mCuModule)); + mCuModule = 0; + } + } + + PX_INLINE bool isValid() const + { + return (mCuModule != 0); + } + + PX_INLINE CUmodule getCuModule() const + { + return mCuModule; + } + + PX_INLINE ApexCudaConstStorage* getStorage() const + { + return mStorage; + } + +private: + CUmodule mCuModule; + ApexCudaConstStorage* mStorage; + + friend class ApexCudaConstStorage; +}; + +class ApexCudaObjManager; + +class ApexCudaObj +{ + friend class ApexCudaObjManager; + ApexCudaObj* mObjListNext; + +protected: + const char* mName; + ApexCudaModule* mCudaModule; + ApexCudaObjManager* mManager; + + ApexCudaObj(const char* name) : mObjListNext(0), mName(name), mCudaModule(NULL), mManager(NULL) {} + virtual ~ApexCudaObj() {} + + PX_INLINE void init(ApexCudaObjManager* manager, ApexCudaModule* cudaModule); + +public: + const char* getName() const + { + return mName; + } + const ApexCudaModule* getCudaModule() const + { + return mCudaModule; + } + + enum ApexCudaObjType + { + UNKNOWN, + FUNCTION, + TEXTURE, + CONST_STORAGE, + SURFACE + }; + virtual ApexCudaObjType getType() + { + return UNKNOWN; + } + + PX_INLINE ApexCudaObj* next() + { + return mObjListNext; + } + virtual void release() = 0; + virtual void formContext(ApexCudaTestKernelContext*) = 0; +}; + +struct ApexCudaDeviceTraits +{ + uint32_t mMaxSharedMemPerBlock; + uint32_t mMaxSharedMemPerSM; + uint32_t mMaxRegistersPerSM; + uint32_t mMaxThreadsPerSM; + + uint32_t mBlocksPerSM; + uint32_t mBlocksPerSM_2D; + uint32_t mBlocksPerSM_3D; + uint32_t mMaxBlocksPerGrid; +}; + +class ApexCudaObjManager +{ + ApexCudaObj* mObjListHead; + + Module* mNxModule; + ApexCudaTestManager* mCudaTestManager; + PxGpuDispatcher* mGpuDispatcher; + + ApexCudaDeviceTraits mDeviceTraits; + +protected: + friend class ApexCudaFunc; + ApexCudaProfileSession* mCudaProfileSession; + +public: + ApexCudaObjManager() : mObjListHead(0), mNxModule(0), mCudaTestManager(0), mGpuDispatcher(0), mCudaProfileSession(0) {} + + void init(Module* nxModule, ApexCudaTestManager* cudaTestManager, PxGpuDispatcher* gpuDispatcher) + { + mNxModule = nxModule; + mCudaTestManager = cudaTestManager; + mGpuDispatcher = gpuDispatcher; + + //get device traits + CUdevice device; + CUT_SAFE_CALL(cuCtxGetDevice(&device)); + CUT_SAFE_CALL(cuDeviceGetAttribute((int*)&mDeviceTraits.mMaxSharedMemPerBlock, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, device)); + CUT_SAFE_CALL(cuDeviceGetAttribute((int*)&mDeviceTraits.mMaxSharedMemPerSM, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR, device)); + CUT_SAFE_CALL(cuDeviceGetAttribute((int*)&mDeviceTraits.mMaxRegistersPerSM, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR, device)); + CUT_SAFE_CALL(cuDeviceGetAttribute((int*)&mDeviceTraits.mMaxThreadsPerSM, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, device)); + +#ifdef APEX_CUDA_FORCED_BLOCKS + mDeviceTraits.mBlocksPerSM = (APEX_CUDA_FORCED_BLOCKS > 32) ? 2u : 1u; + mDeviceTraits.mMaxBlocksPerGrid = APEX_CUDA_FORCED_BLOCKS; +#else + int computeMajor; + int smCount; + CUT_SAFE_CALL(cuDeviceGetAttribute(&smCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device)); + CUT_SAFE_CALL(cuDeviceGetAttribute(&computeMajor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device)); + + mDeviceTraits.mBlocksPerSM = 2;//(computeMajor >= 5) ? 2u : 1u; + mDeviceTraits.mMaxBlocksPerGrid = uint32_t(smCount) * mDeviceTraits.mBlocksPerSM; +#endif + mDeviceTraits.mBlocksPerSM_2D = 4; + mDeviceTraits.mBlocksPerSM_3D = 4; + } + + PX_INLINE const ApexCudaDeviceTraits& getDeviceTraits() const + { + return mDeviceTraits; + } + + PX_INLINE void addToObjList(ApexCudaObj* obj) + { + obj->mObjListNext = mObjListHead; + mObjListHead = obj; + } + + PX_INLINE ApexCudaObj* getObjListHead() + { + return mObjListHead; + } + + void releaseAll() + { + for (ApexCudaObj* obj = mObjListHead; obj != 0; obj = obj->mObjListNext) + { + obj->release(); + } + } + + PX_INLINE Module* getModule() const + { + return mNxModule; + } + PX_INLINE ApexCudaTestManager* getCudaTestManager() const + { + return mCudaTestManager; + } + PX_INLINE PxGpuDispatcher* getGpuDispatcher() const + { + return mGpuDispatcher; + } + +public: + virtual void onBeforeLaunchApexCudaFunc(const ApexCudaFunc& func, CUstream stream) = 0; + virtual void onAfterLaunchApexCudaFunc(const ApexCudaFunc& func, CUstream stream) = 0; + +}; + +PX_INLINE void ApexCudaObj::init(ApexCudaObjManager* manager, ApexCudaModule* cudaModule) +{ + mManager = manager; + mManager->addToObjList(this); + mCudaModule = cudaModule; +} + + +class ApexCudaTexRef : public ApexCudaObj +{ +public: + void init(ApexCudaObjManager* manager, CUtexref texRef, ApexCudaModule* cudaModule, CUarray_format format, int numChannels, int dim, int flags) + { + ApexCudaObj::init(manager, cudaModule); + + mTexRef = texRef; + mDim = dim; + mFormat = format; + mNumChannels = numChannels; + mFlags = flags; + mIsBinded = false; + + CUT_SAFE_CALL(cuTexRefSetFilterMode(mTexRef, mFilterMode)); + + for (int d = 0; d < dim; ++d) + { + CUT_SAFE_CALL(cuTexRefSetAddressMode(mTexRef, d, CU_TR_ADDRESS_MODE_CLAMP)); + } + } + + ApexCudaTexRef(const char* name, CUfilter_mode filterMode = CU_TR_FILTER_MODE_POINT) + : ApexCudaObj(name), mTexRef(0), mFilterMode(filterMode) + { + } + + void setNormalizedCoords() + { + mFlags |= CU_TRSF_NORMALIZED_COORDINATES; + } + + void bindTo(const void* ptr, size_t bytes, size_t* retByteOffset = 0) + { + CUT_SAFE_CALL(cuTexRefSetFormat(mTexRef, mFormat, mNumChannels)); + CUT_SAFE_CALL(cuTexRefSetFlags(mTexRef, (uint32_t)mFlags)); + + size_t byteOffset; + CUT_SAFE_CALL(cuTexRefSetAddress(&byteOffset, mTexRef, CUT_TODEVICE(ptr), static_cast<unsigned int>(bytes))); + + if (retByteOffset != 0) + { + *retByteOffset = byteOffset; + } + else + { + PX_ASSERT(byteOffset == 0); + } + + mBindedSize = bytes; + mBindedPtr = ptr; + mBindedArray = NULL; + mIsBinded = true; + } + + template <typename T> + void bindTo(ApexMirroredArray<T>& mem, size_t* retByteOffset = 0) + { + bindTo(mem.getGpuPtr(), mem.getByteSize(), retByteOffset); + } + + template <typename T> + void bindTo(ApexMirroredArray<T>& mem, size_t size, size_t* retByteOffset = 0) + { + bindTo(mem.getGpuPtr(), sizeof(T) * size, retByteOffset); + } + + void bindTo(CUarray cuArray) + { + CUT_SAFE_CALL(cuTexRefSetFlags(mTexRef, (uint32_t)mFlags)); + + CUT_SAFE_CALL(cuTexRefSetArray(mTexRef, cuArray, CU_TRSA_OVERRIDE_FORMAT)); + + mBindedSize = 0; + mBindedPtr = NULL; + mBindedArray = cuArray; + mIsBinded = true; + } + + void bindTo(const ApexCudaArray& cudaArray) + { + bindTo(cudaArray.getCuArray()); + } + + void unbind() + { + size_t byteOffset; + CUT_SAFE_CALL(cuTexRefSetAddress(&byteOffset, mTexRef, CUdeviceptr(0), 0)); + mIsBinded = false; + } + + virtual ApexCudaObjType getType() + { + return TEXTURE; + } + + virtual void release() {} + + virtual void formContext(ApexCudaTestKernelContext* context) + { + if (mIsBinded) + { + context->addTexRef(mName, mBindedPtr, mBindedSize, mBindedArray); + } + } + +private: + CUtexref mTexRef; + CUfilter_mode mFilterMode; + + CUarray_format mFormat; + int mNumChannels; + int mDim; + int mFlags; + + bool mIsBinded; + size_t mBindedSize; + const void* mBindedPtr; + CUarray mBindedArray; +}; + + +class ApexCudaSurfRef : public ApexCudaObj +{ +public: + void init(ApexCudaObjManager* manager, CUsurfref surfRef, ApexCudaModule* cudaModule) + { + ApexCudaObj::init(manager, cudaModule); + + mSurfRef = surfRef; + + mIsBinded = false; + } + + ApexCudaSurfRef(const char* name) : ApexCudaObj(name), mSurfRef(0) + { + } + + void bindTo(CUarray cuArray, ApexCudaMemFlags::Enum flags) + { + CUDA_ARRAY3D_DESCRIPTOR desc; + CUT_SAFE_CALL(cuArray3DGetDescriptor(&desc, cuArray)); + + CUT_SAFE_CALL(cuSurfRefSetArray(mSurfRef, cuArray, 0)); + + mIsBinded = true; + mBindedArray = cuArray; + mBindedFlags = flags; + } + + void bindTo(const ApexCudaArray& cudaArray, ApexCudaMemFlags::Enum flags) + { + bindTo(cudaArray.getCuArray(), flags); + } + + void unbind() + { + mIsBinded = false; + } + + virtual ApexCudaObjType getType() + { + return SURFACE; + } + + virtual void release() {} + + virtual void formContext(ApexCudaTestKernelContext* context) + { + if (mIsBinded) + { + context->addSurfRef(mName, mBindedArray, mBindedFlags); + } + } + +private: + CUsurfref mSurfRef; + + bool mIsBinded; + CUarray mBindedArray; + ApexCudaMemFlags::Enum mBindedFlags; +}; + +class ApexCudaTexRefScopeBind +{ +private: + ApexCudaTexRefScopeBind& operator=(const ApexCudaTexRefScopeBind&); + ApexCudaTexRef& mTexRef; + +public: + ApexCudaTexRefScopeBind(ApexCudaTexRef& texRef, void* ptr, size_t bytes, size_t* retByteOffset = 0) + : mTexRef(texRef) + { + mTexRef.bindTo(ptr, bytes, retByteOffset); + } + template <typename T> + ApexCudaTexRefScopeBind(ApexCudaTexRef& texRef, ApexMirroredArray<T>& mem, size_t* retByteOffset = 0) + : mTexRef(texRef) + { + mTexRef.bindTo(mem, retByteOffset); + } + template <typename T> + ApexCudaTexRefScopeBind(ApexCudaTexRef& texRef, ApexMirroredArray<T>& mem, size_t size, size_t* retByteOffset = 0) + : mTexRef(texRef) + { + mTexRef.bindTo(mem, size, retByteOffset); + } + ApexCudaTexRefScopeBind(ApexCudaTexRef& texRef, const ApexCudaArray& cudaArray) + : mTexRef(texRef) + { + mTexRef.bindTo(cudaArray); + } + ~ApexCudaTexRefScopeBind() + { + mTexRef.unbind(); + } +}; + +#define APEX_CUDA_TEXTURE_SCOPE_BIND(texRef, mem) ApexCudaTexRefScopeBind texRefScopeBind_##texRef (CUDA_OBJ(texRef), mem); +#define APEX_CUDA_TEXTURE_SCOPE_BIND_SIZE(texRef, mem, size) ApexCudaTexRefScopeBind texRefScopeBind_##texRef (CUDA_OBJ(texRef), mem, size); +#define APEX_CUDA_TEXTURE_SCOPE_BIND_PTR(texRef, ptr, count) ApexCudaTexRefScopeBind texRefScopeBind_##texRef (CUDA_OBJ(texRef), ptr, sizeof(*ptr) * count); +#define APEX_CUDA_TEXTURE_BIND(texRef, mem) CUDA_OBJ(texRef).bindTo(mem); +#define APEX_CUDA_TEXTURE_BIND_PTR(texRef, ptr, count) CUDA_OBJ(texRef).bindTo(ptr, sizeof(*ptr) * count); +#define APEX_CUDA_TEXTURE_UNBIND(texRef) CUDA_OBJ(texRef).unbind(); + + +class ApexCudaSurfRefScopeBind +{ +private: + ApexCudaSurfRefScopeBind& operator=(const ApexCudaSurfRefScopeBind&); + ApexCudaSurfRef& mSurfRef; + +public: + ApexCudaSurfRefScopeBind(ApexCudaSurfRef& surfRef, ApexCudaArray& cudaArray, ApexCudaMemFlags::Enum flags) + : mSurfRef(surfRef) + { + mSurfRef.bindTo(cudaArray, flags); + } + ApexCudaSurfRefScopeBind(ApexCudaSurfRef& surfRef, CUarray cuArray, ApexCudaMemFlags::Enum flags) + : mSurfRef(surfRef) + { + mSurfRef.bindTo(cuArray, flags); + } + ~ApexCudaSurfRefScopeBind() + { + mSurfRef.unbind(); + } +}; + +#define APEX_CUDA_SURFACE_SCOPE_BIND(surfRef, mem, flags) ApexCudaSurfRefScopeBind surfRefScopeBind_##surfRef (CUDA_OBJ(surfRef), mem, flags); +#define APEX_CUDA_SURFACE_BIND(surfRef, mem, flags) CUDA_OBJ(surfRef).bindTo(mem, flags); +#define APEX_CUDA_SURFACE_UNBIND(surfRef) CUDA_OBJ(surfRef).unbind(); + + +class ApexCudaVar : public ApexCudaObj +{ +public: + size_t getSize() const + { + return mSize; + } + + void init(ApexCudaObjManager* manager, ApexCudaModule* cudaModule, CUdeviceptr devPtr, size_t size, PxCudaContextManager* ctx) + { + ApexCudaObj::init(manager, cudaModule); + + mDevPtr = devPtr; + mSize = size; + init(manager, ctx); + } + + virtual void release() {} + virtual void formContext(ApexCudaTestKernelContext*) {} + +protected: + virtual void init(ApexCudaObjManager* , PxCudaContextManager*) = 0; + + ApexCudaVar(const char* name) : ApexCudaObj(name), mDevPtr(0), mSize(0) + { + } + +protected: + CUdeviceptr mDevPtr; + size_t mSize; +}; + + +class ApexCudaConstStorage : public ApexCudaVar, public InplaceStorage +{ +public: + ApexCudaConstStorage(const char* nameVar, const char* nameTexRef) + : ApexCudaVar(nameVar), mCudaTexRef(nameTexRef), mStoreInTexture(false) + { + mStorageSize = 0; + mStoragePtr = 0; + + mHostBuffer = 0; + mDeviceBuffer = 0; + } + + virtual ApexCudaObjType getType() + { + return CONST_STORAGE; + } + + virtual void formContext(ApexCudaTestKernelContext* context) + { + if (!mStoreInTexture && mHostBuffer != 0) + { + PX_ASSERT(mHostBuffer->getSize() >= ApexCudaVar::getSize()); + void* hostPtr = reinterpret_cast<void*>(mHostBuffer->getPtr()); + context->addConstMem(mName, hostPtr, ApexCudaVar::getSize()); + } + } + + virtual void init(ApexCudaObjManager* manager, PxCudaContextManager* ctx) + { + PX_ASSERT(mCudaModule != 0); + PX_ASSERT(mCudaModule->mStorage == 0); + mCudaModule->mStorage = this; + + CUtexref cuTexRef; + CUT_SAFE_CALL(cuModuleGetTexRef(&cuTexRef, mCudaModule->getCuModule(), mCudaTexRef.getName())); + + mCudaTexRef.init(manager, cuTexRef, mCudaModule, CU_AD_FORMAT_SIGNED_INT32, 1, 1, CU_TRSF_READ_AS_INTEGER); + + //prealloc. host buffer for Apex Cuda Test framework + reallocHostBuffer(ctx, ApexCudaVar::getSize()); + } + + virtual void release() + { + InplaceStorage::release(); + + if (mDeviceBuffer != 0) + { + mDeviceBuffer->free(); + mDeviceBuffer = 0; + } + if (mHostBuffer != 0) + { + mHostBuffer->free(); + mHostBuffer = 0; + } + + if (mStoragePtr != 0) + { + getAllocator().deallocate(mStoragePtr); + mStoragePtr = 0; + mStorageSize = 0; + } + } + + bool copyToDevice(PxCudaContextManager* ctx, CUstream stream) + { + if (mStoragePtr == 0) + { + return false; + } + + bool result = false; + + InplaceStorage* storage = static_cast<InplaceStorage*>(this); + mMutex.lock(); + if (storage->isChanged()) + { + if (!reallocHostBuffer(ctx, mStorageSize)) + { + return false; + } + + CUdeviceptr copyDevPtr = 0; + if (mStoreInTexture) + { + if (mDeviceBuffer == 0) + { + mDeviceBuffer = ctx->getMemoryManager()->alloc( + PxCudaBufferType(PxCudaBufferMemorySpace::T_GPU, PxCudaBufferFlags::F_READ_WRITE), + mStorageSize); + if (mDeviceBuffer == 0) + { + APEX_INTERNAL_ERROR("ApexCudaConstStorage failed to allocate GPU Memory!"); + return false; + } + } + else if (mDeviceBuffer->getSize() < mStorageSize) + { + mDeviceBuffer->realloc(mStorageSize); + } + copyDevPtr = mDeviceBuffer->getPtr(); + } + else + { + if (mDeviceBuffer != 0) + { + mDeviceBuffer->free(); + mDeviceBuffer = 0; + } + copyDevPtr = mDevPtr; + } + + uint8_t* hostPtr = reinterpret_cast<uint8_t*>(mHostBuffer->getPtr()); + + size_t size = storage->mapTo(hostPtr); + // padding up to the next dword + size = (size + 7) & ~7; + if (size > mStorageSize) size = mStorageSize; + + CUT_SAFE_CALL(cuMemcpyHtoDAsync(copyDevPtr, hostPtr, size, stream)); + + storage->setUnchanged(); + result = true; + } + mMutex.unlock(); + + return result; + } + + PX_INLINE bool getStoreInTexture() const + { + return mStoreInTexture; + } + + PX_INLINE void onBeforeLaunch() + { + if (mStoreInTexture) + { + mCudaTexRef.bindTo( mDeviceBuffer ? reinterpret_cast<void*>(mDeviceBuffer->getPtr()) : 0, mStorageSize ); + } + } + + PX_INLINE void onAfterLaunch() + { + if (mStoreInTexture) + { + mCudaTexRef.unbind(); + } + } + +protected: + bool reallocHostBuffer(PxCudaContextManager* ctx, size_t size) + { + if (mHostBuffer == 0) + { + mHostBuffer = ctx->getMemoryManager()->alloc( + PxCudaBufferType(PxCudaBufferMemorySpace::T_PINNED_HOST, PxCudaBufferFlags::F_READ_WRITE), + size); + if (mHostBuffer == 0) + { + APEX_INTERNAL_ERROR("ApexCudaConstStorage failed to allocate Pinned Host Memory!"); + return false; + } + } + else if (mHostBuffer->getSize() < size) + { + mHostBuffer->realloc(size); + } + return true; + } + + virtual uint8_t* storageResizeBuffer(uint32_t newSize) + { + if (!mStoreInTexture && newSize > ApexCudaVar::getSize()) + { +#if 0 + APEX_INTERNAL_ERROR("Out of CUDA constant memory"); + PX_ALWAYS_ASSERT(); + return 0; +#else + //switch to texture + mStoreInTexture = true; +#endif + } + else if (mStoreInTexture && newSize <= ApexCudaVar::getSize()) + { + //switch back to const mem. + mStoreInTexture = false; + } + + const uint32_t PageSize = 4096; + size_t allocSize = mStoreInTexture ? (newSize + (PageSize - 1)) & ~(PageSize - 1) : ApexCudaVar::getSize(); + + if (allocSize > mStorageSize) + { + uint8_t* allocStoragePtr = static_cast<uint8_t*>(getAllocator().allocate(allocSize, "ApexCudaConstStorage", __FILE__, __LINE__)); + if (allocStoragePtr == 0) + { + APEX_INTERNAL_ERROR("ApexCudaConstStorage failed to allocate memory!"); + return 0; + } + if (mStoragePtr != 0) + { + memcpy(allocStoragePtr, mStoragePtr, mStorageSize); + getAllocator().deallocate(mStoragePtr); + } + mStorageSize = allocSize; + mStoragePtr = allocStoragePtr; + } + return mStoragePtr; + } + + virtual void storageLock() + { + mMutex.lock(); + } + virtual void storageUnlock() + { + mMutex.unlock(); + } + +private: + bool mStoreInTexture; + ApexCudaTexRef mCudaTexRef; + + size_t mStorageSize; + uint8_t* mStoragePtr; + + PxCudaBuffer* mHostBuffer; + PxCudaBuffer* mDeviceBuffer; + + nvidia::Mutex mMutex; + + friend class ApexCudaTestKernelContextReader; +}; + +typedef InplaceStorageGroup ApexCudaConstMemGroup; + +#define APEX_CUDA_CONST_MEM_GROUP_SCOPE(group) INPLACE_STORAGE_GROUP_SCOPE(group) + + + +struct ApexCudaFuncParams +{ + int mOffset; + char mParams[CUDA_MAX_PARAM_SIZE]; + + ApexCudaFuncParams() : mOffset(0) {} + + +}; + +class ApexCudaFunc : public ApexCudaObj +{ +public: + PX_INLINE bool testNameMatch(const char* name) const + { + if (const char* name$ = strrchr(name, '$')) + { + if (const char* name_ = strrchr(name, '_')) + { + return (nvidia::strncmp(name, mName, (uint32_t)(name_ - name)) == 0); + } + } + return (nvidia::strcmp(name, mName) == 0); + } + + void init(ApexCudaObjManager* manager, const char* name, CUfunction cuFunc, ApexCudaModule* cudaModule) + { + int funcInstIndex = 0; + if (const char* name$ = strrchr(name, '$')) + { + funcInstIndex = atoi(name$ + 1); + } + if (funcInstIndex >= MAX_INST_COUNT) + { + PX_ALWAYS_ASSERT(); + return; + } + + if (mFuncInstCount == 0) + { + ApexCudaObj::init(manager, cudaModule); + } + + PxCudaContextManager* ctx = mManager->mGpuDispatcher->getCudaContextManager(); + { + int funcMaxThreadsPerBlock; + cuFuncGetAttribute(&funcMaxThreadsPerBlock, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuFunc); + + int funcNumRegsPerThread; + cuFuncGetAttribute(&funcNumRegsPerThread, CU_FUNC_ATTRIBUTE_NUM_REGS, cuFunc); + + int funcSharedMemSize; + cuFuncGetAttribute(&funcSharedMemSize, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, cuFunc); + const int sharedMemGranularity = (ctx->supportsArchSM20() ? 128 : 512) - 1; + funcSharedMemSize = (funcSharedMemSize + sharedMemGranularity) & ~sharedMemGranularity; + + FuncInstData& fid = mFuncInstData[funcInstIndex]; + fid.mName = name; + fid.mCuFunc = cuFunc; + fid.mMaxThreadsPerBlock = (uint32_t)funcMaxThreadsPerBlock; + + fid.mNumRegsPerThread = (uint32_t)funcNumRegsPerThread; + fid.mStaticSharedSize = (uint32_t)funcSharedMemSize; + PX_ASSERT(fid.mStaticSharedSize <= mManager->getDeviceTraits().mMaxSharedMemPerBlock); + + fid.mWarpsPerBlock = 0; + fid.mDynamicShared = 0; + } + + init(ctx, funcInstIndex); + mFuncInstCount = PxMax(mFuncInstCount, uint32_t(funcInstIndex) + 1); + } + + virtual ApexCudaObjType getType() + { + return FUNCTION; + } + virtual void release() {} + + virtual void formContext(ApexCudaTestKernelContext*) {} + + /** This function force cuda stream syncronization that may slowdown application + */ + PX_INLINE void setProfileSession(ApexCudaProfileSession* cudaProfileSession) + { + mManager->mCudaProfileSession = cudaProfileSession; + mProfileId = cudaProfileSession ? cudaProfileSession->getProfileId(mName, mManager->mNxModule->getName()) : 0; + } + + PX_INLINE uint32_t getProfileId() const + { + return mProfileId; + } + +protected: + static const int MAX_INST_COUNT = 2; + + struct FuncInstData + { + const char* mName; + CUfunction mCuFunc; + + uint32_t mMaxThreadsPerBlock; + uint32_t mNumRegsPerThread; + uint32_t mStaticSharedSize; + + uint32_t mWarpsPerBlock; + uint32_t mDynamicShared; + }; + + uint32_t mFuncInstCount; + FuncInstData mFuncInstData[MAX_INST_COUNT]; + + uint32_t mProfileId; + ApexCudaTestKernelContext* mCTContext; + + ApexCudaFunc(const char* name) + : ApexCudaObj(name), mFuncInstCount(0), mProfileId(0), mCTContext(0) + { + } + virtual void init(PxCudaContextManager* , int /*funcInstIndex*/) {} + + bool isValid() const + { + return (mFuncInstCount != 0) && (mCudaModule != 0); + } + + const FuncInstData& getFuncInstData() const + { + PX_ASSERT(isValid()); + + ApexCudaConstStorage* storage = mCudaModule->getStorage(); + if (storage != 0 && mFuncInstCount > 1) + { + PX_ASSERT(mFuncInstCount == 2); + return mFuncInstData[ storage->getStoreInTexture() ? 1 : 0 ]; + } + else + { + PX_ASSERT(mFuncInstCount == 1); + return mFuncInstData[0]; + } + } + + PX_INLINE void onBeforeLaunch(CUstream stream) + { + if (ApexCudaConstStorage* storage = mCudaModule->getStorage()) + { + storage->onBeforeLaunch(); + } + + mManager->onBeforeLaunchApexCudaFunc(*this, stream); + } + PX_INLINE void onAfterLaunch(CUstream stream) + { + mManager->onAfterLaunchApexCudaFunc(*this, stream); + + if (ApexCudaConstStorage* storage = mCudaModule->getStorage()) + { + storage->onAfterLaunch(); + } + } + + template <typename T> + void setParam(ApexCudaFuncParams& params, T* ptr) + { + ALIGN_OFFSET(params.mOffset, (int)__alignof(ptr)); + PX_ASSERT(params.mOffset + sizeof(ptr) <= CUDA_MAX_PARAM_SIZE); + memcpy(params.mParams + params.mOffset, &ptr, sizeof(ptr)); + params.mOffset += sizeof(ptr); + mCTContext = NULL; // context can't catch pointers, use instead ApexCudaMemRef + } + + template <typename T> + void setParam(ApexCudaFuncParams& params, const ApexCudaMemRef<T>& memRef) + { + T* ptr = memRef.getPtr(); + ALIGN_OFFSET(params.mOffset, (int)__alignof(ptr)); + PX_ASSERT(params.mOffset + sizeof(ptr) <= CUDA_MAX_PARAM_SIZE); + memcpy(params.mParams + params.mOffset, &ptr, sizeof(ptr)); + params.mOffset += sizeof(ptr); + } + + template <typename T> + void setParam(ApexCudaFuncParams& params, const T& val) + { + ALIGN_OFFSET(params.mOffset, (int)__alignof(val)); + PX_ASSERT(params.mOffset + sizeof(val) <= CUDA_MAX_PARAM_SIZE); + memcpy(params.mParams + params.mOffset, (void*)&val, sizeof(val)); + params.mOffset += sizeof(val); + } + + void resolveContext() + { + mCTContext->startObjList(); + ApexCudaObj* obj = mManager->getObjListHead(); + while(obj) + { + if ((CUmodule)obj->getCudaModule()->getCuModule() == mCudaModule->getCuModule()) + { + obj->formContext(mCTContext); + } + obj = obj->next(); + } + mCTContext->finishObjList(); + } + + template <typename T> + void copyParam(const char* name, const ApexCudaMemRef<T>& memRef) + { + mCTContext->addParam(name, __alignof(void*), memRef.ptr, memRef.size, memRef.intent, memRef.offset); + } + + template <typename T> + void copyParam(const char* name, const T& val) + { + mCTContext->addParam(name, __alignof(val), (void*)&val, sizeof(val)); + } + +private: + template <typename T> + void copyParam(const char* name, const ApexCudaMemRef<T>& memRef, uint32_t fpType) + { + mCTContext->addParam(name, __alignof(void*), memRef.ptr, memRef.size, memRef.intent, memRef.offset, fpType); + } + void setParam(ApexCudaFuncParams& params, unsigned align, unsigned size, void* ptr) + { + ALIGN_OFFSET(params.mOffset, (int)align); + PX_ASSERT(params.mOffset + size <= CUDA_MAX_PARAM_SIZE); + memcpy(params.mParams + params.mOffset, ptr, (uint32_t)size); + params.mOffset += size; + } + friend class ApexCudaTestKernelContextReader; +}; + +template <> +inline void ApexCudaFunc::copyParam<float>(const char* name, const ApexCudaMemRef<float>& memRef) +{ + copyParam(name, memRef, 4); +} + +template <> +inline void ApexCudaFunc::copyParam<float2>(const char* name, const ApexCudaMemRef<float2>& memRef) +{ + copyParam(name, memRef, 4); +} + +template <> +inline void ApexCudaFunc::copyParam<float3>(const char* name, const ApexCudaMemRef<float3>& memRef) +{ + copyParam(name, memRef, 4); +} + +template <> +inline void ApexCudaFunc::copyParam<float4>(const char* name, const ApexCudaMemRef<float4>& memRef) +{ + copyParam(name, memRef, 4); +} + +template <> +inline void ApexCudaFunc::copyParam<double>(const char* name, const ApexCudaMemRef<double>& memRef) +{ + copyParam(name, memRef, 8); +} + + +class ApexCudaTimer +{ +public: + ApexCudaTimer() + : mIsStarted(false) + , mIsFinished(false) + , mStart(NULL) + , mFinish(NULL) + { + } + ~ApexCudaTimer() + { + if (mStart != NULL) + { + CUT_SAFE_CALL(cuEventDestroy(mStart)); + } + if (mFinish != NULL) + { + CUT_SAFE_CALL(cuEventDestroy(mFinish)); + } + } + void init() + { + if (mStart == NULL) + { + CUT_SAFE_CALL(cuEventCreate(&mStart, CU_EVENT_DEFAULT)); + } + if (mFinish == NULL) + { + CUT_SAFE_CALL(cuEventCreate(&mFinish, CU_EVENT_DEFAULT)); + } + } + + void onStart(CUstream stream) + { + if (mStart != NULL) + { + mIsStarted = true; + CUT_SAFE_CALL(cuEventRecord(mStart, stream)); + } + } + void onFinish(CUstream stream) + { + if (mFinish != NULL && mIsStarted) + { + mIsFinished = true; + CUT_SAFE_CALL(cuEventRecord(mFinish, stream)); + } + } + + float getElapsedTime() + { + if (mIsStarted && mIsFinished) + { + mIsStarted = false; + mIsFinished = false; + CUT_SAFE_CALL(cuEventSynchronize(mStart)); + CUT_SAFE_CALL(cuEventSynchronize(mFinish)); + float time; + CUT_SAFE_CALL(cuEventElapsedTime(&time, mStart, mFinish)); + return time; + } + else + { + return 0.0f; + } + } +private: + CUevent mStart, mFinish; + bool mIsStarted; + bool mIsFinished; +}; + +} +} // end namespace nvidia::apex + +#endif //__APEX_CUDA_WRAPPER_H__ |