// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions // are met: // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // * Neither the name of NVIDIA CORPORATION nor the names of its // contributors may be used to endorse or promote products derived // from this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // // Copyright (c) 2018 NVIDIA Corporation. All rights reserved. 
#include "ApexDefs.h"
#if APEX_CUDA_SUPPORT

#include "Apex.h"
#include "ApexSDKIntl.h"
#include "SceneIntl.h"
#include "ApexCutil.h"
#include "CudaModuleScene.h"
// NOTE(review): the targets of the next two includes were missing in the copy
// of this file that was reviewed. <cuda.h> and <texture_types.h> are presumed
// from the driver-API and textureReference symbols used below - confirm
// against the upstream file.
#include <cuda.h>
#include <texture_types.h>

#include "PxTaskManager.h"
#include "PxGpuDispatcher.h"
#include "PxCudaContextManager.h"

// When non-zero, onAfterLaunchApexCudaFunc() synchronizes and checks the stream
// after every launch, regardless of the scene's kernel-check setting.
#define CUDA_KERNEL_CHECK_ALWAYS 0

namespace nvidia
{
class PhysXGpuIndicator;
namespace apex
{

/**
 * Workaround hacks for using nvcc --compiler output object files
 * without linking with CUDART.  We must implement our own versions
 * of these functions that the object files are hard-coded to call into.
 */

// Fat binaries recorded by __cudaRegisterFatBinary(). The index into this
// table doubles as the opaque "fatCubinHandle" returned to the caller.
#define MAX_MODULES 64
static void* moduleTable[ MAX_MODULES ];
static int numRegisteredModules = 0;

// Kernels recorded by __cudaRegisterFunction().
#define MAX_FUNCTIONS 256
typedef struct
{
	int         modIndex;   // index into moduleTable
	const char* funcName;   // device-side name as passed by the registration call
} cuFuncDesc;
static cuFuncDesc functionTable[ MAX_FUNCTIONS ];
static int numRegisteredFunctions = 0;

// Parallel list of registered function names. Not 'static', so it has external linkage.
const char* funcNameTable[ MAX_FUNCTIONS ];

// Texture references recorded by __cudaRegisterTexture().
#define MAX_TEXTURES 256
typedef struct
{
	int                            modIndex;              // index into moduleTable
	const char*                    texRefName;            // device name with leading ':' stripped
	const struct textureReference* texRefData;            // host-side descriptor
	int                            dim;
	int                            read_normalized_float;
} cuTexRefDesc;
static cuTexRefDesc textureTable[ MAX_TEXTURES ];
static int numRegisteredTextures = 0;

// Surface references recorded by __cudaRegisterSurface().
#define MAX_SURFACES 256
typedef struct
{
	int                            modIndex;     // index into moduleTable
	const char*                    surfRefName;  // device name with leading ':' stripped
	const struct surfaceReference* surfRefData;  // host-side descriptor
	int                            dim;
} cuSurfRefDesc;
static cuSurfRefDesc surfaceTable[ MAX_SURFACES ];
static int numRegisteredSurfaces = 0;

// Constant variables recorded by __cudaRegisterVar().
#define MAX_VARIABLES 256
typedef struct
{
	int         modIndex;  // index into moduleTable
	const char* varName;
	int         size;
} cuVarDesc;
static cuVarDesc variableTable[ MAX_VARIABLES ];
static int numRegisteredVariables = 0;

/**
 * Initializes the CUDA object manager for this scene under the CUDA context
 * lock, sizes mCudaModules to the number of registered fat binaries and
 * registers a PhysX GPU indicator client.
 *
 * \param scene         scene providing the task manager / GPU dispatcher
 * \param module        module forwarded to ApexCudaObjManager::init
 * \param modulePrefix  unused
 */
CudaModuleScene::CudaModuleScene(SceneIntl& scene, Module& module, const char* modulePrefix)
	: mSceneIntl(scene)
{
	PX_UNUSED(modulePrefix);

	PxTaskManager* tm = scene.getTaskManager();
	PxGpuDispatcher* gd = tm->getGpuDispatcher();
	PX_ASSERT(gd != NULL);
	PxScopedCudaLock _lock_(*gd->getCudaContextManager());

	ApexCudaObjManager::init(&module, &scene.getApexCudaTestManager(), gd);

	// One ApexCudaModule slot per registered fat binary; slots are initialized
	// on demand by getCudaModule().
	mCudaModules.resize((uint32_t)numRegisteredModules);

	ApexSDKIntl* apexSdk = GetInternalApexSDK();
	mPhysXGpuIndicator = apexSdk->registerPhysXIndicatorGpuClient();
}

/**
 * Releases all CUDA objects and modules (under the CUDA context lock) and
 * unregisters the PhysX GPU indicator client.
 */
void CudaModuleScene::destroy(SceneIntl&)
{
	{
		// The lock is scoped so that it is dropped before the indicator is unregistered.
		PxScopedCudaLock _lock_(*getGpuDispatcher()->getCudaContextManager());

		ApexCudaObjManager::releaseAll();

		for (uint32_t i = 0 ; i < mCudaModules.size() ; i++)
		{
			mCudaModules[i].release();
		}
	}

	ApexSDKIntl* apexSdk = GetInternalApexSDK();
	apexSdk->unregisterPhysXIndicatorGpuClient(mPhysXGpuIndicator);
	mPhysXGpuIndicator = NULL;
}

/** Notifies the active profile session (if any) that 'func' is about to be launched on 'stream'. */
void CudaModuleScene::onBeforeLaunchApexCudaFunc(const ApexCudaFunc& func, CUstream stream)
{
	if (mCudaProfileSession)
	{
		mCudaProfileSession->onFuncStart(func.getProfileId(), stream);
	}
}

/**
 * Notifies the active profile session (if any) that 'func' was launched and,
 * when kernel checking is enabled, synchronizes 'stream' and reports any error.
 */
void CudaModuleScene::onAfterLaunchApexCudaFunc(const ApexCudaFunc& func, CUstream stream)
{
	if (mCudaProfileSession)
	{
		mCudaProfileSession->onFuncFinish(func.getProfileId(), stream);
	}

	// With CUDA_KERNEL_CHECK_ALWAYS set, the 'if' is compiled out and the
	// block below runs unconditionally.
#if !CUDA_KERNEL_CHECK_ALWAYS
	if (mSceneIntl.getCudaKernelCheckEnabled())
#endif
	{
		CUresult ret = cuStreamSynchronize(stream);
		if ( CUDA_SUCCESS != ret )
		{
			APEX_INTERNAL_ERROR("Cuda Error %d after launch of func '%s'", ret, func.getName());
			PX_ALWAYS_ASSERT();
		}
	}
}

/**
 * Returns the ApexCudaModule for the given registered module index, calling
 * init() on it with the recorded fat binary first.
 *
 * \param modIndex  index previously stored by one of the __cudaRegister* hooks
 */
ApexCudaModule* CudaModuleScene::getCudaModule(int modIndex)
{
	// mCudaModules was sized from numRegisteredModules in the constructor.
	PX_ASSERT(modIndex >= 0 && (uint32_t)modIndex < mCudaModules.size());

	mCudaModules[(uint32_t)modIndex].init(moduleTable[(uint32_t)modIndex]);
	return &mCudaModules[(uint32_t)modIndex];
}

/**
 * Looks up 'texRef' by name in the registered texture table and, for the first
 * match, resolves the driver texture reference and initializes 'texRef' with
 * the array format, channel count, dimension and flags derived from the
 * registered descriptor. Does nothing if the name is not registered.
 */
void CudaModuleScene::initCudaObj(ApexCudaTexRef& texRef)
{
	const char* texRefName = texRef.getName();

	for (int j = 0 ; j < numRegisteredTextures ; j++)
	{
		if (nvidia::strcmp(textureTable[j].texRefName, texRefName) == 0)
		{
			ApexCudaModule* cudaModule = getCudaModule(textureTable[j].modIndex);
			PX_ASSERT(cudaModule->isValid());

			CUtexref cuTexRef;
			CUT_SAFE_CALL(cuModuleGetTexRef(&cuTexRef, cudaModule->getCuModule(), texRefName));

			const struct textureReference* texRefData = textureTable[j].texRefData;

			// Count the channels with a non-zero bit width; all present
			// channels are expected to have the same width as x.
			PX_ASSERT(texRefData->channelDesc.x > 0);
			int numChannels = 1;
			if (texRefData->channelDesc.y > 0)
			{
				PX_ASSERT(texRefData->channelDesc.y == texRefData->channelDesc.x);
				++numChannels;
			}
			if (texRefData->channelDesc.z > 0)
			{
				PX_ASSERT(texRefData->channelDesc.z == texRefData->channelDesc.x);
				++numChannels;
			}
			if (texRefData->channelDesc.w > 0)
			{
				PX_ASSERT(texRefData->channelDesc.w == texRefData->channelDesc.x);
				++numChannels;
			}

			// Map the runtime channel kind + bit width of x to a driver array
			// format. 0 is used as the "not mapped" marker and asserted on below.
			CUarray_format cuFormat = CUarray_format(0);
			switch (texRefData->channelDesc.f)
			{
			case cudaChannelFormatKindSigned:
				switch (texRefData->channelDesc.x)
				{
				case 8:
					cuFormat = CU_AD_FORMAT_SIGNED_INT8;
					break;
				case 16:
					cuFormat = CU_AD_FORMAT_SIGNED_INT16;
					break;
				case 32:
					cuFormat = CU_AD_FORMAT_SIGNED_INT32;
					break;
				}
				break;
			case cudaChannelFormatKindUnsigned:
				switch (texRefData->channelDesc.x)
				{
				case 8:
					cuFormat = CU_AD_FORMAT_UNSIGNED_INT8;
					break;
				case 16:
					cuFormat = CU_AD_FORMAT_UNSIGNED_INT16;
					break;
				case 32:
					cuFormat = CU_AD_FORMAT_UNSIGNED_INT32;
					break;
				}
				break;
			case cudaChannelFormatKindFloat:
				cuFormat = CU_AD_FORMAT_FLOAT;
				break;
			default:
				PX_ASSERT(0);
			}
			PX_ASSERT(cuFormat != 0);

			int cuFlags = 0;
			if (textureTable[j].read_normalized_float == 0)
			{
				cuFlags |= CU_TRSF_READ_AS_INTEGER;
			}
			if (textureTable[j].texRefData->normalized != 0)
			{
				cuFlags |= CU_TRSF_NORMALIZED_COORDINATES;
			}

			texRef.init(this, cuTexRef, cudaModule, cuFormat, numChannels, textureTable[j].dim, cuFlags);
			break;
		}
	}
}

/**
 * Looks up 'var' by name in the registered variable table and, for the first
 * match, resolves its device pointer and size and initializes 'var'.
 * Does nothing if the name is not registered.
 */
void CudaModuleScene::initCudaObj(ApexCudaVar& var)
{
	const char* varName = var.getName();

	for (int j = 0 ; j < numRegisteredVariables ; j++)
	{
		if (nvidia::strcmp(variableTable[j].varName, varName) == 0)
		{
			ApexCudaModule* cudaModule = getCudaModule(variableTable[j].modIndex);
			PX_ASSERT(cudaModule->isValid());

			// Checked like the other module lookups in this file; the outputs
			// are zero-initialized so a failed lookup never forwards garbage.
			CUdeviceptr cuDevPtr = 0;
			size_t size = 0;
			CUT_SAFE_CALL(cuModuleGetGlobal(&cuDevPtr, &size, cudaModule->getCuModule(), varName));

			var.init(this, cudaModule, cuDevPtr, size, getGpuDispatcher()->getCudaContextManager());
			break;
		}
	}
}

/**
 * Initializes 'func' with every registered kernel whose name passes
 * func.testNameMatch(). The scan deliberately does not stop at the first
 * match, so func.init() may be called more than once.
 */
void CudaModuleScene::initCudaObj(ApexCudaFunc& func)
{
	for (int j = 0 ; j < numRegisteredFunctions ; j++)
	{
		const char* funcName = functionTable[j].funcName;
		if (func.testNameMatch(funcName))
		{
			ApexCudaModule* cudaModule = getCudaModule(functionTable[j].modIndex);
			PX_ASSERT(cudaModule->isValid());

			CUfunction cuFunc = 0;
			CUT_SAFE_CALL(cuModuleGetFunction(&cuFunc, cudaModule->getCuModule(), funcName));

			func.init(this, funcName, cuFunc, cudaModule);
		}
	}
}

/**
 * Looks up 'surfRef' by name in the registered surface table and, for the
 * first match, resolves the driver surface reference and initializes
 * 'surfRef'. Returns without doing anything when the context manager reports
 * no SM 2.0 support, or when the name is not registered.
 */
void CudaModuleScene::initCudaObj(ApexCudaSurfRef& surfRef)
{
	if (getGpuDispatcher()->getCudaContextManager()->supportsArchSM20() == false)
	{
		return;
	}

	const char* surfRefName = surfRef.getName();

	for (int j = 0 ; j < numRegisteredSurfaces ; j++)
	{
		if (nvidia::strcmp(surfaceTable[j].surfRefName, surfRefName) == 0)
		{
			ApexCudaModule* cudaModule = getCudaModule(surfaceTable[j].modIndex);
			PX_ASSERT(cudaModule->isValid());

			CUsurfref cuSurfRef;
			CUT_SAFE_CALL(cuModuleGetSurfRef(&cuSurfRef, cudaModule->getCuModule(), surfRefName));

			surfRef.init(this, cuSurfRef, cudaModule);
			break;
		}
	}
}

/*
 * These calls are all made _before_ main() during static initialization
 * of your APEX module.  So calling into APEX Framework or other
 * external code modules is out of the question.
 */

#include "driver_types.h"

#define CUDARTAPI __stdcall

typedef struct uint3_t
{
	unsigned int x, y, z;
} uint3;

typedef struct dim3_t
{
	unsigned int x, y, z;
} dim3;

/**
 * Records a fat binary in moduleTable and returns its table index disguised
 * as a handle, or NULL when the table is full.
 *
 * NOTE(review): the handle for the first registered module (index 0) is
 * indistinguishable from the NULL returned on overflow.
 */
extern "C"
void** CUDARTAPI __cudaRegisterFatBinary(void* fatBin)
{
	//HACK to get real fatbin in CUDA 4.0
	struct CUIfatbinStruct
	{
		int   magic;
		int   version;
		void* fatbinArray;
		char* fatbinFile;
	};
	const CUIfatbinStruct* fatbinStruct = (const CUIfatbinStruct*)fatBin;
	if (fatbinStruct->magic == 0x466243B1)
	{
		// Wrapper struct detected by its magic value: record the wrapped array instead.
		fatBin = fatbinStruct->fatbinArray;
	}

	if (numRegisteredModules < MAX_MODULES)
	{
		moduleTable[ numRegisteredModules ] = fatBin;
		return (void**)(size_t) numRegisteredModules++;
	}
	return NULL;
}

/**
 * Clears the moduleTable slot addressed by the handle. Handles that do not
 * correspond to a registered module are ignored.
 */
extern "C"
void CUDARTAPI __cudaUnregisterFatBinary(void** fatCubinHandle)
{
	const size_t modIndex = (size_t) fatCubinHandle;
	if (modIndex < (size_t) numRegisteredModules)
	{
		moduleTable[ modIndex ] = 0;
	}
}

/**
 * Records a texture reference (module index, name, host descriptor, dimension
 * and read mode) in textureTable. Silently dropped when the table is full.
 */
extern "C"
void CUDARTAPI __cudaRegisterTexture(
    void**                         fatCubinHandle,
    const struct textureReference* hostvar,
    const void**                   deviceAddress,
    const char*                    deviceName,
    int                            dim,
    int                            read_normalized_float,
    int                            ext)
{
	PX_UNUSED(deviceAddress);
	PX_UNUSED(ext);

	if (numRegisteredTextures < MAX_TEXTURES)
	{
		//Fix for CUDA 5.5 - remove leading "::"
		while (*deviceName == ':')
		{
			++deviceName;
		}

		// We need this association of function to module in order to find textures and globals
		textureTable[ numRegisteredTextures ].modIndex = (int)(size_t) fatCubinHandle;
		textureTable[ numRegisteredTextures ].texRefName = deviceName;
		textureTable[ numRegisteredTextures ].texRefData = hostvar;
		textureTable[ numRegisteredTextures ].dim = dim;
		textureTable[ numRegisteredTextures ].read_normalized_float = read_normalized_float;
		numRegisteredTextures++;
	}
}

/**
 * Records a surface reference (module index, name, host descriptor and
 * dimension) in surfaceTable. Silently dropped when the table is full.
 */
extern "C"
void CUDARTAPI __cudaRegisterSurface(
    void**                         fatCubinHandle,
    const struct surfaceReference* hostvar,
    const void**                   deviceAddress,
    const char*                    deviceName,
    int                            dim,
    int                            ext)
{
	PX_UNUSED(deviceAddress);
	PX_UNUSED(ext);

	if (numRegisteredSurfaces < MAX_SURFACES)
	{
		//Fix for CUDA 5.5 - remove leading "::"
		while (*deviceName == ':')
		{
			++deviceName;
		}

		surfaceTable[ numRegisteredSurfaces ].modIndex = (int)(size_t) fatCubinHandle;
		surfaceTable[ numRegisteredSurfaces ].surfRefName = deviceName;
		surfaceTable[ numRegisteredSurfaces ].surfRefData = hostvar;
		surfaceTable[ numRegisteredSurfaces ].dim = dim;
		numRegisteredSurfaces++;
	}
}

/**
 * Records a device variable (module index, name and size) in variableTable.
 * Only variables registered with a non-zero 'constant' flag are recorded;
 * others, and any registration once the table is full, are ignored.
 */
extern "C"
void CUDARTAPI __cudaRegisterVar(
    void**      fatCubinHandle,
    char*       hostVar,
    char*       deviceAddress,
    const char* deviceName,
    int         ext,
    int         size,
    int         constant,
    int         global)
{
	PX_UNUSED(hostVar);
	PX_UNUSED(deviceAddress);
	PX_UNUSED(ext);
	PX_UNUSED(global);

	if (constant != 0 && numRegisteredVariables < MAX_VARIABLES)
	{
		variableTable[ numRegisteredVariables ].modIndex = (int)(size_t) fatCubinHandle;
		variableTable[ numRegisteredVariables ].varName = deviceName;
		variableTable[ numRegisteredVariables ].size = size;
		numRegisteredVariables++;
	}
}

/** No-op: present only so that the registration call resolves at link time. */
extern "C"
void CUDARTAPI __cudaRegisterShared(
    void** fatCubinHandle,
    void** devicePtr
)
{
	PX_UNUSED(fatCubinHandle);
	PX_UNUSED(devicePtr);
}

/**
 * Records a kernel (module index and device name) in functionTable and
 * mirrors its name into funcNameTable. Silently dropped when the table is full.
 */
extern "C"
void CUDARTAPI __cudaRegisterFunction(
    void**      fatCubinHandle,
    const char* hostFun,
    char*       deviceFun,
    const char* deviceName,
    int         thread_limit,
    uint3*      tid,
    uint3*      bid,
    dim3*       bDim,
    dim3*       gDim,
    int*        wSize)
{
	PX_UNUSED(hostFun);
	PX_UNUSED(deviceFun);
	PX_UNUSED(thread_limit);
	PX_UNUSED(tid);
	PX_UNUSED(bid);
	PX_UNUSED(bDim);
	PX_UNUSED(gDim);
	PX_UNUSED(wSize);

	if (numRegisteredFunctions < MAX_FUNCTIONS)
	{
		// We need this association of function to module in order to find textures and globals
		functionTable[ numRegisteredFunctions ].modIndex = (int)(size_t) fatCubinHandle;
		functionTable[ numRegisteredFunctions ].funcName = deviceName;
		funcNameTable[ numRegisteredFunctions ] = deviceName;
		numRegisteredFunctions++;
	}
}

/* These functions are implemented just to resolve link dependencies */

/** Stub: ignores its argument and reports success. */
extern "C"
cudaError_t CUDARTAPI cudaLaunch(const char* entry)
{
	PX_UNUSED(entry);
	return cudaSuccess;
}

/** Stub: ignores its arguments and reports success. */
extern "C"
cudaError_t CUDARTAPI cudaSetupArgument(
    const void* arg,
    size_t      size,
    size_t      offset)
{
	PX_UNUSED(arg);
	PX_UNUSED(size);
	PX_UNUSED(offset);
	return cudaSuccess;
}

/** Builds a channel format descriptor from the given per-channel values and kind. */
extern "C"
struct cudaChannelFormatDesc CUDARTAPI cudaCreateChannelDesc(
    int x, int y, int z, int w, enum cudaChannelFormatKind f)
{
	struct cudaChannelFormatDesc desc;
	desc.x = x;
	desc.y = y;
	desc.z = z;
	desc.w = w;
	desc.f = f;
	return desc;
}

}
} // namespace nvidia

#endif