From 3dfe2108cfab31ba3ee5527e217d0d8e99a51162 Mon Sep 17 00:00:00 2001 From: git perforce import user Date: Tue, 25 Oct 2016 12:29:14 -0600 Subject: Initial commit: PhysX 3.4.0 Update @ 21294896 APEX 1.4.0 Update @ 21275617 [CL 21300167] --- APEX_1.4/module/common/src/CudaModuleScene.cpp | 545 +++++++++++++++++++++++++ 1 file changed, 545 insertions(+) create mode 100644 APEX_1.4/module/common/src/CudaModuleScene.cpp (limited to 'APEX_1.4/module/common/src/CudaModuleScene.cpp') diff --git a/APEX_1.4/module/common/src/CudaModuleScene.cpp b/APEX_1.4/module/common/src/CudaModuleScene.cpp new file mode 100644 index 00000000..34a8fccd --- /dev/null +++ b/APEX_1.4/module/common/src/CudaModuleScene.cpp @@ -0,0 +1,545 @@ +/* + * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. + */ + + +#include "ApexDefs.h" +#if APEX_CUDA_SUPPORT + +#include "Apex.h" +#include "ApexSDKIntl.h" +#include "SceneIntl.h" +#include "ApexCutil.h" +#include "CudaModuleScene.h" +#include +#include + +#include "PxTaskManager.h" +#include "PxGpuDispatcher.h" +#include "PxCudaContextManager.h" + +#define CUDA_KERNEL_CHECK_ALWAYS 0 + +namespace nvidia +{ +class PhysXGpuIndicator; + +namespace apex +{ + +/** + * Workaround hacks for using nvcc --compiler output object files + * without linking with CUDART. We must implement our own versions + * of these functions that the object files are hard-coded to call into. + */ + +#define MAX_MODULES 64 +static void* moduleTable[ MAX_MODULES ]; +static int numRegisteredModules = 0; + +#define MAX_FUNCTIONS 256 +typedef struct +{ + int modIndex; + const char* funcName; +} cuFuncDesc; +static cuFuncDesc functionTable[ MAX_FUNCTIONS ]; +static int numRegisteredFunctions = 0; + +const char* funcNameTable[ MAX_FUNCTIONS ]; + + +#define MAX_TEXTURES 256 +typedef struct +{ + int modIndex; + const char* texRefName; + const struct textureReference* texRefData; + int dim; + int read_normalized_float; +} cuTexRefDesc; +static cuTexRefDesc textureTable[ MAX_TEXTURES ]; +static int numRegisteredTextures = 0; + + +#define MAX_SURFACES 256 +typedef struct +{ + int modIndex; + const char* surfRefName; + const struct surfaceReference* surfRefData; + int dim; +} cuSurfRefDesc; +static cuSurfRefDesc surfaceTable[ MAX_SURFACES ]; +static int numRegisteredSurfaces = 0; + + +#define MAX_VARIABLES 256 +typedef struct +{ + int modIndex; + const char* varName; + int size; +} cuVarDesc; +static cuVarDesc variableTable[ MAX_VARIABLES ]; +static int numRegisteredVariables = 0; + +CudaModuleScene::CudaModuleScene(SceneIntl& scene, Module& module, const char* modulePrefix) + : mSceneIntl(scene) +{ + PX_UNUSED(modulePrefix); + + PxTaskManager* tm = scene.getTaskManager(); + PxGpuDispatcher* gd = tm->getGpuDispatcher(); + PX_ASSERT(gd != NULL); + PxScopedCudaLock _lock_(*gd->getCudaContextManager()); + + ApexCudaObjManager::init(&module, &scene.getApexCudaTestManager(), gd); + + mCudaModules.resize((uint32_t)numRegisteredModules); + + ApexSDKIntl* apexSdk = GetInternalApexSDK(); + mPhysXGpuIndicator = apexSdk->registerPhysXIndicatorGpuClient(); +} + +void CudaModuleScene::destroy(SceneIntl&) +{ + { + PxScopedCudaLock _lock_(*getGpuDispatcher()->getCudaContextManager()); + + ApexCudaObjManager::releaseAll(); + + for (uint32_t i = 0 ; i < mCudaModules.size() ; i++) + { + mCudaModules[i].release(); + } + } + + ApexSDKIntl* apexSdk = GetInternalApexSDK(); + apexSdk->unregisterPhysXIndicatorGpuClient(mPhysXGpuIndicator); + mPhysXGpuIndicator = NULL; +} + +void CudaModuleScene::onBeforeLaunchApexCudaFunc(const ApexCudaFunc& func, CUstream stream) +{ + if (mCudaProfileSession) + { + mCudaProfileSession->onFuncStart(func.getProfileId(), stream); + } +} + +void CudaModuleScene::onAfterLaunchApexCudaFunc(const ApexCudaFunc& func, CUstream stream) +{ + if (mCudaProfileSession) + { + mCudaProfileSession->onFuncFinish(func.getProfileId(), stream); + } + +#if !CUDA_KERNEL_CHECK_ALWAYS + if (mSceneIntl.getCudaKernelCheckEnabled()) +#endif + { + CUresult ret = cuStreamSynchronize(stream); + if ( CUDA_SUCCESS != ret ) + { + APEX_INTERNAL_ERROR("Cuda Error %d after launch of func '%s'", ret, func.getName()); + PX_ALWAYS_ASSERT(); + } + } +} + +ApexCudaModule* CudaModuleScene::getCudaModule(int modIndex) +{ + mCudaModules[(uint32_t)modIndex].init(moduleTable[(uint32_t)modIndex]); + return &mCudaModules[(uint32_t)modIndex]; +} + +void CudaModuleScene::initCudaObj(ApexCudaTexRef& texRef) +{ + const char* texRefName = texRef.getName(); + + for (int j = 0 ; j < numRegisteredTextures ; j++) + { + if (nvidia::strcmp(textureTable[j].texRefName, texRefName) == 0) + { + ApexCudaModule* cudaModule = getCudaModule(textureTable[j].modIndex); + PX_ASSERT(cudaModule->isValid()); + + CUtexref cuTexRef; + CUT_SAFE_CALL(cuModuleGetTexRef(&cuTexRef, cudaModule->getCuModule(), texRefName)); + + const struct textureReference* texRefData = textureTable[j].texRefData; + + PX_ASSERT(texRefData->channelDesc.x > 0); + int numChannels = 1; + if (texRefData->channelDesc.y > 0) + { + PX_ASSERT(texRefData->channelDesc.y == texRefData->channelDesc.x); + ++numChannels; + } + if (texRefData->channelDesc.z > 0) + { + PX_ASSERT(texRefData->channelDesc.z == texRefData->channelDesc.x); + ++numChannels; + } + if (texRefData->channelDesc.w > 0) + { + PX_ASSERT(texRefData->channelDesc.w == texRefData->channelDesc.x); + ++numChannels; + } + + CUarray_format cuFormat = CUarray_format(0); + switch (texRefData->channelDesc.f) + { + case cudaChannelFormatKindSigned: + switch (texRefData->channelDesc.x) + { + case 8: + cuFormat = CU_AD_FORMAT_SIGNED_INT8; + break; + case 16: + cuFormat = CU_AD_FORMAT_SIGNED_INT16; + break; + case 32: + cuFormat = CU_AD_FORMAT_SIGNED_INT32; + break; + } + break; + case cudaChannelFormatKindUnsigned: + switch (texRefData->channelDesc.x) + { + case 8: + cuFormat = CU_AD_FORMAT_UNSIGNED_INT8; + break; + case 16: + cuFormat = CU_AD_FORMAT_UNSIGNED_INT16; + break; + case 32: + cuFormat = CU_AD_FORMAT_UNSIGNED_INT32; + break; + } + break; + case cudaChannelFormatKindFloat: + cuFormat = CU_AD_FORMAT_FLOAT; + break; + default: + PX_ASSERT(0); + }; + PX_ASSERT(cuFormat != 0); + + int cuFlags = 0; + if (textureTable[j].read_normalized_float == 0) + { + cuFlags |= CU_TRSF_READ_AS_INTEGER; + } + if (textureTable[j].texRefData->normalized != 0) + { + cuFlags |= CU_TRSF_NORMALIZED_COORDINATES; + } + + texRef.init(this, cuTexRef, cudaModule, cuFormat, numChannels, textureTable[j].dim, cuFlags); + break; + } + } +} + +void CudaModuleScene::initCudaObj(ApexCudaVar& var) +{ + const char* varName = var.getName(); + + for (int j = 0 ; j < numRegisteredVariables ; j++) + { + if (nvidia::strcmp(variableTable[j].varName, varName) == 0) + { + ApexCudaModule* cudaModule = getCudaModule(variableTable[j].modIndex); + PX_ASSERT(cudaModule->isValid()); + + CUdeviceptr cuDevPtr; + size_t size; + cuModuleGetGlobal(&cuDevPtr, &size, cudaModule->getCuModule(), varName); + + var.init(this, cudaModule, cuDevPtr, size, getGpuDispatcher()->getCudaContextManager()); + break; + } + } +} + +void CudaModuleScene::initCudaObj(ApexCudaFunc& func) +{ + for (int j = 0 ; j < numRegisteredFunctions ; j++) + { + const char* funcName = functionTable[j].funcName; + if (func.testNameMatch(funcName)) + { + ApexCudaModule* cudaModule = getCudaModule(functionTable[j].modIndex); + PX_ASSERT(cudaModule->isValid()); + + CUfunction cuFunc = 0; + CUT_SAFE_CALL(cuModuleGetFunction(&cuFunc, cudaModule->getCuModule(), funcName)); + + func.init(this, funcName, cuFunc, cudaModule); + } + } +} + +void CudaModuleScene::initCudaObj(ApexCudaSurfRef& surfRef) +{ + if (getGpuDispatcher()->getCudaContextManager()->supportsArchSM20() == false) + { + return; + } + + const char* surfRefName = surfRef.getName(); + + for (int j = 0 ; j < numRegisteredSurfaces ; j++) + { + if (nvidia::strcmp(surfaceTable[j].surfRefName, surfRefName) == 0) + { + ApexCudaModule* cudaModule = getCudaModule(surfaceTable[j].modIndex); + PX_ASSERT(cudaModule->isValid()); + + CUsurfref cuSurfRef; + CUT_SAFE_CALL(cuModuleGetSurfRef(&cuSurfRef, cudaModule->getCuModule(), surfRefName)); + + surfRef.init(this, cuSurfRef, cudaModule); + break; + } + } + +} + +/* + * These calls are all made _before_ main() during static initialization + * of your APEX module. So calling into APEX Framework or other + * external code modules is out of the question. + */ + +#include "driver_types.h" + +#define CUDARTAPI __stdcall + +typedef struct uint3_t +{ + unsigned int x, y, z; +} uint3; + +typedef struct dim3_t +{ + unsigned int x, y, z; +} dim3; + +extern "C" +void** CUDARTAPI __cudaRegisterFatBinary(void* fatBin) +{ + //HACK to get real fatbin in CUDA 4.0 + struct CUIfatbinStruct + { + int magic; + int version; + void* fatbinArray; + char* fatbinFile; + }; + const CUIfatbinStruct* fatbinStruct = (const CUIfatbinStruct*)fatBin; + if (fatbinStruct->magic == 0x466243B1) + { + fatBin = fatbinStruct->fatbinArray; + } + + if (numRegisteredModules < MAX_MODULES) + { + moduleTable[ numRegisteredModules ] = fatBin; + return (void**)(size_t) numRegisteredModules++; + } + return NULL; +} + +extern "C" +void CUDARTAPI __cudaUnregisterFatBinary(void** fatCubinHandle) +{ + moduleTable[(int)(size_t) fatCubinHandle ] = 0; +} + +extern "C" +void CUDARTAPI __cudaRegisterTexture( + void** fatCubinHandle, + const struct textureReference* hostvar, + const void** deviceAddress, + const char* deviceName, + int dim, + int read_normalized_float, + int ext) +{ + PX_UNUSED(fatCubinHandle); + PX_UNUSED(hostvar); + PX_UNUSED(deviceAddress); + PX_UNUSED(deviceName); + PX_UNUSED(dim); + PX_UNUSED(read_normalized_float); + PX_UNUSED(ext); + + if (numRegisteredTextures < MAX_TEXTURES) + { + //Fix for CUDA 5.5 - remove leading "::" + while (*deviceName == ':') + { + ++deviceName; + } + + // We need this association of function to module in order to find textures and globals + textureTable[ numRegisteredTextures ].modIndex = (int)(size_t) fatCubinHandle; + textureTable[ numRegisteredTextures ].texRefName = deviceName; + textureTable[ numRegisteredTextures ].texRefData = hostvar; + textureTable[ numRegisteredTextures ].dim = dim; + textureTable[ numRegisteredTextures ].read_normalized_float = read_normalized_float; + numRegisteredTextures++; + } +} + +extern "C" +void CUDARTAPI __cudaRegisterSurface( + void** fatCubinHandle, + const struct surfaceReference* hostvar, + const void** deviceAddress, + const char* deviceName, + int dim, + int ext) +{ + PX_UNUSED(fatCubinHandle); + PX_UNUSED(hostvar); + PX_UNUSED(deviceAddress); + PX_UNUSED(deviceName); + PX_UNUSED(dim); + PX_UNUSED(ext); + + if (numRegisteredSurfaces < MAX_SURFACES) + { + //Fix for CUDA 5.5 - remove leading "::" + while (*deviceName == ':') + { + ++deviceName; + } + + surfaceTable[ numRegisteredSurfaces ].modIndex = (int)(size_t) fatCubinHandle; + surfaceTable[ numRegisteredSurfaces ].surfRefName = deviceName; + surfaceTable[ numRegisteredSurfaces ].surfRefData = hostvar; + surfaceTable[ numRegisteredSurfaces ].dim = dim; + numRegisteredSurfaces++; + } +} + +extern "C" void CUDARTAPI __cudaRegisterVar( + void** fatCubinHandle, + char* hostVar, + char* deviceAddress, + const char* deviceName, + int ext, + int size, + int constant, + int global) +{ + PX_UNUSED(fatCubinHandle); + PX_UNUSED(hostVar); + PX_UNUSED(deviceAddress); + PX_UNUSED(deviceName); + PX_UNUSED(ext); + PX_UNUSED(size); + PX_UNUSED(constant); + PX_UNUSED(global); + + if (constant != 0 && numRegisteredVariables < MAX_VARIABLES) + { + variableTable[ numRegisteredVariables ].modIndex = (int)(size_t) fatCubinHandle; + variableTable[ numRegisteredVariables ].varName = deviceName; + variableTable[ numRegisteredVariables ].size = size; + numRegisteredVariables++; + } +} + + +extern "C" void CUDARTAPI __cudaRegisterShared( + void** fatCubinHandle, + void** devicePtr +) +{ + PX_UNUSED(fatCubinHandle); + PX_UNUSED(devicePtr); +} + + + +extern "C" +void CUDARTAPI __cudaRegisterFunction( + void** fatCubinHandle, + const char* hostFun, + char* deviceFun, + const char* deviceName, + int thread_limit, + uint3* tid, + uint3* bid, + dim3* bDim, + dim3* gDim, + int* wSize) +{ + PX_UNUSED(hostFun); + PX_UNUSED(deviceFun); + PX_UNUSED(thread_limit); + PX_UNUSED(tid); + PX_UNUSED(bid); + PX_UNUSED(bDim); + PX_UNUSED(gDim); + PX_UNUSED(wSize); + + if (numRegisteredFunctions < MAX_FUNCTIONS) + { + // We need this association of function to module in order to find textures and globals + functionTable[ numRegisteredFunctions ].modIndex = (int)(size_t) fatCubinHandle; + functionTable[ numRegisteredFunctions ].funcName = deviceName; + funcNameTable[ numRegisteredFunctions ] = deviceName; + numRegisteredFunctions++; + } +} + +/* These functions are implemented just to resolve link dependencies */ + +extern "C" +cudaError_t CUDARTAPI cudaLaunch(const char* entry) +{ + PX_UNUSED(entry); + return cudaSuccess; +} + +extern "C" +cudaError_t CUDARTAPI cudaSetupArgument( + const void* arg, + size_t size, + size_t offset) +{ + PX_UNUSED(arg); + PX_UNUSED(size); + PX_UNUSED(offset); + return cudaSuccess; +} + +extern "C" +struct cudaChannelFormatDesc CUDARTAPI cudaCreateChannelDesc( + int x, int y, int z, int w, enum cudaChannelFormatKind f) +{ + struct cudaChannelFormatDesc desc; + desc.x = x; + desc.y = y; + desc.z = z; + desc.w = w; + desc.f = f; + return desc; +} + +} +} // namespace nvidia + +#endif -- cgit v1.2.3