diff options
Diffstat (limited to 'PhysX_3.4/Source/LowLevelCloth/src/windows/CuSolver.cpp')
| -rw-r--r-- | PhysX_3.4/Source/LowLevelCloth/src/windows/CuSolver.cpp | 556 |
1 files changed, 556 insertions, 0 deletions
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/windows/CuSolver.cpp b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuSolver.cpp new file mode 100644 index 00000000..68238664 --- /dev/null +++ b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuSolver.cpp @@ -0,0 +1,556 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#include "foundation/PxProfiler.h" +#include "CuSolver.h" +#include "CuCloth.h" +#include "ClothImpl.h" +#include "CuFabric.h" +#include "CuFactory.h" +#include "CuSolverKernel.h" +#include "CuContextLock.h" +#include "CuCheckSuccess.h" +#include "IterationState.h" +#include "CudaKernelWrangler.h" +#include "PsUtilities.h" +#include "PsSort.h" +#include "PsFoundation.h" + +#if PX_NVTX +#include "nvToolsExt.h" +#endif + +//#define ENABLE_CUDA_PRINTF PX_DEBUG // warning: not thread safe +#define ENABLE_CUDA_PRINTF 0 + +#if ENABLE_CUDA_PRINTF +extern "C" cudaError_t cudaPrintfInit(CUmodule hmod, size_t bufferLen = 1048576); +extern "C" void cudaPrintfEnd(); +extern "C" cudaError_t cudaPrintfDisplay(CUmodule hmod, void* outputFP = NULL, bool showThreadID = false); +#endif + +using namespace physx; + +namespace +{ +//for KernelWrangler interface +const char* gKernelName = cloth::getKernelFunctionName(); +} + +namespace +{ +template <typename T> +struct CuDeviceAllocator +{ + CuDeviceAllocator(physx::PxCudaContextManager* ctx) : mManager(ctx->getMemoryManager()) + { + } + + T* allocate(size_t n) + { + return reinterpret_cast<T*>(mManager->alloc(physx::PxCudaBufferMemorySpace::T_GPU, n * sizeof(T))); + } + + void deallocate(T* ptr) + { + mManager->free(physx::PxCudaBufferMemorySpace::T_GPU, reinterpret_cast<physx::PxCudaBufferPtr>(ptr)); + } + + physx::PxCudaMemoryManager* mManager; +}; +} + +cloth::CuSolver::CuSolver(CuFactory& factory) +: CuContextLock(factory) +, mFactory(factory) +, mClothData(mFactory.mContextManager) +, mClothDataHostCopy(CuHostAllocator(mFactory.mContextManager, cudaHostAllocWriteCombined)) +, mClothDataDirty(false) +, mFrameData(getMappedAllocator<CuFrameData>(mFactory.mContextManager)) +, mIterationData(getMappedAllocator<CuIterationData>(mFactory.mContextManager)) +, mIterationDataBegin(0) +, mFrameDt(0.0f) +, mSharedMemorySize(0) +, mSharedMemoryLimit(0) +, mStartSimulationTask(&CuSolver::beginFrame, "cloth.CuSolver.startSimulation") +, mKernelSimulationTask(&CuSolver::executeKernel, "cloth.CuSolver.kernelSimulation") +, mEndSimulationTask(&CuSolver::endFrame, "cloth.CuSolver.endSimulation") +, mStream(0) +, mKernelModule(0) +, mKernelFunction(0) +, mKernelSharedMemorySize(0) +, mClothIndex(CuDeviceAllocator<uint32_t>(mFactory.mContextManager).allocate(1)) +, mInterCollisionDistance(0.0f) +, mInterCollisionStiffness(1.0f) +, mInterCollisionIterations(1) +, mInterCollisionScratchMem(NULL) +, mInterCollisionScratchMemSize(0) +, mKernelWrangler(getDispatcher(), physx::shdfnd::getFoundation().getErrorCallback(), &gKernelName, 1) +, mSimulateNvtxRangeId(0) +, mCudaError(mKernelWrangler.hadError()) +{ + if(mCudaError) + { + CuContextLock::release(); + return; + } + + mStartSimulationTask.mSolver = this; + mKernelSimulationTask.mSolver = this; + mEndSimulationTask.mSolver = this; + + if(mFactory.mContextManager->getUsingConcurrentStreams()) + checkSuccess(cuStreamCreate(&mStream, 0)); + + if(1) + { + mKernelModule = mKernelWrangler.getCuModule(0); + mKernelFunction = mKernelWrangler.getCuFunction(0); + } + else + { + // load from ptx instead of embedded SASS, for iterating without recompile + checkSuccess(cuModuleLoad(&mKernelModule, "CuSolverKernel.ptx")); + checkSuccess(cuModuleGetFunction(&mKernelFunction, mKernelModule, getKernelFunctionName())); + shdfnd::getFoundation().error(PX_INFO, "Cloth kernel code loaded from CuSolverKernel.ptx"); + } + + // get amount of statically allocated shared memory + checkSuccess(cuFuncGetAttribute(&mKernelSharedMemorySize, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, mKernelFunction)); + + // extract CuKernelData device pointer + size_t size = 0; + CUdeviceptr ptr = 0; + checkSuccess(cuModuleGetGlobal(&ptr, &size, mKernelModule, getKernelDataName())); + mKernelData = CuDevicePointer<CuKernelData>(reinterpret_cast<CuKernelData*>(ptr)); + + // initialize cloth index + checkSuccess(cuMemsetD32(mClothIndex.dev(), 0, 1)); + + CuContextLock::release(); +} + +cloth::CuSolver::~CuSolver() +{ + PX_ASSERT(mCloths.empty()); + + CuContextLock::acquire(); + + CuKernelData kernelData = {}; + *mKernelData = kernelData; + + CuDeviceAllocator<uint32_t>(mFactory.mContextManager).deallocate(mClothIndex.get()); + + if(mStream) + checkSuccess(cuStreamDestroy(mStream)); + + if(mInterCollisionScratchMem) + PX_FREE(mInterCollisionScratchMem); +} + +void cloth::CuSolver::updateKernelData() +{ + CuKernelData kernelData; + + kernelData.mClothIndex = mClothIndex.get(); + kernelData.mClothData = mClothData.begin().get(); + kernelData.mFrameData = getDevicePointer(mFrameData); + + *mKernelData = kernelData; +} + +physx::PxGpuDispatcher& cloth::CuSolver::getDispatcher() const +{ + return *mFactory.mContextManager->getGpuDispatcher(); +} + +namespace +{ +struct ClothSimCostGreater +{ + bool operator()(const cloth::CuCloth* left, const cloth::CuCloth* right) const + { + return left->mNumParticles * left->mSolverFrequency > right->mNumParticles * right->mSolverFrequency; + } +}; +} + +void cloth::CuSolver::addCloth(Cloth* cloth) +{ + CuCloth& cuCloth = static_cast<CuClothImpl&>(*cloth).mCloth; + + PX_ASSERT(mCloths.find(&cuCloth) == mCloths.end()); + + mCloths.pushBack(&cuCloth); + // trigger update of mClothData array + cuCloth.notifyChanged(); + + // sort cloth instances by size + shdfnd::sort(mCloths.begin(), mCloths.size(), ClothSimCostGreater()); + + CuContextLock contextLock(mFactory); + + // resize containers and update kernel data + mClothDataHostCopy.resize(mCloths.size()); + mClothData.resize(mCloths.size()); + mFrameData.resize(mCloths.size()); + updateKernelData(); +} + +void cloth::CuSolver::removeCloth(Cloth* cloth) +{ + CuCloth& cuCloth = static_cast<CuClothImpl&>(*cloth).mCloth; + + ClothVector::Iterator begin = mCloths.begin(), end = mCloths.end(); + ClothVector::Iterator it = mCloths.find(&cuCloth); + + if(it == end) + return; // not found + + uint32_t index = uint32_t(it - begin); + + mCloths.remove(index); + mClothDataHostCopy.remove(index); + mClothData.resize(mCloths.size()); + mClothDataDirty = true; +} + +physx::PxBaseTask& cloth::CuSolver::simulate(float dt, physx::PxBaseTask& continuation) +{ + mFrameDt = dt; + + if(mCloths.empty() || mCudaError) + { + continuation.addReference(); + return continuation; + } + + physx::PxGpuDispatcher& disp = getDispatcher(); + mEndSimulationTask.setContinuation(&continuation); + disp.addPostLaunchDependent(mEndSimulationTask); + mKernelSimulationTask.setContinuation(&disp.getPostLaunchTask()); + disp.getPostLaunchTask().removeReference(); + disp.addPreLaunchDependent(mKernelSimulationTask); + mStartSimulationTask.setContinuation(&disp.getPreLaunchTask()); + disp.getPreLaunchTask().removeReference(); + + mEndSimulationTask.removeReference(); + mKernelSimulationTask.removeReference(); + + return mStartSimulationTask; +} + +void cloth::CuSolver::beginFrame() +{ + CuContextLock contextLock(mFactory); + + PX_PROFILE_START_CROSSTHREAD("cloth.CuSolver.simulate", 0); + + CuIterationData* iterationDataBegin = mIterationData.empty() ? 0 : &mIterationData.front(); + + mFrameData.resize(0); + mIterationData.resize(0); + + // update cloth data + ClothVector::Iterator cIt, cEnd = mCloths.end(); + CuPinnedVector<CuClothData>::Type::Iterator dIt = mClothDataHostCopy.begin(); + for(cIt = mCloths.begin(); cIt != cEnd; ++cIt, ++dIt) + mClothDataDirty |= (*cIt)->updateClothData(*dIt); + + if(mClothDataDirty) + { + /* find optimal number of cloths per SM */ + + // at least 192 threads per block (e.g. CuCollision::buildAcceleration) + uint32_t numSMs = (uint32_t)mFactory.mContextManager->getMultiprocessorCount(); + uint32_t maxClothsPerSM = PxMin(mFactory.mMaxThreadsPerBlock / 192, (mCloths.size() + numSMs - 1) / numSMs); + + // tuning parameters: relative performance per numSharedPositions + float weights[3] = { 0.4f, 0.8f, 1.0f }; + + // try all possible number of cloths per SM and estimate performance + float maxWeightSum = 0.0f; + uint32_t numClothsPerSM = 0; + for(uint32_t i = 1; i <= maxClothsPerSM; ++i) + { + uint32_t sharedMemoryLimit = (mFactory.mContextManager->getSharedMemPerBlock() / i) - mKernelSharedMemorySize; + + float weightSum = 0.0f; + for(cIt = mCloths.begin(); cIt != cEnd; ++cIt) + { + uint32_t sharedMemorySize = (*cIt)->mSharedMemorySize; + uint32_t positionsSize = (*cIt)->mNumParticles * sizeof(PxVec4); + + if(sharedMemorySize > sharedMemoryLimit) + break; + + uint32_t numSharedPositions = PxMin(2u, (sharedMemoryLimit - sharedMemorySize) / positionsSize); + + weightSum += weights[numSharedPositions] * positionsSize; + } + // tuning parameter: inverse performance for running i cloths per SM + weightSum *= 2.0f + i; + + if(cIt == cEnd && weightSum > maxWeightSum) + { + maxWeightSum = weightSum; + numClothsPerSM = i; + } + } + PX_ASSERT(numClothsPerSM); + + // update block size + uint32_t numThreadsPerBlock = mFactory.mMaxThreadsPerBlock / numClothsPerSM & ~31; + + // Workaround for nvbug 1709919: theoretically, register usage should allow us to launch at least + // mFactory.mMaxThreadsPerBlock threads, because that value corresponds to __launch_bounds__(maxThreadsPerBlock). + CUdevice device = 0; + checkSuccess(cuCtxGetDevice(&device)); + int registersPerBlock = 0, kernelRegisterCount = 0; + checkSuccess(cuDeviceGetAttribute(®istersPerBlock, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, device)); + checkSuccess(cuFuncGetAttribute(&kernelRegisterCount, CU_FUNC_ATTRIBUTE_NUM_REGS, mKernelFunction)); + numThreadsPerBlock = PxMin(numThreadsPerBlock, uint32_t(registersPerBlock / kernelRegisterCount)); + PX_ASSERT(numThreadsPerBlock >= 192); + + if(mFactory.mNumThreadsPerBlock != numThreadsPerBlock) + { + checkSuccess( + cuFuncSetBlockShape(mKernelFunction, int(mFactory.mNumThreadsPerBlock = numThreadsPerBlock), 1, 1)); + } + + // remember num cloths per SM in terms of max shared memory per block + mSharedMemoryLimit = + (mFactory.mContextManager->getSharedMemPerBlock() / numClothsPerSM) - mKernelSharedMemorySize; + } + + uint32_t maxSharedMemorySize = 0; + for(cIt = mCloths.begin(); cIt != cEnd; ++cIt) + { + CuCloth& cloth = **cIt; + + uint32_t sharedMemorySize = cloth.mSharedMemorySize; + uint32_t positionsSize = cloth.mNumParticles * sizeof(PxVec4); + + uint32_t numSharedPositions = PxMin(2u, (mSharedMemoryLimit - sharedMemorySize) / positionsSize); + + maxSharedMemorySize = PxMax(maxSharedMemorySize, sharedMemorySize + numSharedPositions * positionsSize); + + IterationStateFactory factory(cloth, mFrameDt); + IterationState<Simd4f> state = factory.create<Simd4f>(cloth); + + mFrameData.pushBack(CuFrameData(cloth, numSharedPositions, state, mIterationDataBegin + mIterationData.size())); + + while(state.mRemainingIterations) + { + mIterationData.pushBack(CuIterationData(state)); + state.update(); + } + } + mSharedMemorySize = maxSharedMemorySize; + + // add dummy element because we read past the end + mIterationData.pushBack(CuIterationData()); + + if(&mIterationData.front() != iterationDataBegin) + { + // mIterationData grew, update pointers + iterationDataBegin = getDevicePointer(mIterationData); + + ptrdiff_t diff = (char*)iterationDataBegin - (char*)mIterationDataBegin; + CuPinnedVector<CuFrameData>::Type::Iterator fIt = mFrameData.begin(), fEnd; + for(fEnd = mFrameData.end(); fIt != fEnd; ++fIt) + reinterpret_cast<const char*&>(fIt->mIterationData) += diff; + + mIterationDataBegin = iterationDataBegin; + } +} + +void cloth::CuSolver::executeKernel() +{ + CuContextLock contextLock(mFactory); + +#if ENABLE_CUDA_PRINTF + if(cudaError result = cudaPrintfInit(mKernelModule)) + { + shdfnd::getFoundation().error(PxErrorCode::eINTERNAL_ERROR, __FILE__, __LINE__, "cudaPrintfInit() returned %u.", + result); + } +#endif + + if(mClothDataDirty) + { + PX_ASSERT(mClothDataHostCopy.size() == mClothData.size()); + size_t numBytes = mClothData.size() * sizeof(CuClothData); + checkSuccess(cuMemcpyHtoDAsync(mClothData.begin().dev(), mClothDataHostCopy.begin(), numBytes, mStream)); + mClothDataDirty = false; + } + +#if 0 + static int frame = 0; + if(++frame == 100) + record(*this); +#endif + + // launch kernel + CUresult result = cuLaunchKernel(mKernelFunction, mCloths.size(), 1, 1, mFactory.mNumThreadsPerBlock, 1, 1, + mSharedMemorySize, mStream, 0, 0); + +#if ENABLE_CUDA_PRINTF + cudaPrintfDisplay(mKernelModule); + cudaPrintfEnd(); +#endif + +#if PX_DEBUG + // in debug builds check kernel result + checkSuccess(result); + checkSuccess(cuStreamSynchronize(mStream)); +#endif + + // mark the solver as being in an error state + // all cloth instances will be migrated to software + if(result != CUDA_SUCCESS) + mCudaError = true; +} + +void cloth::CuSolver::endFrame() +{ + CuPinnedVector<CuFrameData>::Type::ConstIterator fIt = mFrameData.begin(); + ClothVector::Iterator cIt, cEnd = mCloths.end(); + for(cIt = mCloths.begin(); cIt != cEnd; ++cIt, ++fIt) + { + CuCloth& cloth = **cIt; + + cloth.mHostParticlesDirty = false; + cloth.mDeviceParticlesDirty = false; + + cloth.mMotionConstraints.pop(); + cloth.mMotionConstraints.mHostCopy.resize(0); + + cloth.mSeparationConstraints.pop(); + cloth.mSeparationConstraints.mHostCopy.resize(0); + + if(!cloth.mTargetCollisionSpheres.empty()) + { + shdfnd::swap(cloth.mStartCollisionSpheres, cloth.mTargetCollisionSpheres); + cloth.mTargetCollisionSpheres.resize(0); + } + + if(!cloth.mTargetCollisionPlanes.empty()) + { + shdfnd::swap(cloth.mStartCollisionPlanes, cloth.mTargetCollisionPlanes); + cloth.mTargetCollisionPlanes.resize(0); + } + + if(!cloth.mTargetCollisionTriangles.empty()) + { + shdfnd::swap(cloth.mStartCollisionTriangles, cloth.mTargetCollisionTriangles); + cloth.mTargetCollisionTriangles.resize(0); + } + + for(uint32_t i = 0; i < 3; ++i) + { + float upper = fIt->mParticleBounds[i * 2 + 0]; + float negativeLower = fIt->mParticleBounds[i * 2 + 1]; + cloth.mParticleBoundsCenter[i] = (upper - negativeLower) * 0.5f; + cloth.mParticleBoundsHalfExtent[i] = (upper + negativeLower) * 0.5f; + } + + cloth.mSleepPassCounter = fIt->mSleepPassCounter; + cloth.mSleepTestCounter = fIt->mSleepTestCounter; + } + + interCollision(); + + PX_PROFILE_STOP_CROSSTHREAD("cloth::CuSolver::simulate", 0); +} + +void cloth::CuSolver::interCollision() +{ + if(!mInterCollisionIterations || mInterCollisionDistance == 0.0f) + return; + + typedef SwInterCollision<Simd4f> SwInterCollision; + + // rebuild cloth instance array + mInterCollisionInstances.resize(0); + for(uint32_t i = 0, n = mCloths.size(); i < n; ++i) + { + CuCloth& cloth = *mCloths[i]; + + float elasticity = 1.0f / mFrameData[i].mNumIterations; + PX_ASSERT(!cloth.mHostParticlesDirty); + PxVec4* particles = cloth.mParticlesHostCopy.begin(); + uint32_t* indices = NULL, numIndices = cloth.mNumParticles; + if(!cloth.mSelfCollisionIndices.empty()) + { + indices = cloth.mSelfCollisionIndicesHost.begin(); + numIndices = uint32_t(cloth.mSelfCollisionIndices.size()); + } + + mInterCollisionInstances.pushBack(SwInterCollisionData( + particles, particles + cloth.mNumParticles, numIndices, indices, cloth.mTargetMotion, + cloth.mParticleBoundsCenter, cloth.mParticleBoundsHalfExtent, elasticity, cloth.mUserData)); + + cloth.mDeviceParticlesDirty = true; + } + + uint32_t requiredTempMemorySize = uint32_t( + SwInterCollision::estimateTemporaryMemory(&mInterCollisionInstances[0], mInterCollisionInstances.size())); + + // realloc temp memory if necessary + if(mInterCollisionScratchMemSize < requiredTempMemorySize) + { + if(mInterCollisionScratchMem) + PX_FREE(mInterCollisionScratchMem); + + mInterCollisionScratchMem = PX_ALLOC(requiredTempMemorySize, "cloth::SwSolver::mInterCollisionScratchMem"); + mInterCollisionScratchMemSize = requiredTempMemorySize; + } + + SwKernelAllocator allocator(mInterCollisionScratchMem, mInterCollisionScratchMemSize); + + // run inter-collision + SwInterCollision(mInterCollisionInstances.begin(), mInterCollisionInstances.size(), mInterCollisionDistance, + mInterCollisionStiffness, mInterCollisionIterations, mInterCollisionFilter, allocator)(); +} + +cloth::CuSolver::ClothSolverTask::ClothSolverTask(FunctionPtr functionPtr, const char* name) +: mSolver(0), mFunctionPtr(functionPtr), mName(name) +{ +} + +void cloth::CuSolver::ClothSolverTask::runInternal() +{ + (mSolver->*mFunctionPtr)(); +} + +const char* cloth::CuSolver::ClothSolverTask::getName() const +{ + return mName; +} |