diff options
| author | mtamis <[email protected]> | 2017-02-15 16:06:25 +0100 |
|---|---|---|
| committer | mtamis <[email protected]> | 2017-02-15 16:06:25 +0100 |
| commit | 85305930aeeb1d513e23522bd91f29ba81aa6d14 (patch) | |
| tree | 45f1bb20a45a300d1fef107e436cac95602a0e57 /NvCloth/src/cuda/CuSolver.cpp | |
| download | nvcloth-85305930aeeb1d513e23522bd91f29ba81aa6d14.tar.xz nvcloth-85305930aeeb1d513e23522bd91f29ba81aa6d14.zip | |
NvCloth library v1.0.0
Diffstat (limited to 'NvCloth/src/cuda/CuSolver.cpp')
| -rw-r--r-- | NvCloth/src/cuda/CuSolver.cpp | 677 |
1 files changed, 677 insertions, 0 deletions
diff --git a/NvCloth/src/cuda/CuSolver.cpp b/NvCloth/src/cuda/CuSolver.cpp new file mode 100644 index 0000000..7927a42 --- /dev/null +++ b/NvCloth/src/cuda/CuSolver.cpp @@ -0,0 +1,677 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#include "CuSolver.h" +#include "CuCloth.h" +#include "../ClothImpl.h" +#include "CuFabric.h" +#include "CuFactory.h" +#include "CuSolverKernel.h" +#include "CuContextLock.h" +#include "CuCheckSuccess.h" +#include "../IterationState.h" +#include <PsSort.h> +#include <foundation/PxProfiler.h> + +#if NV_NVTX +#include "nvToolsExt.h" +#endif + +#define NV_CUPTI 0 + +#if NV_CUPTI +#pragma warning(disable : 4324) +#include "cupti_activity.h" +#include "cupti_metrics.h" +#include "cupti_driver_cbid.h" +#include <cstdio> + +namespace +{ +void CUPTIAPI bufferRequested(uint8_t** buffer, size_t* size, size_t* maxNumRecords) +{ + *buffer = (uint8_t*)PX_ALIGNED16_ALLOC(*size = 32 * 1024 * 1024); + *maxNumRecords = 0; +} + +void CUPTIAPI bufferCompleted(CUcontext context, uint32_t streamId, uint8_t* buffer, size_t /*size*/, size_t validSize) +{ + CUpti_Activity* record = NULL; + uint64_t totalTime = 0, numRecords = 0; + while (CUPTI_SUCCESS == cuptiActivityGetNextRecord(buffer, validSize, &record)) + { + if (record->kind != CUPTI_ACTIVITY_KIND_KERNEL) + continue; + + CUpti_ActivityKernel3* kernel = (CUpti_ActivityKernel3*)record; + if (strcmp(kernel->name, cloth::getKernelFunctionName())) + continue; + + totalTime += kernel->end - kernel->start; + ++numRecords; + } + + if (numRecords) + { + printf("%u kernel records, average runtime is %u ns\n", unsigned(numRecords), unsigned(totalTime / numRecords)); + } + + size_t dropped; + cuptiActivityGetNumDroppedRecords(context, streamId, &dropped); + if (dropped) + { + printf("Dropped %u activity records\n", unsigned(dropped)); + } + + PX_ALIGNED16_FREE(buffer); +} + +struct CuptiEventProfiler +{ + CuptiEventProfiler() : mActiveCycles(0), mNumEvents(0) + { + CUdevice device = 0; + cuCtxGetDevice(&device); + CUcontext context = 0; + cuCtxGetCurrent(&context); + cuptiEventGetIdFromName(device, "active_cycles", &mEventId); + cuptiEventGroupCreate(context, &mEventGroup, 0); + cuptiEventGroupAddEvent(mEventGroup, mEventId); + cuptiSubscribe(&mSubscriber, eventCallback, this); + cuptiEnableCallback(1, mSubscriber, CUPTI_CB_DOMAIN_DRIVER_API, CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel); + } + + ~CuptiEventProfiler() + { + cuptiUnsubscribe(mSubscriber); + cuptiEventGroupRemoveEvent(mEventGroup, mEventId); + cuptiEventGroupDestroy(mEventGroup); + if (mNumEvents) + { + printf("%u kernel events, average active cycles is %u\n", unsigned(mNumEvents), + unsigned(mActiveCycles / mNumEvents)); + } + } + + static void CUPTIAPI + eventCallback(void* profiler, CUpti_CallbackDomain domain, CUpti_CallbackId cbid, const void* cbInfo) + { + // This callback is enabled only for launch so we shouldn't see anything else. + NV_CLOTH_ASSERT(domain == CUPTI_CB_DOMAIN_DRIVER_API); + NV_CLOTH_ASSERT(cbid == CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel); + + reinterpret_cast<CuptiEventProfiler*>(profiler) + ->eventCallback(reinterpret_cast<const CUpti_CallbackData*>(cbInfo)); + } + + void eventCallback(const CUpti_CallbackData* cbInfo) + { + // on entry, enable all the event groups being collected this pass, + // for metrics we collect for all instances of the event + if (cbInfo->callbackSite == CUPTI_API_ENTER) + { + cuCtxSynchronize(); + cuptiSetEventCollectionMode(cbInfo->context, CUPTI_EVENT_COLLECTION_MODE_KERNEL); + cuptiEventGroupEnable(mEventGroup); + } + + // on exit, read and record event values + if (cbInfo->callbackSite == CUPTI_API_EXIT) + { + cuCtxSynchronize(); + uint64_t activeCycles = 0; + size_t bytesRead = sizeof(activeCycles); + cuptiEventGroupReadEvent(mEventGroup, CUPTI_EVENT_READ_FLAG_NONE, mEventId, &bytesRead, &activeCycles); + cuptiEventGroupDisable(mEventGroup); + mActiveCycles += activeCycles; + ++mNumEvents; + } + } + + CUpti_SubscriberHandle mSubscriber; + CUpti_EventGroup mEventGroup; + CUpti_EventID mEventId; + uint64_t mActiveCycles; + uint64_t mNumEvents; +}; +} +#endif + +using namespace nv; +using namespace physx; + +const char* cloth::getKernelFunctionName() +{ + return "simulateCloths"; +} + +namespace +{ +const char* gKernelNames[] = { cloth::getKernelFunctionName(), }; + +// Note: gCuProfileZoneNames has a corresponding enum list (CuProfileZoneIds) in CuSolverKernel.h. +// Additions/deletions to gCuProfileZoneNames requires a similar action to CuProfileZoneIds. +const char* gCuProfileZoneNames[] = { + "cloth::CuSolverKernel::simulateKernel", "cloth::CuSolverKernel::integrateParticles", + "cloth::CuSolverKernel::accelerateParticles", "cloth::CuSolverKernel::applyWind", + "cloth::CuSolverKernel::constrainTether", "cloth::CuSolverKernel::solveFabric", + "cloth::CuSolverKernel::constrainMotion", "cloth::CuSolverKernel::constrainSeparation", + "cloth::CuSolverKernel::collideParticles", "cloth::CuSolverKernel::selfCollideParticles", + "cloth::CuSolverKernel::updateSleepState", "cloth::CuSolverKernel::simulateShared", + "cloth::CuSolverKernel::simulateStreamed", "cloth::CuSolverKernel::simulateGlobal", + "cloth::CuSolverKernel::solveConstraintSet", "cloth::CuCollision::buildAccleration", + "cloth::CuCollision::collideCapsules", "cloth::CuCollision::collideVirtualCapsules", + "cloth::CuCollision::collideContinuousCapsules", "cloth::CuCollision::collideConvexes", + "cloth::CuCollision::collideTriangles", "cloth::CuSelfCollision::buildAccleration", + "cloth::CuSelfCollision::collideParticles", +}; +} + +namespace +{ +template <typename T> +struct CuDeviceAllocator +{ + CuDeviceAllocator(CUcontext ctx) : mManager(ctx) + { + } + + T* allocate(size_t n) + { + CUdeviceptr result; + checkSuccess(cuMemAlloc(&result, n * sizeof(T))); + return reinterpret_cast<T*>(result); + } + + void deallocate(T* ptr) + { + checkSuccess(cuMemFree(reinterpret_cast<CUdeviceptr>(ptr))); + } + + CUcontext mManager; +}; +} + +cloth::CuSolver::CuSolver(CuFactory& factory) +: CuContextLock(factory) +, mFactory(factory) +, mClothData(mFactory.mContext) +, mClothDataHostCopy(mFactory.mContext) +, mClothDataDirty(false) +, mFrameData(mFactory.mContext) +, mIterationData(mFactory.mContext) +, mIterationDataBegin(0) +, mFrameDt(0.0f) +, mSharedMemorySize(0) +, mSharedMemoryLimit(0) +, mStream(0) +, mKernelFunction(0) +, mKernelSharedMemorySize(0) +, mClothIndex(CuDeviceAllocator<uint32_t>(mFactory.mContext).allocate(1)) +, mInterCollisionDistance(0.0f) +, mInterCollisionStiffness(1.0f) +, mInterCollisionIterations(1) +, mInterCollisionFilter(nullptr) +, mInterCollisionScratchMem(NULL) +, mInterCollisionScratchMemSize(0) +, mSimulateNvtxRangeId(0) +, mProfileBuffer(0) +, mProfileBaseId(0) +, mCudaError(false) +{ + mFactory.mSolverCount++; + + NV_CLOTH_ASSERT(CuProfileZoneIds::NUMZONES == PX_ARRAY_SIZE(gCuProfileZoneNames)); + + if (mCudaError) + { + CuContextLock::release(); + return; + } + + checkSuccess(cuStreamCreate(&mStream, 0)); + checkSuccess(cuModuleGetFunction(&mKernelFunction, mFactory.mKernelModule, getKernelFunctionName())); + + // get amount of statically allocated shared memory + checkSuccess(cuFuncGetAttribute(&mKernelSharedMemorySize, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, mKernelFunction)); + + // initialize cloth index + checkSuccess(cuMemsetD32(mClothIndex.dev(), 0, 1)); + +#if PX_CUPTI + // activity (measure kernel runtime in ns) + CUcontext context = 0; + cuCtxGetCurrent(&context); + cuptiActivityEnableContext(context, CUPTI_ACTIVITY_KIND_KERNEL); + cuptiActivityRegisterCallbacks(bufferRequested, bufferCompleted); + // event (measure kernel active cycles) + mCuptiEventProfiler = NV_CLOTH_NEW(CuptiEventProfiler); +#endif + + CuContextLock::release(); + + mSimulateProfileEventData = nullptr; +} + +cloth::CuSolver::~CuSolver() +{ + NV_CLOTH_ASSERT(mCloths.empty()); + + CuContextLock::acquire(); + +#if PX_CUPTI + cuptiActivityFlushAll(0); + cuptiActivityDisable(CUPTI_ACTIVITY_KIND_KERNEL); + NV_CLOTH_DELETE((CuptiEventProfiler*)mCuptiEventProfiler); +#endif + + CuDeviceAllocator<uint32_t>(mFactory.mContext).deallocate(mClothIndex.get()); + + if (mStream) + checkSuccess(cuStreamDestroy(mStream)); + + if (mInterCollisionScratchMem) + NV_CLOTH_FREE(mInterCollisionScratchMem); + + mFactory.mSolverCount--; +} + +void cloth::CuSolver::updateKernelData() +{ + mKernelDataHost.mClothIndex = mClothIndex.get(); + mKernelDataHost.mClothData = mClothData.begin().get(); + mKernelDataHost.mFrameData = getDevicePointer(mFrameData); + + mKernelDataHost.mProfileBuffer = mProfileBuffer; + mKernelDataHost.mProfileBaseId = mProfileBaseId; + +} + +namespace +{ +struct ClothSimCostGreater +{ + bool operator()(const cloth::CuCloth* left, const cloth::CuCloth* right) const + { + return left->mNumParticles * left->mSolverFrequency > right->mNumParticles * right->mSolverFrequency; + } +}; +} + +void cloth::CuSolver::addCloth(Cloth* cloth) +{ + CuCloth& cuCloth = static_cast<CuClothImpl&>(*cloth).mCloth; + + NV_CLOTH_ASSERT(mCloths.find(&cuCloth) == mCloths.end()); + + mCloths.pushBack(&cuCloth); + // trigger update of mClothData array + cuCloth.notifyChanged(); + + // sort cloth instances by size + shdfnd::sort(mCloths.begin(), mCloths.size(), ClothSimCostGreater(), NonTrackingAllocator()); + + CuContextLock contextLock(mFactory); + + // resize containers and update kernel data + mClothDataHostCopy.resize(mCloths.size()); + mClothData.resize(mCloths.size()); + mFrameData.resize(mCloths.size()); + updateKernelData(); +} + +void cloth::CuSolver::removeCloth(Cloth* cloth) +{ + CuCloth& cuCloth = static_cast<CuClothImpl&>(*cloth).mCloth; + + ClothVector::Iterator begin = mCloths.begin(), end = mCloths.end(); + ClothVector::Iterator it = mCloths.find(&cuCloth); + + if (it == end) + return; // not found + + uint32_t index = uint32_t(it - begin); + + mCloths.remove(index); + mClothDataHostCopy.remove(index); + mClothData.resize(mCloths.size()); + mClothDataDirty = true; +} + +bool cloth::CuSolver::beginSimulation(float dt) +{ + if (mCloths.empty()) + return false; + mFrameDt = dt; + beginFrame(); + return true; +} + +void cloth::CuSolver::simulateChunk(int idx) +{ + PX_UNUSED(idx); + NV_CLOTH_ASSERT(!mCloths.empty()); + NV_CLOTH_ASSERT(idx == 0); + CuSolver::executeKernel(); +} + +void cloth::CuSolver::endSimulation() +{ + NV_CLOTH_ASSERT(!mCloths.empty()); + CuSolver::endFrame(); +} + +int cloth::CuSolver::getSimulationChunkCount() const +{ + return 1; +} + +void cloth::CuSolver::beginFrame() +{ + CuContextLock contextLock(mFactory); + + mSimulateProfileEventData = NV_CLOTH_PROFILE_START_CROSSTHREAD("cloth::CuSolver::simulate", 0); + + CuIterationData* iterationDataBegin = mIterationData.empty() ? 0 : &mIterationData.front(); + + mFrameData.resize(0); + mIterationData.resize(0); + + // update cloth data + ClothVector::Iterator cIt, cEnd = mCloths.end(); + CuHostVector<CuClothData>::Type::Iterator dIt = mClothDataHostCopy.begin(); + for (cIt = mCloths.begin(); cIt != cEnd; ++cIt, ++dIt) + mClothDataDirty |= (*cIt)->updateClothData(*dIt); + + if (mClothDataDirty) + { + /* find optimal number of cloths per SM */ + + CUdevice device = 0; + checkSuccess(cuCtxGetDevice(&device)); + int numSMs = 0; + checkSuccess(cuDeviceGetAttribute(&numSMs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device)); + int sharedMemoryPerBlock = 0; + checkSuccess(cuDeviceGetAttribute(&sharedMemoryPerBlock, CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK, device)); + + // at least 192 threads per block (e.g. CuCollision::buildAcceleration) + uint32_t maxClothsPerSM = std::min(mFactory.mMaxThreadsPerBlock / 192, uint32_t(mCloths.size() + numSMs - 1) / numSMs); + + // tuning parameters: relative performance per numSharedPositions + float weights[3] = { 0.4f, 0.8f, 1.0f }; //TODO check if these are the newest weights (APEX has different values) + + // try all possible number of cloths per SM and estimate performance + float maxWeightSum = 0.0f; + uint32_t numClothsPerSM = 0; + for (uint32_t i = 1; i <= maxClothsPerSM; ++i) + { + uint32_t sharedMemoryLimit = (sharedMemoryPerBlock / i) - mKernelSharedMemorySize; + + float weightSum = 0.0f; + for (cIt = mCloths.begin(); cIt != cEnd; ++cIt) + { + uint32_t sharedMemorySize = (*cIt)->mSharedMemorySize; + uint32_t positionsSize = (*cIt)->mNumParticles * sizeof(PxVec4); + + if (sharedMemorySize > sharedMemoryLimit) + break; + + uint32_t numSharedPositions = std::min(2u, (sharedMemoryLimit - sharedMemorySize) / positionsSize); + + weightSum += weights[numSharedPositions] * positionsSize; + } + // tuning parameter: inverse performance for running i cloths per SM + weightSum *= 2.0f + i; + + if (cIt == cEnd && weightSum > maxWeightSum) + { + maxWeightSum = weightSum; + numClothsPerSM = i; + } + } + NV_CLOTH_ASSERT(numClothsPerSM); + + // update block size + uint32_t numThreadsPerBlock = mFactory.mMaxThreadsPerBlock / numClothsPerSM & ~31; + if (mFactory.mNumThreadsPerBlock != numThreadsPerBlock) + { + checkSuccess( + cuFuncSetBlockShape(mKernelFunction, int(mFactory.mNumThreadsPerBlock = numThreadsPerBlock), 1, 1)); + } + + // remember num cloths per SM in terms of max shared memory per block + mSharedMemoryLimit = (sharedMemoryPerBlock / numClothsPerSM) - mKernelSharedMemorySize; + } + + uint32_t maxSharedMemorySize = 0; + for (cIt = mCloths.begin(); cIt != cEnd; ++cIt) + { + CuCloth& cloth = **cIt; + + uint32_t sharedMemorySize = cloth.mSharedMemorySize; + uint32_t positionsSize = cloth.mNumParticles * sizeof(PxVec4); + + uint32_t numSharedPositions = std::min(2u, (mSharedMemoryLimit - sharedMemorySize) / positionsSize); + + maxSharedMemorySize = std::max(maxSharedMemorySize, sharedMemorySize + numSharedPositions * positionsSize); + + IterationStateFactory factory(cloth, mFrameDt); + IterationState<Simd4f> state = factory.create<Simd4f>(cloth); + + mFrameData.pushBack(CuFrameData(cloth, numSharedPositions, state, mIterationDataBegin + mIterationData.size())); + + while (state.mRemainingIterations) + { + mIterationData.pushBack(CuIterationData(state)); + state.update(); + } + } + mSharedMemorySize = maxSharedMemorySize; + + // add dummy element because we read past the end + mIterationData.pushBack(CuIterationData()); + + if (&mIterationData.front() != iterationDataBegin) + { + // mIterationData grew, update pointers + iterationDataBegin = getDevicePointer(mIterationData); + + ptrdiff_t diff = (char*)iterationDataBegin - (char*)mIterationDataBegin; + CuHostVector<CuFrameData>::Type::Iterator fIt = mFrameData.begin(), fEnd; + for (fEnd = mFrameData.end(); fIt != fEnd; ++fIt) + reinterpret_cast<const char*&>(fIt->mIterationData) += diff; + + mIterationDataBegin = iterationDataBegin; + } +} + +void CUDA_CB cloth::CuSolver::KernelFinished(CUstream stream, CUresult status, void *userData) +{ + PX_UNUSED(stream); + PX_UNUSED(status); + //static_cast<CuSolver*>(userData)->mEndSimulationTask.removeReference(); + PX_UNUSED(userData); +} + +void cloth::CuSolver::executeKernel() +{ + CuContextLock contextLock(mFactory); + +/*#if PX_PROFILE //We don't have a gpu distapcher anymore + // Note: The profile buffer is valid only within the cuda launch context + void* profileBuffer = getDispatcher().getCurrentProfileBuffer(); + if (mProfileBuffer != profileBuffer && mProfileBaseId + 1) + { + mProfileBuffer = profileBuffer; + updateKernelData(); + } +#endif*/ + + if (mClothDataDirty) + { + NV_CLOTH_ASSERT(mClothDataHostCopy.size() == mClothData.size()); + size_t numBytes = mClothData.size() * sizeof(CuClothData); + checkSuccess(cuMemcpyHtoDAsync(mClothData.begin().dev(), mClothDataHostCopy.begin(), numBytes, mStream)); + mClothDataDirty = false; + } + +#if 0 + static int frame = 0; + if (++frame == 100) + record(*this); +#endif + + void* Arguments[] = {(void*)&mKernelDataHost}; + + // launch kernel + CUresult result = cuLaunchKernel(mKernelFunction, uint32_t(mCloths.size()), 1, 1, + mFactory.mNumThreadsPerBlock, 1, 1, mSharedMemorySize, mStream, Arguments, 0); + cuStreamAddCallback(mStream, &cloth::CuSolver::KernelFinished, this, 0); + + +#if PX_DEBUG + // in debug builds check kernel result + checkSuccess(result); + checkSuccess(cuStreamSynchronize(mStream)); +#endif + + // mark the solver as being in an error state + // all cloth instances should be migrated to software + if (result != CUDA_SUCCESS) + mCudaError = true; +} + +void cloth::CuSolver::endFrame() +{ + checkSuccess(cuStreamSynchronize(mStream)); + + CuHostVector<CuFrameData>::Type::ConstIterator fIt = mFrameData.begin(); + ClothVector::Iterator cIt, cEnd = mCloths.end(); + for (cIt = mCloths.begin(); cIt != cEnd; ++cIt, ++fIt) + { + CuCloth& cloth = **cIt; + + cloth.mHostParticlesDirty = false; + cloth.mDeviceParticlesDirty = false; + + cloth.mMotionConstraints.pop(); + cloth.mMotionConstraints.mHostCopy.resize(0); + + cloth.mSeparationConstraints.pop(); + cloth.mSeparationConstraints.mHostCopy.resize(0); + + if (!cloth.mTargetCollisionSpheres.empty()) + { + shdfnd::swap(cloth.mStartCollisionSpheres, cloth.mTargetCollisionSpheres); + cloth.mTargetCollisionSpheres.resize(0); + } + + if (!cloth.mTargetCollisionPlanes.empty()) + { + shdfnd::swap(cloth.mStartCollisionPlanes, cloth.mTargetCollisionPlanes); + cloth.mTargetCollisionPlanes.resize(0); + } + + if (!cloth.mTargetCollisionTriangles.empty()) + { + shdfnd::swap(cloth.mStartCollisionTriangles, cloth.mTargetCollisionTriangles); + cloth.mTargetCollisionTriangles.resize(0); + } + + for (uint32_t i = 0; i < 3; ++i) + { + float upper = fIt->mParticleBounds[i * 2 + 0]; + float negativeLower = fIt->mParticleBounds[i * 2 + 1]; + cloth.mParticleBoundsCenter[i] = (upper - negativeLower) * 0.5f; + cloth.mParticleBoundsHalfExtent[i] = (upper + negativeLower) * 0.5f; + } + + cloth.mSleepPassCounter = fIt->mSleepPassCounter; + cloth.mSleepTestCounter = fIt->mSleepTestCounter; + } + + interCollision(); + + NV_CLOTH_PROFILE_STOP_CROSSTHREAD(mSimulateProfileEventData, "cloth::CuSolver::simulate", 0); +} + +void cloth::CuSolver::interCollision() +{ + if (!mInterCollisionIterations || mInterCollisionDistance == 0.0f) + return; + if (mInterCollisionFilter == nullptr) + { + NV_CLOTH_LOG_WARNING("Inter collision will not work unless an inter collision filter is set using Solver::setInterCollisionFilter."); + return; + } + + typedef SwInterCollision<Simd4f> SwInterCollision; + + // rebuild cloth instance array + mInterCollisionInstances.resize(0); + for (uint32_t i = 0, n = mCloths.size(); i < n; ++i) + { + CuCloth& cloth = *mCloths[i]; + + float elasticity = 1.0f / mFrameData[i].mNumIterations; + NV_CLOTH_ASSERT(!cloth.mHostParticlesDirty); + PxVec4* particles = cloth.mParticlesHostCopy.begin(); + uint32_t* indices = NULL, numIndices = cloth.mNumParticles; + if (!cloth.mSelfCollisionIndices.empty()) + { + indices = cloth.mSelfCollisionIndicesHost.begin(); + numIndices = uint32_t(cloth.mSelfCollisionIndices.size()); + } + + mInterCollisionInstances.pushBack(SwInterCollisionData( + particles, particles + cloth.mNumParticles, numIndices, indices, cloth.mTargetMotion, + cloth.mParticleBoundsCenter, cloth.mParticleBoundsHalfExtent, elasticity, cloth.mUserData)); + + cloth.mDeviceParticlesDirty = true; + } + + uint32_t requiredTempMemorySize = uint32_t(SwInterCollision::estimateTemporaryMemory( + &mInterCollisionInstances[0], uint32_t(mInterCollisionInstances.size()))); + + // realloc temp memory if necessary + if (mInterCollisionScratchMemSize < requiredTempMemorySize) + { + if (mInterCollisionScratchMem) + NV_CLOTH_FREE(mInterCollisionScratchMem); + + mInterCollisionScratchMem = NV_CLOTH_ALLOC(requiredTempMemorySize, "cloth::SwSolver::mInterCollisionScratchMem"); + mInterCollisionScratchMemSize = requiredTempMemorySize; + } + + SwKernelAllocator allocator(mInterCollisionScratchMem, mInterCollisionScratchMemSize); + + // run inter-collision + SwInterCollision(mInterCollisionInstances.begin(), mInterCollisionInstances.size(), mInterCollisionDistance, + mInterCollisionStiffness, mInterCollisionIterations, mInterCollisionFilter, allocator)(); +}
\ No newline at end of file |