aboutsummaryrefslogtreecommitdiff
path: root/NvCloth/src/cuda/CuSolver.cpp
diff options
context:
space:
mode:
authormtamis <[email protected]>2017-02-15 16:06:25 +0100
committermtamis <[email protected]>2017-02-15 16:06:25 +0100
commit85305930aeeb1d513e23522bd91f29ba81aa6d14 (patch)
tree45f1bb20a45a300d1fef107e436cac95602a0e57 /NvCloth/src/cuda/CuSolver.cpp
downloadnvcloth-85305930aeeb1d513e23522bd91f29ba81aa6d14.tar.xz
nvcloth-85305930aeeb1d513e23522bd91f29ba81aa6d14.zip
NvCloth library v1.0.0
Diffstat (limited to 'NvCloth/src/cuda/CuSolver.cpp')
-rw-r--r--NvCloth/src/cuda/CuSolver.cpp677
1 files changed, 677 insertions, 0 deletions
diff --git a/NvCloth/src/cuda/CuSolver.cpp b/NvCloth/src/cuda/CuSolver.cpp
new file mode 100644
index 0000000..7927a42
--- /dev/null
+++ b/NvCloth/src/cuda/CuSolver.cpp
@@ -0,0 +1,677 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#include "CuSolver.h"
+#include "CuCloth.h"
+#include "../ClothImpl.h"
+#include "CuFabric.h"
+#include "CuFactory.h"
+#include "CuSolverKernel.h"
+#include "CuContextLock.h"
+#include "CuCheckSuccess.h"
+#include "../IterationState.h"
+#include <PsSort.h>
+#include <foundation/PxProfiler.h>
+
+#if NV_NVTX
+#include "nvToolsExt.h"
+#endif
+
+#define NV_CUPTI 0
+
+#if NV_CUPTI
+#pragma warning(disable : 4324)
+#include "cupti_activity.h"
+#include "cupti_metrics.h"
+#include "cupti_driver_cbid.h"
+#include <cstdio>
+
+namespace
+{
+void CUPTIAPI bufferRequested(uint8_t** buffer, size_t* size, size_t* maxNumRecords)
+{
+ *buffer = (uint8_t*)PX_ALIGNED16_ALLOC(*size = 32 * 1024 * 1024);
+ *maxNumRecords = 0;
+}
+
+void CUPTIAPI bufferCompleted(CUcontext context, uint32_t streamId, uint8_t* buffer, size_t /*size*/, size_t validSize)
+{
+ CUpti_Activity* record = NULL;
+ uint64_t totalTime = 0, numRecords = 0;
+ while (CUPTI_SUCCESS == cuptiActivityGetNextRecord(buffer, validSize, &record))
+ {
+ if (record->kind != CUPTI_ACTIVITY_KIND_KERNEL)
+ continue;
+
+ CUpti_ActivityKernel3* kernel = (CUpti_ActivityKernel3*)record;
+ if (strcmp(kernel->name, cloth::getKernelFunctionName()))
+ continue;
+
+ totalTime += kernel->end - kernel->start;
+ ++numRecords;
+ }
+
+ if (numRecords)
+ {
+ printf("%u kernel records, average runtime is %u ns\n", unsigned(numRecords), unsigned(totalTime / numRecords));
+ }
+
+ size_t dropped;
+ cuptiActivityGetNumDroppedRecords(context, streamId, &dropped);
+ if (dropped)
+ {
+ printf("Dropped %u activity records\n", unsigned(dropped));
+ }
+
+ PX_ALIGNED16_FREE(buffer);
+}
+
// Measures average "active_cycles" per launch of the solver kernel via the
// CUPTI event API. Construction subscribes to the cuLaunchKernel driver
// callback; destruction unsubscribes and prints the accumulated average.
struct CuptiEventProfiler
{
	CuptiEventProfiler() : mActiveCycles(0), mNumEvents(0)
	{
		// look up the active_cycles event on the current device/context and
		// attach it to a new event group
		CUdevice device = 0;
		cuCtxGetDevice(&device);
		CUcontext context = 0;
		cuCtxGetCurrent(&context);
		cuptiEventGetIdFromName(device, "active_cycles", &mEventId);
		cuptiEventGroupCreate(context, &mEventGroup, 0);
		cuptiEventGroupAddEvent(mEventGroup, mEventId);
		// get called around every cuLaunchKernel driver-API invocation
		cuptiSubscribe(&mSubscriber, eventCallback, this);
		cuptiEnableCallback(1, mSubscriber, CUPTI_CB_DOMAIN_DRIVER_API, CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel);
	}

	~CuptiEventProfiler()
	{
		// tear down in reverse order of construction, then report
		cuptiUnsubscribe(mSubscriber);
		cuptiEventGroupRemoveEvent(mEventGroup, mEventId);
		cuptiEventGroupDestroy(mEventGroup);
		if (mNumEvents)
		{
			printf("%u kernel events, average active cycles is %u\n", unsigned(mNumEvents),
			       unsigned(mActiveCycles / mNumEvents));
		}
	}

	// Static trampoline registered with cuptiSubscribe; forwards to the
	// member-function overload below.
	static void CUPTIAPI
	eventCallback(void* profiler, CUpti_CallbackDomain domain, CUpti_CallbackId cbid, const void* cbInfo)
	{
		// This callback is enabled only for launch so we shouldn't see anything else.
		NV_CLOTH_ASSERT(domain == CUPTI_CB_DOMAIN_DRIVER_API);
		NV_CLOTH_ASSERT(cbid == CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel);

		reinterpret_cast<CuptiEventProfiler*>(profiler)
		    ->eventCallback(reinterpret_cast<const CUpti_CallbackData*>(cbInfo));
	}

	void eventCallback(const CUpti_CallbackData* cbInfo)
	{
		// on entry, enable all the event groups being collected this pass,
		// for metrics we collect for all instances of the event
		if (cbInfo->callbackSite == CUPTI_API_ENTER)
		{
			cuCtxSynchronize();
			cuptiSetEventCollectionMode(cbInfo->context, CUPTI_EVENT_COLLECTION_MODE_KERNEL);
			cuptiEventGroupEnable(mEventGroup);
		}

		// on exit, read and record event values
		if (cbInfo->callbackSite == CUPTI_API_EXIT)
		{
			// synchronize so the kernel has actually run before reading the counter
			cuCtxSynchronize();
			uint64_t activeCycles = 0;
			size_t bytesRead = sizeof(activeCycles);
			cuptiEventGroupReadEvent(mEventGroup, CUPTI_EVENT_READ_FLAG_NONE, mEventId, &bytesRead, &activeCycles);
			cuptiEventGroupDisable(mEventGroup);
			mActiveCycles += activeCycles;
			++mNumEvents;
		}
	}

	CUpti_SubscriberHandle mSubscriber; // driver-API callback subscription
	CUpti_EventGroup mEventGroup;       // group holding the single event below
	CUpti_EventID mEventId;             // id of the "active_cycles" event
	uint64_t mActiveCycles;             // sum of active cycles over all launches
	uint64_t mNumEvents;                // number of launches measured
};
+}
+#endif
+
+using namespace nv;
+using namespace physx;
+
+const char* cloth::getKernelFunctionName()
+{
+ return "simulateCloths";
+}
+
namespace
{
// Kernel entry points resolved in the CUDA module (currently just the solver).
const char* gKernelNames[] = { cloth::getKernelFunctionName(), };

// Note: gCuProfileZoneNames has a corresponding enum list (CuProfileZoneIds) in CuSolverKernel.h.
// Additions/deletions to gCuProfileZoneNames requires a similar action to CuProfileZoneIds.
// The size match is asserted in the CuSolver constructor.
const char* gCuProfileZoneNames[] = {
	"cloth::CuSolverKernel::simulateKernel", "cloth::CuSolverKernel::integrateParticles",
	"cloth::CuSolverKernel::accelerateParticles", "cloth::CuSolverKernel::applyWind",
	"cloth::CuSolverKernel::constrainTether", "cloth::CuSolverKernel::solveFabric",
	"cloth::CuSolverKernel::constrainMotion", "cloth::CuSolverKernel::constrainSeparation",
	"cloth::CuSolverKernel::collideParticles", "cloth::CuSolverKernel::selfCollideParticles",
	"cloth::CuSolverKernel::updateSleepState", "cloth::CuSolverKernel::simulateShared",
	"cloth::CuSolverKernel::simulateStreamed", "cloth::CuSolverKernel::simulateGlobal",
	"cloth::CuSolverKernel::solveConstraintSet", "cloth::CuCollision::buildAccleration",
	"cloth::CuCollision::collideCapsules", "cloth::CuCollision::collideVirtualCapsules",
	"cloth::CuCollision::collideContinuousCapsules", "cloth::CuCollision::collideConvexes",
	"cloth::CuCollision::collideTriangles", "cloth::CuSelfCollision::buildAccleration",
	"cloth::CuSelfCollision::collideParticles",
};
}
+
namespace
{
// Minimal allocator for raw CUDA device memory, used for small one-off
// allocations (e.g. the solver's global cloth index). Only pairs cuMemAlloc
// with cuMemFree; it is not a full standard-allocator implementation.
template <typename T>
struct CuDeviceAllocator
{
	CuDeviceAllocator(CUcontext ctx) : mManager(ctx)
	{
	}

	// Allocates device memory for n elements of T and returns it as a T*.
	T* allocate(size_t n)
	{
		CUdeviceptr result;
		checkSuccess(cuMemAlloc(&result, n * sizeof(T)));
		return reinterpret_cast<T*>(result);
	}

	// Frees memory previously returned by allocate().
	void deallocate(T* ptr)
	{
		checkSuccess(cuMemFree(reinterpret_cast<CUdeviceptr>(ptr)));
	}

	// NOTE(review): stored context is never used by allocate/deallocate here —
	// the calls operate on the current context; confirm whether it is needed.
	CUcontext mManager;
};
}
+
+cloth::CuSolver::CuSolver(CuFactory& factory)
+: CuContextLock(factory)
+, mFactory(factory)
+, mClothData(mFactory.mContext)
+, mClothDataHostCopy(mFactory.mContext)
+, mClothDataDirty(false)
+, mFrameData(mFactory.mContext)
+, mIterationData(mFactory.mContext)
+, mIterationDataBegin(0)
+, mFrameDt(0.0f)
+, mSharedMemorySize(0)
+, mSharedMemoryLimit(0)
+, mStream(0)
+, mKernelFunction(0)
+, mKernelSharedMemorySize(0)
+, mClothIndex(CuDeviceAllocator<uint32_t>(mFactory.mContext).allocate(1))
+, mInterCollisionDistance(0.0f)
+, mInterCollisionStiffness(1.0f)
+, mInterCollisionIterations(1)
+, mInterCollisionFilter(nullptr)
+, mInterCollisionScratchMem(NULL)
+, mInterCollisionScratchMemSize(0)
+, mSimulateNvtxRangeId(0)
+, mProfileBuffer(0)
+, mProfileBaseId(0)
+, mCudaError(false)
+{
+ mFactory.mSolverCount++;
+
+ NV_CLOTH_ASSERT(CuProfileZoneIds::NUMZONES == PX_ARRAY_SIZE(gCuProfileZoneNames));
+
+ if (mCudaError)
+ {
+ CuContextLock::release();
+ return;
+ }
+
+ checkSuccess(cuStreamCreate(&mStream, 0));
+ checkSuccess(cuModuleGetFunction(&mKernelFunction, mFactory.mKernelModule, getKernelFunctionName()));
+
+ // get amount of statically allocated shared memory
+ checkSuccess(cuFuncGetAttribute(&mKernelSharedMemorySize, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, mKernelFunction));
+
+ // initialize cloth index
+ checkSuccess(cuMemsetD32(mClothIndex.dev(), 0, 1));
+
+#if PX_CUPTI
+ // activity (measure kernel runtime in ns)
+ CUcontext context = 0;
+ cuCtxGetCurrent(&context);
+ cuptiActivityEnableContext(context, CUPTI_ACTIVITY_KIND_KERNEL);
+ cuptiActivityRegisterCallbacks(bufferRequested, bufferCompleted);
+ // event (measure kernel active cycles)
+ mCuptiEventProfiler = NV_CLOTH_NEW(CuptiEventProfiler);
+#endif
+
+ CuContextLock::release();
+
+ mSimulateProfileEventData = nullptr;
+}
+
+cloth::CuSolver::~CuSolver()
+{
+ NV_CLOTH_ASSERT(mCloths.empty());
+
+ CuContextLock::acquire();
+
+#if PX_CUPTI
+ cuptiActivityFlushAll(0);
+ cuptiActivityDisable(CUPTI_ACTIVITY_KIND_KERNEL);
+ NV_CLOTH_DELETE((CuptiEventProfiler*)mCuptiEventProfiler);
+#endif
+
+ CuDeviceAllocator<uint32_t>(mFactory.mContext).deallocate(mClothIndex.get());
+
+ if (mStream)
+ checkSuccess(cuStreamDestroy(mStream));
+
+ if (mInterCollisionScratchMem)
+ NV_CLOTH_FREE(mInterCollisionScratchMem);
+
+ mFactory.mSolverCount--;
+}
+
+void cloth::CuSolver::updateKernelData()
+{
+ mKernelDataHost.mClothIndex = mClothIndex.get();
+ mKernelDataHost.mClothData = mClothData.begin().get();
+ mKernelDataHost.mFrameData = getDevicePointer(mFrameData);
+
+ mKernelDataHost.mProfileBuffer = mProfileBuffer;
+ mKernelDataHost.mProfileBaseId = mProfileBaseId;
+
+}
+
+namespace
+{
+struct ClothSimCostGreater
+{
+ bool operator()(const cloth::CuCloth* left, const cloth::CuCloth* right) const
+ {
+ return left->mNumParticles * left->mSolverFrequency > right->mNumParticles * right->mSolverFrequency;
+ }
+};
+}
+
+void cloth::CuSolver::addCloth(Cloth* cloth)
+{
+ CuCloth& cuCloth = static_cast<CuClothImpl&>(*cloth).mCloth;
+
+ NV_CLOTH_ASSERT(mCloths.find(&cuCloth) == mCloths.end());
+
+ mCloths.pushBack(&cuCloth);
+ // trigger update of mClothData array
+ cuCloth.notifyChanged();
+
+ // sort cloth instances by size
+ shdfnd::sort(mCloths.begin(), mCloths.size(), ClothSimCostGreater(), NonTrackingAllocator());
+
+ CuContextLock contextLock(mFactory);
+
+ // resize containers and update kernel data
+ mClothDataHostCopy.resize(mCloths.size());
+ mClothData.resize(mCloths.size());
+ mFrameData.resize(mCloths.size());
+ updateKernelData();
+}
+
+void cloth::CuSolver::removeCloth(Cloth* cloth)
+{
+ CuCloth& cuCloth = static_cast<CuClothImpl&>(*cloth).mCloth;
+
+ ClothVector::Iterator begin = mCloths.begin(), end = mCloths.end();
+ ClothVector::Iterator it = mCloths.find(&cuCloth);
+
+ if (it == end)
+ return; // not found
+
+ uint32_t index = uint32_t(it - begin);
+
+ mCloths.remove(index);
+ mClothDataHostCopy.remove(index);
+ mClothData.resize(mCloths.size());
+ mClothDataDirty = true;
+}
+
+bool cloth::CuSolver::beginSimulation(float dt)
+{
+ if (mCloths.empty())
+ return false;
+ mFrameDt = dt;
+ beginFrame();
+ return true;
+}
+
+void cloth::CuSolver::simulateChunk(int idx)
+{
+ PX_UNUSED(idx);
+ NV_CLOTH_ASSERT(!mCloths.empty());
+ NV_CLOTH_ASSERT(idx == 0);
+ CuSolver::executeKernel();
+}
+
+void cloth::CuSolver::endSimulation()
+{
+ NV_CLOTH_ASSERT(!mCloths.empty());
+ CuSolver::endFrame();
+}
+
+int cloth::CuSolver::getSimulationChunkCount() const
+{
+ return 1;
+}
+
// Prepares per-frame data before the kernel launch:
// - refreshes each cloth's CuClothData and, when anything changed, retunes
//   the kernel block size and per-cloth shared-memory budget for the current
//   set of cloths,
// - builds CuFrameData and per-iteration CuIterationData for every cloth,
// - patches device pointers inside the frame data if mIterationData moved.
void cloth::CuSolver::beginFrame()
{
	CuContextLock contextLock(mFactory);

	mSimulateProfileEventData = NV_CLOTH_PROFILE_START_CROSSTHREAD("cloth::CuSolver::simulate", 0);

	// remember the old buffer address so a reallocation can be detected below
	CuIterationData* iterationDataBegin = mIterationData.empty() ? 0 : &mIterationData.front();

	mFrameData.resize(0);
	mIterationData.resize(0);

	// update cloth data
	ClothVector::Iterator cIt, cEnd = mCloths.end();
	CuHostVector<CuClothData>::Type::Iterator dIt = mClothDataHostCopy.begin();
	for (cIt = mCloths.begin(); cIt != cEnd; ++cIt, ++dIt)
		mClothDataDirty |= (*cIt)->updateClothData(*dIt);

	if (mClothDataDirty)
	{
		/* find optimal number of cloths per SM */

		CUdevice device = 0;
		checkSuccess(cuCtxGetDevice(&device));
		int numSMs = 0;
		checkSuccess(cuDeviceGetAttribute(&numSMs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device));
		int sharedMemoryPerBlock = 0;
		checkSuccess(cuDeviceGetAttribute(&sharedMemoryPerBlock, CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK, device));

		// at least 192 threads per block (e.g. CuCollision::buildAcceleration)
		uint32_t maxClothsPerSM = std::min(mFactory.mMaxThreadsPerBlock / 192, uint32_t(mCloths.size() + numSMs - 1) / numSMs);

		// tuning parameters: relative performance per numSharedPositions
		float weights[3] = { 0.4f, 0.8f, 1.0f }; //TODO check if these are the newest weights (APEX has different values)

		// try all possible number of cloths per SM and estimate performance
		float maxWeightSum = 0.0f;
		uint32_t numClothsPerSM = 0;
		for (uint32_t i = 1; i <= maxClothsPerSM; ++i)
		{
			// shared memory available to a single cloth when i cloths share an SM
			uint32_t sharedMemoryLimit = (sharedMemoryPerBlock / i) - mKernelSharedMemorySize;

			float weightSum = 0.0f;
			for (cIt = mCloths.begin(); cIt != cEnd; ++cIt)
			{
				uint32_t sharedMemorySize = (*cIt)->mSharedMemorySize;
				uint32_t positionsSize = (*cIt)->mNumParticles * sizeof(PxVec4);

				// cloth does not fit at this occupancy: candidate i is invalid
				if (sharedMemorySize > sharedMemoryLimit)
					break;

				// how many particle position arrays (0..2) also fit in shared memory
				uint32_t numSharedPositions = std::min(2u, (sharedMemoryLimit - sharedMemorySize) / positionsSize);

				weightSum += weights[numSharedPositions] * positionsSize;
			}
			// tuning parameter: inverse performance for running i cloths per SM
			weightSum *= 2.0f + i;

			// accept only if every cloth fit (inner loop ran to completion)
			if (cIt == cEnd && weightSum > maxWeightSum)
			{
				maxWeightSum = weightSum;
				numClothsPerSM = i;
			}
		}
		NV_CLOTH_ASSERT(numClothsPerSM);

		// update block size (note: '/' binds before '&', so this rounds the
		// per-cloth thread count down to a multiple of the 32-thread warp)
		uint32_t numThreadsPerBlock = mFactory.mMaxThreadsPerBlock / numClothsPerSM & ~31;
		if (mFactory.mNumThreadsPerBlock != numThreadsPerBlock)
		{
			checkSuccess(
			    cuFuncSetBlockShape(mKernelFunction, int(mFactory.mNumThreadsPerBlock = numThreadsPerBlock), 1, 1));
		}

		// remember num cloths per SM in terms of max shared memory per block
		mSharedMemoryLimit = (sharedMemoryPerBlock / numClothsPerSM) - mKernelSharedMemorySize;
	}

	// build frame and iteration data, tracking the largest dynamic
	// shared-memory request for the launch
	uint32_t maxSharedMemorySize = 0;
	for (cIt = mCloths.begin(); cIt != cEnd; ++cIt)
	{
		CuCloth& cloth = **cIt;

		uint32_t sharedMemorySize = cloth.mSharedMemorySize;
		uint32_t positionsSize = cloth.mNumParticles * sizeof(PxVec4);

		uint32_t numSharedPositions = std::min(2u, (mSharedMemoryLimit - sharedMemorySize) / positionsSize);

		maxSharedMemorySize = std::max(maxSharedMemorySize, sharedMemorySize + numSharedPositions * positionsSize);

		IterationStateFactory factory(cloth, mFrameDt);
		IterationState<Simd4f> state = factory.create<Simd4f>(cloth);

		mFrameData.pushBack(CuFrameData(cloth, numSharedPositions, state, mIterationDataBegin + mIterationData.size()));

		// one CuIterationData entry per solver iteration of this frame
		while (state.mRemainingIterations)
		{
			mIterationData.pushBack(CuIterationData(state));
			state.update();
		}
	}
	mSharedMemorySize = maxSharedMemorySize;

	// add dummy element because we read past the end
	mIterationData.pushBack(CuIterationData());

	if (&mIterationData.front() != iterationDataBegin)
	{
		// mIterationData grew, update pointers
		iterationDataBegin = getDevicePointer(mIterationData);

		// shift each frame's iteration pointer by the buffer relocation offset
		ptrdiff_t diff = (char*)iterationDataBegin - (char*)mIterationDataBegin;
		CuHostVector<CuFrameData>::Type::Iterator fIt = mFrameData.begin(), fEnd;
		for (fEnd = mFrameData.end(); fIt != fEnd; ++fIt)
			reinterpret_cast<const char*&>(fIt->mIterationData) += diff;

		mIterationDataBegin = iterationDataBegin;
	}
}
+
+void CUDA_CB cloth::CuSolver::KernelFinished(CUstream stream, CUresult status, void *userData)
+{
+ PX_UNUSED(stream);
+ PX_UNUSED(status);
+ //static_cast<CuSolver*>(userData)->mEndSimulationTask.removeReference();
+ PX_UNUSED(userData);
+}
+
// Uploads dirty cloth data and launches the solver kernel on mStream: one
// block per cloth, block size and dynamic shared-memory size as computed in
// beginFrame(). A failed launch puts the solver into an error state.
void cloth::CuSolver::executeKernel()
{
	CuContextLock contextLock(mFactory);

/*#if PX_PROFILE //We don't have a gpu distapcher anymore
	// Note: The profile buffer is valid only within the cuda launch context
	void* profileBuffer = getDispatcher().getCurrentProfileBuffer();
	if (mProfileBuffer != profileBuffer && mProfileBaseId + 1)
	{
		mProfileBuffer = profileBuffer;
		updateKernelData();
	}
#endif*/

	// copy cloth data to the device only when something changed
	if (mClothDataDirty)
	{
		NV_CLOTH_ASSERT(mClothDataHostCopy.size() == mClothData.size());
		size_t numBytes = mClothData.size() * sizeof(CuClothData);
		checkSuccess(cuMemcpyHtoDAsync(mClothData.begin().dev(), mClothDataHostCopy.begin(), numBytes, mStream));
		mClothDataDirty = false;
	}

#if 0
	// debugging aid (disabled): capture a repro on frame 100
	static int frame = 0;
	if (++frame == 100)
		record(*this);
#endif

	// single kernel parameter: pointer to the host-side argument struct
	void* Arguments[] = {(void*)&mKernelDataHost};

	// launch kernel
	CUresult result = cuLaunchKernel(mKernelFunction, uint32_t(mCloths.size()), 1, 1,
	                                 mFactory.mNumThreadsPerBlock, 1, 1, mSharedMemorySize, mStream, Arguments, 0);
	cuStreamAddCallback(mStream, &cloth::CuSolver::KernelFinished, this, 0);


#if PX_DEBUG
	// in debug builds check kernel result
	checkSuccess(result);
	checkSuccess(cuStreamSynchronize(mStream));
#endif

	// mark the solver as being in an error state
	// all cloth instances should be migrated to software
	if (result != CUDA_SUCCESS)
		mCudaError = true;
}
+
// Waits for the solver kernel to finish, then copies per-frame results back
// into each cloth: dirty flags, constraint double-buffers, collision-shape
// swaps, particle bounds and sleep counters. Finally runs CPU inter-cloth
// collision and closes the cross-thread profile zone opened in beginFrame().
void cloth::CuSolver::endFrame()
{
	checkSuccess(cuStreamSynchronize(mStream));

	// mFrameData entries are paired 1:1 with mCloths (filled in beginFrame)
	CuHostVector<CuFrameData>::Type::ConstIterator fIt = mFrameData.begin();
	ClothVector::Iterator cIt, cEnd = mCloths.end();
	for (cIt = mCloths.begin(); cIt != cEnd; ++cIt, ++fIt)
	{
		CuCloth& cloth = **cIt;

		cloth.mHostParticlesDirty = false;
		cloth.mDeviceParticlesDirty = false;

		// advance constraint double-buffers and drop the host staging copies
		cloth.mMotionConstraints.pop();
		cloth.mMotionConstraints.mHostCopy.resize(0);

		cloth.mSeparationConstraints.pop();
		cloth.mSeparationConstraints.mHostCopy.resize(0);

		// consumed target collision shapes become the new start shapes
		if (!cloth.mTargetCollisionSpheres.empty())
		{
			shdfnd::swap(cloth.mStartCollisionSpheres, cloth.mTargetCollisionSpheres);
			cloth.mTargetCollisionSpheres.resize(0);
		}

		if (!cloth.mTargetCollisionPlanes.empty())
		{
			shdfnd::swap(cloth.mStartCollisionPlanes, cloth.mTargetCollisionPlanes);
			cloth.mTargetCollisionPlanes.resize(0);
		}

		if (!cloth.mTargetCollisionTriangles.empty())
		{
			shdfnd::swap(cloth.mStartCollisionTriangles, cloth.mTargetCollisionTriangles);
			cloth.mTargetCollisionTriangles.resize(0);
		}

		// mParticleBounds apparently stores (upper, -lower) per axis — the
		// negated lower bound makes center/halfExtent come out as below
		for (uint32_t i = 0; i < 3; ++i)
		{
			float upper = fIt->mParticleBounds[i * 2 + 0];
			float negativeLower = fIt->mParticleBounds[i * 2 + 1];
			cloth.mParticleBoundsCenter[i] = (upper - negativeLower) * 0.5f;
			cloth.mParticleBoundsHalfExtent[i] = (upper + negativeLower) * 0.5f;
		}

		cloth.mSleepPassCounter = fIt->mSleepPassCounter;
		cloth.mSleepTestCounter = fIt->mSleepTestCounter;
	}

	interCollision();

	NV_CLOTH_PROFILE_STOP_CROSSTHREAD(mSimulateProfileEventData, "cloth::CuSolver::simulate", 0);
}
+
+void cloth::CuSolver::interCollision()
+{
+ if (!mInterCollisionIterations || mInterCollisionDistance == 0.0f)
+ return;
+ if (mInterCollisionFilter == nullptr)
+ {
+ NV_CLOTH_LOG_WARNING("Inter collision will not work unless an inter collision filter is set using Solver::setInterCollisionFilter.");
+ return;
+ }
+
+ typedef SwInterCollision<Simd4f> SwInterCollision;
+
+ // rebuild cloth instance array
+ mInterCollisionInstances.resize(0);
+ for (uint32_t i = 0, n = mCloths.size(); i < n; ++i)
+ {
+ CuCloth& cloth = *mCloths[i];
+
+ float elasticity = 1.0f / mFrameData[i].mNumIterations;
+ NV_CLOTH_ASSERT(!cloth.mHostParticlesDirty);
+ PxVec4* particles = cloth.mParticlesHostCopy.begin();
+ uint32_t* indices = NULL, numIndices = cloth.mNumParticles;
+ if (!cloth.mSelfCollisionIndices.empty())
+ {
+ indices = cloth.mSelfCollisionIndicesHost.begin();
+ numIndices = uint32_t(cloth.mSelfCollisionIndices.size());
+ }
+
+ mInterCollisionInstances.pushBack(SwInterCollisionData(
+ particles, particles + cloth.mNumParticles, numIndices, indices, cloth.mTargetMotion,
+ cloth.mParticleBoundsCenter, cloth.mParticleBoundsHalfExtent, elasticity, cloth.mUserData));
+
+ cloth.mDeviceParticlesDirty = true;
+ }
+
+ uint32_t requiredTempMemorySize = uint32_t(SwInterCollision::estimateTemporaryMemory(
+ &mInterCollisionInstances[0], uint32_t(mInterCollisionInstances.size())));
+
+ // realloc temp memory if necessary
+ if (mInterCollisionScratchMemSize < requiredTempMemorySize)
+ {
+ if (mInterCollisionScratchMem)
+ NV_CLOTH_FREE(mInterCollisionScratchMem);
+
+ mInterCollisionScratchMem = NV_CLOTH_ALLOC(requiredTempMemorySize, "cloth::SwSolver::mInterCollisionScratchMem");
+ mInterCollisionScratchMemSize = requiredTempMemorySize;
+ }
+
+ SwKernelAllocator allocator(mInterCollisionScratchMem, mInterCollisionScratchMemSize);
+
+ // run inter-collision
+ SwInterCollision(mInterCollisionInstances.begin(), mInterCollisionInstances.size(), mInterCollisionDistance,
+ mInterCollisionStiffness, mInterCollisionIterations, mInterCollisionFilter, allocator)();
+} \ No newline at end of file