aboutsummaryrefslogtreecommitdiff
path: root/PhysX_3.4/Source/LowLevelCloth/src/windows
diff options
context:
space:
mode:
Diffstat (limited to 'PhysX_3.4/Source/LowLevelCloth/src/windows')
-rw-r--r--PhysX_3.4/Source/LowLevelCloth/src/windows/ClothClone.h225
-rw-r--r--PhysX_3.4/Source/LowLevelCloth/src/windows/CuCheckSuccess.h45
-rw-r--r--PhysX_3.4/Source/LowLevelCloth/src/windows/CuCloth.cpp511
-rw-r--r--PhysX_3.4/Source/LowLevelCloth/src/windows/CuCloth.h216
-rw-r--r--PhysX_3.4/Source/LowLevelCloth/src/windows/CuClothClone.cpp83
-rw-r--r--PhysX_3.4/Source/LowLevelCloth/src/windows/CuClothData.cpp238
-rw-r--r--PhysX_3.4/Source/LowLevelCloth/src/windows/CuClothData.h191
-rw-r--r--PhysX_3.4/Source/LowLevelCloth/src/windows/CuCollision.h1505
-rw-r--r--PhysX_3.4/Source/LowLevelCloth/src/windows/CuContextLock.cpp54
-rw-r--r--PhysX_3.4/Source/LowLevelCloth/src/windows/CuContextLock.h57
-rw-r--r--PhysX_3.4/Source/LowLevelCloth/src/windows/CuDevicePointer.h216
-rw-r--r--PhysX_3.4/Source/LowLevelCloth/src/windows/CuDeviceVector.h258
-rw-r--r--PhysX_3.4/Source/LowLevelCloth/src/windows/CuFabric.cpp197
-rw-r--r--PhysX_3.4/Source/LowLevelCloth/src/windows/CuFabric.h102
-rw-r--r--PhysX_3.4/Source/LowLevelCloth/src/windows/CuFactory.cpp398
-rw-r--r--PhysX_3.4/Source/LowLevelCloth/src/windows/CuFactory.h107
-rw-r--r--PhysX_3.4/Source/LowLevelCloth/src/windows/CuPhaseConfig.h51
-rw-r--r--PhysX_3.4/Source/LowLevelCloth/src/windows/CuPinnedAllocator.h132
-rw-r--r--PhysX_3.4/Source/LowLevelCloth/src/windows/CuSelfCollision.h472
-rw-r--r--PhysX_3.4/Source/LowLevelCloth/src/windows/CuSolver.cpp556
-rw-r--r--PhysX_3.4/Source/LowLevelCloth/src/windows/CuSolver.h180
-rw-r--r--PhysX_3.4/Source/LowLevelCloth/src/windows/CuSolverKernel.h57
22 files changed, 5851 insertions, 0 deletions
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/windows/ClothClone.h b/PhysX_3.4/Source/LowLevelCloth/src/windows/ClothClone.h
new file mode 100644
index 00000000..4f02de76
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/windows/ClothClone.h
@@ -0,0 +1,225 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#include "foundation/PxMemory.h"
+
+#include "SwFactory.h"
+#include "SwFabric.h"
+#include "SwCloth.h"
+
+#include "ClothImpl.h"
+#include "ClothBase.h"
+
+namespace physx
+{
+namespace cloth
+{
+class DxFactory;
+class CuFactory;
+
+// make range from vector
+template <typename T, typename A>
+Range<T> makeRange(shdfnd::Array<T, A>& vec)
+{
+ T* ptr = vec.empty() ? 0 : vec.begin();
+ return Range<T>(ptr, ptr + vec.size());
+}
+
+template <typename T, typename A>
+Range<const T> makeRange(const shdfnd::Array<T, A>& vec)
+{
+ const T* ptr = vec.empty() ? 0 : vec.begin();
+ return Range<const T>(ptr, ptr + vec.size());
+}
+
+// fabric conversion
+template <typename SrcClothType, typename DstFactoryType>
+typename DstFactoryType::FabricType* convertFabric(const SrcClothType& srcFabric, DstFactoryType& dstFactory)
+{
+ typedef typename DstFactoryType::FabricType DstFabricType;
+
+ // see if dstFactory already has a Fabric with this id
+ DstFabricType* const* fIt = dstFactory.mFabrics.begin();
+ DstFabricType* const* fEnd = dstFactory.mFabrics.end();
+ for(; fIt != fEnd; ++fIt)
+ if((*fIt)->mId == srcFabric.mId)
+ return *fIt; // found id, return existing fabric
+
+ // fabric does not exist so create a new one
+ Vector<uint32_t>::Type phases(srcFabric.getNumPhases());
+ Vector<uint32_t>::Type sets(srcFabric.getNumSets());
+ Vector<float>::Type restvalues(srcFabric.getNumRestvalues());
+ Vector<uint32_t>::Type indices(srcFabric.getNumIndices());
+ Vector<uint32_t>::Type anchors(srcFabric.getNumTethers());
+ Vector<float>::Type tetherLengths(srcFabric.getNumTethers());
+ Vector<uint32_t>::Type triangles(srcFabric.getNumTriangles() * 3);
+
+ Range<uint32_t> phaseRange = makeRange(phases);
+ Range<float> restvalueRange = makeRange(restvalues);
+ Range<uint32_t> setRange = makeRange(sets);
+ Range<uint32_t> indexRange = makeRange(indices);
+ Range<uint32_t> anchorRange = makeRange(anchors);
+ Range<float> lengthRange = makeRange(tetherLengths);
+ Range<uint32_t> triangleRange = makeRange(triangles);
+
+ srcFabric.mFactory.extractFabricData(srcFabric, phaseRange, setRange, restvalueRange, indexRange, anchorRange,
+ lengthRange, triangleRange);
+
+ DstFabricType* dstFabric =
+ static_cast<DstFabricType*>(dstFactory.createFabric(srcFabric.mNumParticles, phaseRange, setRange, restvalueRange,
+ indexRange, anchorRange, lengthRange, triangleRange));
+
+ // give new fabric the same id as the source so it can be matched
+ dstFabric->mId = srcFabric.mId;
+
+ return dstFabric;
+}
+
+inline Range<const PhaseConfig> getPhaseConfigs(const SwCloth& cloth)
+{
+ return makeRange(cloth.mPhaseConfigs);
+}
+inline void setPhaseConfigs(SwCloth& cloth, Range<const PhaseConfig> phaseConfigs)
+{
+ cloth.mPhaseConfigs.assign(phaseConfigs.begin(), phaseConfigs.end());
+}
+inline Range<const PxVec4> getParticleAccelerations(const SwCloth& cloth)
+{
+ return makeRange(cloth.mParticleAccelerations);
+}
+inline Range<const uint32_t> getSelfCollisionIndices(const SwCloth& cloth)
+{
+ return makeRange(cloth.mSelfCollisionIndices);
+}
+
+// cloth conversion
+template <typename DstFactoryType, typename SrcImplType>
+typename DstFactoryType::ImplType* convertCloth(DstFactoryType& dstFactory, const SrcImplType& srcImpl)
+{
+ typedef typename DstFactoryType::FabricType DstFabricType;
+ typedef typename DstFactoryType::ImplType DstImplType;
+ typedef typename DstImplType::ClothType DstClothType;
+ typedef typename SrcImplType::ClothType SrcClothType;
+
+ const SrcClothType& srcCloth = srcImpl.mCloth;
+ const Factory& srcFactory = srcCloth.mFactory;
+
+ typename DstClothType::ContextLockType dstLock(dstFactory);
+ typename SrcClothType::ContextLockType srcLock(srcCloth.mFactory);
+
+ // particles
+ MappedRange<const PxVec4> curParticles = srcImpl.getCurrentParticles();
+
+ // fabric
+ DstFabricType& dstFabric = *convertFabric(srcCloth.mFabric, dstFactory);
+
+ // create new cloth
+ DstImplType* dstImpl = static_cast<DstImplType*>(dstFactory.createCloth(curParticles, dstFabric));
+ DstClothType& dstCloth = dstImpl->mCloth;
+
+ // copy across common parameters
+ copy(dstCloth, srcCloth);
+
+ // copy across previous particles
+ MappedRange<const PxVec4> prevParticles = srcImpl.getPreviousParticles();
+ PxMemCopy(dstImpl->getPreviousParticles().begin(), prevParticles.begin(), prevParticles.size() * sizeof(PxVec4));
+
+ // copy across transformed phase configs
+ setPhaseConfigs(dstCloth, getPhaseConfigs(srcCloth));
+
+ // collision data
+ Vector<PxVec4>::Type spheres(srcImpl.getNumSpheres(), PxVec4(0.0f));
+ PxVec4* spherePtr = spheres.empty() ? 0 : &spheres.front();
+ Range<PxVec4> sphereRange(spherePtr, spherePtr + spheres.size());
+ Vector<uint32_t>::Type capsules(srcImpl.getNumCapsules() * 2);
+ Range<uint32_t> capsuleRange = makeRange(capsules);
+ Vector<PxVec4>::Type planes(srcImpl.getNumPlanes(), PxVec4(0.0f));
+ PxVec4* planePtr = planes.empty() ? 0 : &planes.front();
+ Range<PxVec4> planeRange(planePtr, planePtr + planes.size());
+ Vector<uint32_t>::Type convexes(srcImpl.getNumConvexes());
+ Range<uint32_t> convexRange = makeRange(convexes);
+ Vector<PxVec3>::Type triangles(srcImpl.getNumTriangles() * 3, PxVec3(0.0f));
+ PxVec3* trianglePtr = triangles.empty() ? 0 : &triangles.front();
+ Range<PxVec3> triangleRange(trianglePtr, trianglePtr + triangles.size());
+
+ srcFactory.extractCollisionData(srcImpl, sphereRange, capsuleRange, planeRange, convexRange, triangleRange);
+ dstImpl->setSpheres(sphereRange, 0, 0);
+ dstImpl->setCapsules(capsuleRange, 0, 0);
+ dstImpl->setPlanes(planeRange, 0, 0);
+ dstImpl->setConvexes(convexRange, 0, 0);
+ dstImpl->setTriangles(triangleRange, 0, 0);
+
+ // motion constraints, copy directly into new cloth buffer
+ if(srcImpl.getNumMotionConstraints())
+ srcFactory.extractMotionConstraints(srcImpl, dstImpl->getMotionConstraints());
+
+ // separation constraints, copy directly into new cloth buffer
+ if(srcImpl.getNumSeparationConstraints())
+ srcFactory.extractSeparationConstraints(srcImpl, dstImpl->getSeparationConstraints());
+
+ // particle accelerations
+ if(srcImpl.getNumParticleAccelerations())
+ {
+ Range<const PxVec4> accelerations = getParticleAccelerations(srcCloth);
+ PxMemCopy(dstImpl->getParticleAccelerations().begin(), accelerations.begin(),
+ accelerations.size() * sizeof(PxVec4));
+ }
+
+ // self-collision indices
+ dstImpl->setSelfCollisionIndices(getSelfCollisionIndices(srcCloth));
+
+ // rest positions
+ Vector<PxVec4>::Type restPositions(srcImpl.getNumRestPositions());
+ srcFactory.extractRestPositions(srcImpl, makeRange(restPositions));
+ dstImpl->setRestPositions(makeRange(restPositions));
+
+ // virtual particles
+ if(srcImpl.getNumVirtualParticles())
+ {
+ Vector<Vec4u>::Type indices(srcImpl.getNumVirtualParticles());
+ Vector<PxVec3>::Type weights(srcImpl.getNumVirtualParticleWeights(), PxVec3(0.0f));
+
+ uint32_t(*indicesPtr)[4] = indices.empty() ? 0 : &array(indices.front());
+ Range<uint32_t[4]> indicesRange(indicesPtr, indicesPtr + indices.size());
+
+ PxVec3* weightsPtr = weights.empty() ? 0 : &weights.front();
+ Range<PxVec3> weightsRange(weightsPtr, weightsPtr + weights.size());
+
+ srcFactory.extractVirtualParticles(srcImpl, indicesRange, weightsRange);
+
+ dstImpl->setVirtualParticles(indicesRange, weightsRange);
+ }
+
+ return dstImpl;
+}
+
+} // namespace cloth
+} // namespace physx
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/windows/CuCheckSuccess.h b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuCheckSuccess.h
new file mode 100644
index 00000000..b9ae0a53
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuCheckSuccess.h
@@ -0,0 +1,45 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#include <cuda.h>
+#include <driver_types.h>
+
+namespace physx
+{
+namespace cloth
+{
+// implemented in CuFactory.cpp
+void checkSuccessImpl(CUresult, const char*, const int);
+}
+
+// safe cuda calls
+#define checkSuccess(err) cloth::checkSuccessImpl(err, __FILE__, __LINE__)
+}
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/windows/CuCloth.cpp b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuCloth.cpp
new file mode 100644
index 00000000..6ecd1aeb
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuCloth.cpp
@@ -0,0 +1,511 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#include "CuCloth.h"
+#include "CuFabric.h"
+#include "CuFactory.h"
+#include "CuContextLock.h"
+#include "CuCheckSuccess.h"
+#include "CuClothData.h"
+#include "CuSolver.h"
+#include "TripletScheduler.h"
+#include "ClothBase.h"
+#include "Array.h"
+#include "PsFoundation.h"
+
+#if PX_VC
+#pragma warning(disable : 4365) // 'action' : conversion from 'type_1' to 'type_2', signed/unsigned mismatch
+#endif
+
+namespace physx
+{
+namespace cloth
+{
+PhaseConfig transform(const PhaseConfig&); // from PhaseConfig.cpp
+}
+}
+
+using namespace physx;
+
+namespace
+{
+bool isSelfCollisionEnabled(const cloth::CuCloth& cloth)
+{
+ return PxMin(cloth.mSelfCollisionDistance, -cloth.mSelfCollisionLogStiffness) > 0.0f;
+}
+}
+
+cloth::CuCloth::CuCloth(CuFactory& factory, CuFabric& fabric, Range<const PxVec4> particles)
+: CuContextLock(factory)
+, mFactory(factory)
+, mFabric(fabric)
+, mClothDataDirty(false)
+, mNumParticles(uint32_t(particles.size()))
+, mParticles(mFactory.mContextManager)
+, mParticlesHostCopy(CuHostAllocator(mFactory.mContextManager, cudaHostAllocMapped))
+, mDeviceParticlesDirty(false)
+, mHostParticlesDirty(true)
+, mPhaseConfigs(mFactory.mContextManager)
+, mMotionConstraints(mFactory.mContextManager)
+, mSeparationConstraints(mFactory.mContextManager)
+, mParticleAccelerations(mFactory.mContextManager)
+, mParticleAccelerationsHostCopy(CuHostAllocator(mFactory.mContextManager, cudaHostAllocMapped))
+, mCapsuleIndices(getMappedAllocator<IndexPair>(mFactory.mContextManager))
+, mStartCollisionSpheres(getMappedAllocator<PxVec4>(mFactory.mContextManager))
+, mTargetCollisionSpheres(getMappedAllocator<PxVec4>(mFactory.mContextManager))
+, mConvexMasks(getMappedAllocator<uint32_t>(mFactory.mContextManager))
+, mStartCollisionPlanes(getMappedAllocator<PxVec4>(mFactory.mContextManager))
+, mTargetCollisionPlanes(getMappedAllocator<PxVec4>(mFactory.mContextManager))
+, mStartCollisionTriangles(getMappedAllocator<PxVec3>(mFactory.mContextManager))
+, mTargetCollisionTriangles(getMappedAllocator<PxVec3>(mFactory.mContextManager))
+, mVirtualParticleSetSizes(mFactory.mContextManager)
+, mVirtualParticleIndices(mFactory.mContextManager)
+, mVirtualParticleWeights(mFactory.mContextManager)
+, mRestPositions(mFactory.mContextManager)
+, mSelfCollisionIndices(mFactory.mContextManager)
+, mSelfCollisionData(mFactory.mContextManager)
+, mSharedMemorySize(0)
+, mUserData(0)
+{
+ PX_ASSERT(!particles.empty());
+
+ initialize(*this, particles.begin(), particles.end());
+
+ mParticles.reserve(2 * mNumParticles);
+ mParticles.push_back(particles.begin(), particles.end());
+ mParticles.push_back(particles.begin(), particles.end());
+ mParticlesHostCopy.resizeUninitialized(2 * mNumParticles);
+
+ mFabric.incRefCount();
+
+ CuContextLock::release();
+}
+
+cloth::CuCloth::CuCloth(CuFactory& factory, const CuCloth& cloth)
+: CuContextLock(factory)
+, mFactory(factory)
+, mFabric(cloth.mFabric)
+, mNumParticles(cloth.mNumParticles)
+, mParticles(cloth.mParticles)
+, mParticlesHostCopy(cloth.mParticlesHostCopy)
+, mDeviceParticlesDirty(cloth.mDeviceParticlesDirty)
+, mHostParticlesDirty(cloth.mHostParticlesDirty)
+, mPhaseConfigs(cloth.mPhaseConfigs)
+, mHostPhaseConfigs(cloth.mHostPhaseConfigs)
+, mMotionConstraints(cloth.mMotionConstraints)
+, mSeparationConstraints(cloth.mSeparationConstraints)
+, mParticleAccelerations(cloth.mParticleAccelerations)
+, mParticleAccelerationsHostCopy(cloth.mParticleAccelerationsHostCopy)
+, mCapsuleIndices(cloth.mCapsuleIndices)
+, mStartCollisionSpheres(cloth.mStartCollisionSpheres)
+, mTargetCollisionSpheres(cloth.mTargetCollisionSpheres)
+, mStartCollisionPlanes(cloth.mStartCollisionPlanes)
+, mTargetCollisionPlanes(cloth.mTargetCollisionPlanes)
+, mStartCollisionTriangles(cloth.mStartCollisionTriangles)
+, mTargetCollisionTriangles(cloth.mTargetCollisionTriangles)
+, mVirtualParticleSetSizes(cloth.mVirtualParticleSetSizes)
+, mVirtualParticleIndices(cloth.mVirtualParticleIndices)
+, mVirtualParticleWeights(cloth.mVirtualParticleWeights)
+, mRestPositions(cloth.mRestPositions)
+, mSelfCollisionIndices(cloth.mSelfCollisionIndices)
+, mSelfCollisionData(mFactory.mContextManager)
+, mSharedMemorySize(cloth.mSharedMemorySize)
+, mUserData(cloth.mUserData)
+{
+ copy(*this, cloth);
+
+ mFabric.incRefCount();
+
+ CuContextLock::release();
+}
+
+cloth::CuCloth::~CuCloth()
+{
+ CuContextLock::acquire();
+
+ mFabric.decRefCount();
+}
+
+void cloth::CuCloth::notifyChanged()
+{
+ mClothDataDirty = true;
+}
+
+bool cloth::CuCloth::updateClothData(CuClothData& clothData)
+{
+ // test particle pointer to detect when cloth data array has been reordered
+ if(!mClothDataDirty && clothData.mParticles == array(*mParticles.begin().get()))
+ {
+ PX_ASSERT(mSharedMemorySize == getSharedMemorySize());
+ return false;
+ }
+
+ mSharedMemorySize = getSharedMemorySize();
+
+ if(mSelfCollisionData.empty() && isSelfCollisionEnabled(*this))
+ {
+ uint32_t numSelfCollisionIndices =
+ mSelfCollisionIndices.empty() ? mNumParticles : uint32_t(mSelfCollisionIndices.size());
+
+ uint32_t particleSize = 4 * mNumParticles;
+ uint32_t keySize = 2 * numSelfCollisionIndices; // 2x for radix buffer
+ uint32_t cellStartSize = (129 + 128 * 128 + 130) / 2 + 1; // half because type is int16_t
+
+ // use 16bit indices for cellStart array (128x128 grid)
+ mSelfCollisionData.resize(particleSize + keySize + cellStartSize);
+ checkSuccess(cuMemsetD32((mSelfCollisionData.begin() + particleSize + keySize).dev(), 0xffffffff, cellStartSize));
+ }
+
+ clothData = CuClothData(*this);
+ mClothDataDirty = false;
+
+ return true;
+}
+
+uint32_t cloth::CuCloth::getSharedMemorySize() const
+{
+ uint32_t numPhases = uint32_t(mPhaseConfigs.size());
+ uint32_t numSpheres = uint32_t(mStartCollisionSpheres.size());
+ uint32_t numCones = uint32_t(mCapsuleIndices.size());
+ uint32_t numPlanes = uint32_t(mStartCollisionPlanes.size());
+ uint32_t numConvexes = uint32_t(mConvexMasks.size());
+ uint32_t numTriangles = uint32_t(mStartCollisionTriangles.size() / 3);
+
+ uint32_t phaseConfigSize = numPhases * sizeof(CuPhaseConfig);
+
+ bool storePrevCollisionData = mEnableContinuousCollision || mFriction > 0.0f;
+ uint32_t continuousCollisionSize = storePrevCollisionData ? 4 * numSpheres + 10 * numCones : 0;
+ continuousCollisionSize += 4 * numCones + numConvexes; // capsule and convex masks
+ uint32_t discreteCollisionSize = 4 * numSpheres + PxMax(10 * numCones + 96, 208u);
+ discreteCollisionSize = PxMax(discreteCollisionSize, PxMax(4 * numPlanes, 19 * numTriangles));
+
+ // scratch memory for prefix sum and histogram
+ uint32_t selfCollisionSize = isSelfCollisionEnabled(*this) ? 544 : 0;
+
+ // see CuSolverKernel.cu::gSharedMemory comment for details
+ return phaseConfigSize + sizeof(float) * (continuousCollisionSize + PxMax(selfCollisionSize, discreteCollisionSize));
+}
+
+void cloth::CuCloth::setPhaseConfig(Range<const PhaseConfig> configs)
+{
+ mHostPhaseConfigs.assign(configs.begin(), configs.end());
+
+ Vector<CuPhaseConfig>::Type deviceConfigs;
+ deviceConfigs.reserve(configs.size());
+ const PhaseConfig* cEnd = configs.end();
+ for(const PhaseConfig* cIt = configs.begin(); cIt != cEnd; ++cIt)
+ {
+ CuPhaseConfig config;
+
+ config.mStiffness = cIt->mStiffness;
+ config.mStiffnessMultiplier = cIt->mStiffnessMultiplier;
+ config.mCompressionLimit = cIt->mCompressionLimit;
+ config.mStretchLimit = cIt->mStretchLimit;
+
+ uint16_t phaseIndex = cIt->mPhaseIndex;
+ config.mNumConstraints = mFabric.mNumConstraintsInPhase[phaseIndex];
+ config.mRestvalues = mFabric.mRestvaluesInPhase[phaseIndex].get();
+ config.mIndices = mFabric.mIndicesInPhase[phaseIndex].get();
+
+ deviceConfigs.pushBack(config);
+ }
+
+ CuContextLock contextLock(mFactory);
+ mPhaseConfigs.assign(deviceConfigs.begin(), deviceConfigs.end());
+}
+
+cloth::Range<PxVec4> cloth::CuCloth::push(cloth::CuConstraints& constraints)
+{
+ if(!constraints.mTarget.capacity())
+ {
+ CuContextLock contextLock(mFactory);
+ constraints.mTarget.reserve(mNumParticles);
+ }
+ if(constraints.mHostCopy.empty())
+ constraints.mTarget.resize(mNumParticles);
+
+ if(constraints.mStart.empty()) // initialize start first
+ constraints.mStart.swap(constraints.mTarget);
+
+ if(!constraints.mHostCopy.capacity())
+ {
+ CuContextLock contextLock(mFactory);
+ constraints.mHostCopy.reserve(mNumParticles);
+ }
+ constraints.mHostCopy.resizeUninitialized(mNumParticles);
+
+ PxVec4* data = &constraints.mHostCopy.front();
+ return Range<PxVec4>(data, data + constraints.mHostCopy.size());
+}
+
+void cloth::CuCloth::clear(cloth::CuConstraints& constraints)
+{
+ CuContextLock contextLock(mFactory);
+ CuDeviceVector<PxVec4>(mFactory.mContextManager).swap(constraints.mStart);
+ CuDeviceVector<PxVec4>(mFactory.mContextManager).swap(constraints.mTarget);
+}
+
+void cloth::CuCloth::syncDeviceParticles()
+{
+ if(mDeviceParticlesDirty)
+ {
+ CuContextLock contextLock(mFactory);
+ checkSuccess(
+ cuMemcpyHtoD(mParticles.begin().dev(), mParticlesHostCopy.begin(), 2 * mNumParticles * sizeof(PxVec4)));
+ mDeviceParticlesDirty = false;
+ }
+}
+
+void cloth::CuCloth::syncHostParticles()
+{
+ if(mHostParticlesDirty)
+ {
+ CuContextLock contextLock(mFactory);
+ const PxVec4* src = mParticles.begin().get();
+ mFactory.copyToHost(src, src + 2 * mNumParticles, mParticlesHostCopy.begin());
+ mHostParticlesDirty = false;
+ }
+}
+
+cloth::Range<const PxVec3> cloth::CuCloth::clampTriangleCount(Range<const PxVec3> range, uint32_t replaceSize)
+{
+ // clamp to 500 triangles (1500 vertices) to prevent running out of shared memory
+ uint32_t removedSize = mStartCollisionTriangles.size() - replaceSize;
+ const PxVec3* clamp = range.begin() + 1500 - removedSize;
+
+ if(range.end() > clamp)
+ {
+ shdfnd::getFoundation().error(PX_WARN, "Too many collision "
+ "triangles specified for cloth, dropping all but first 500.\n");
+ }
+
+ return Range<const PxVec3>(range.begin(), PxMin(range.end(), clamp));
+}
+
+#include "ClothImpl.h"
+
+namespace physx
+{
+namespace cloth
+{
+
+// ClothImpl<CuCloth>::clone() implemented in CuClothClone.cpp
+
+template <>
+uint32_t ClothImpl<CuCloth>::getNumParticles() const
+{
+ return mCloth.mNumParticles;
+}
+
+template <>
+void ClothImpl<CuCloth>::lockParticles() const
+{
+ const_cast<CuCloth&>(mCloth).syncHostParticles();
+}
+
+template <>
+void ClothImpl<CuCloth>::unlockParticles() const
+{
+}
+
+template <>
+MappedRange<PxVec4> ClothImpl<CuCloth>::getCurrentParticles()
+{
+ mCloth.wakeUp();
+ lockParticles();
+ mCloth.mDeviceParticlesDirty = true;
+ return getMappedParticles(mCloth.mParticlesHostCopy.begin());
+}
+
+template <>
+MappedRange<const PxVec4> ClothImpl<CuCloth>::getCurrentParticles() const
+{
+ lockParticles();
+ return getMappedParticles(mCloth.mParticlesHostCopy.begin());
+}
+
+template <>
+MappedRange<PxVec4> ClothImpl<CuCloth>::getPreviousParticles()
+{
+ mCloth.wakeUp();
+ lockParticles();
+ mCloth.mDeviceParticlesDirty = true;
+ return getMappedParticles(mCloth.mParticlesHostCopy.begin() + mCloth.mNumParticles);
+}
+
+template <>
+MappedRange<const PxVec4> ClothImpl<CuCloth>::getPreviousParticles() const
+{
+ lockParticles();
+ return getMappedParticles(mCloth.mParticlesHostCopy.begin() + mCloth.mNumParticles);
+}
+
+template <>
+GpuParticles ClothImpl<CuCloth>::getGpuParticles()
+{
+ mCloth.syncDeviceParticles();
+ mCloth.mHostParticlesDirty = true;
+ PxVec4* particles = mCloth.mParticles.begin().get();
+ GpuParticles result = { particles, particles + mCloth.mNumParticles, 0 };
+ return result;
+}
+
+template <>
+void ClothImpl<CuCloth>::setPhaseConfig(Range<const PhaseConfig> configs)
+{
+ Vector<PhaseConfig>::Type transformedConfigs;
+ transformedConfigs.reserve(configs.size());
+
+ // transform phase config to use in solver
+ for(; !configs.empty(); configs.popFront())
+ if(configs.front().mStiffness > 0.0f)
+ transformedConfigs.pushBack(transform(configs.front()));
+
+ mCloth.setPhaseConfig(Range<const PhaseConfig>(transformedConfigs.begin(), transformedConfigs.end()));
+ mCloth.notifyChanged();
+ mCloth.wakeUp();
+}
+
+template <>
+void ClothImpl<CuCloth>::setSelfCollisionIndices(Range<const uint32_t> indices)
+{
+ ContextLockType lock(mCloth.mFactory);
+ mCloth.mSelfCollisionIndices.assign(indices.begin(), indices.end());
+ mCloth.mSelfCollisionIndicesHost.assign(indices.begin(), indices.end());
+ mCloth.notifyChanged();
+ mCloth.wakeUp();
+}
+
+template <>
+uint32_t ClothImpl<CuCloth>::getNumVirtualParticles() const
+{
+ return uint32_t(mCloth.mVirtualParticleIndices.size());
+}
+
+template <>
+Range<PxVec4> ClothImpl<CuCloth>::getParticleAccelerations()
+{
+ if(mCloth.mParticleAccelerations.empty())
+ {
+ CuContextLock contextLock(mCloth.mFactory);
+ mCloth.mParticleAccelerations.resize(mCloth.mNumParticles);
+ }
+
+ if(!mCloth.mParticleAccelerationsHostCopy.capacity())
+ {
+ CuContextLock contextLock(mCloth.mFactory);
+ mCloth.mParticleAccelerationsHostCopy.reserve(mCloth.mNumParticles);
+ }
+ mCloth.mParticleAccelerationsHostCopy.resizeUninitialized(mCloth.mNumParticles);
+
+ mCloth.wakeUp();
+
+ PxVec4* data = mCloth.mParticleAccelerationsHostCopy.begin();
+ return Range<PxVec4>(data, mCloth.mParticleAccelerationsHostCopy.end());
+}
+
+template <>
+void ClothImpl<CuCloth>::clearParticleAccelerations()
+{
+ CuContextLock contextLock(mCloth.mFactory);
+ CuDeviceVector<PxVec4>(mCloth.mFactory.mContextManager).swap(mCloth.mParticleAccelerations);
+ mCloth.mParticleAccelerationsHostCopy.reset();
+ mCloth.wakeUp();
+}
+
+namespace
+{
+uint32_t calculateNumReplays(const Vector<Vec4u>::Type& triplets, const Vector<uint32_t>::Type setSizes)
+{
+ uint32_t result = 0;
+
+ Vector<Vec4u>::Type::ConstIterator tIt = triplets.begin();
+ Vector<uint32_t>::Type::ConstIterator sIt, sEnd = setSizes.end();
+ uint32_t index = 0;
+ for(sIt = setSizes.begin(); sIt != sEnd; ++sIt, ++index)
+ {
+ Vector<Vec4u>::Type::ConstIterator tEnd = tIt + *sIt, tLast = tIt;
+ while(tLast != tEnd)
+ {
+ uint8_t numConflicts[3][32] = {};
+ uint8_t numReplays[3] = {};
+
+ for(tLast += PxMin(ptrdiff_t(32), tEnd - tLast); tIt != tLast; ++tIt)
+ for(int i = 0; i < 3; ++i)
+ numReplays[i] = PxMax(numReplays[i], ++numConflicts[i][(*tIt)[i] & 31]);
+
+ result += numReplays[0] + numReplays[1] + numReplays[2];
+ }
+ }
+
+ return result;
+}
+}
+
+template <>
+void ClothImpl<CuCloth>::setVirtualParticles(Range<const uint32_t[4]> indices, Range<const PxVec3> weights)
+{
+ // shuffle indices to form independent SIMD sets
+ TripletScheduler scheduler(indices);
+ scheduler.warp(mCloth.mNumParticles, 32);
+
+ // convert to 16bit indices
+ Vector<Vec4us>::Type hostIndices;
+ hostIndices.reserve(indices.size());
+ TripletScheduler::ConstTripletIter tIt = scheduler.mTriplets.begin();
+ TripletScheduler::ConstTripletIter tEnd = scheduler.mTriplets.end();
+ for(; tIt != tEnd; ++tIt)
+ hostIndices.pushBack(Vec4us(*tIt));
+
+ // printf("num sets = %u, num replays = %u\n", scheduler.mSetSizes.size(),
+ // calculateNumReplays(scheduler.mTriplets, scheduler.mSetSizes));
+
+ // add normalization weight
+ Vector<PxVec4>::Type hostWeights;
+ hostWeights.reserve(weights.size());
+ for(; !weights.empty(); weights.popFront())
+ {
+ PxVec3 w = reinterpret_cast<const PxVec3&>(weights.front());
+ PxReal scale = 1 / w.magnitudeSquared();
+ hostWeights.pushBack(PxVec4(w.x, w.y, w.z, scale));
+ }
+
+ CuContextLock contextLock(mCloth.mFactory);
+
+ // todo: 'swap' these to force reallocation?
+ mCloth.mVirtualParticleIndices = hostIndices;
+ mCloth.mVirtualParticleSetSizes = scheduler.mSetSizes;
+ mCloth.mVirtualParticleWeights = hostWeights;
+
+ mCloth.notifyChanged();
+ mCloth.wakeUp();
+}
+
+} // namespace cloth
+} // namespace physx
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/windows/CuCloth.h b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuCloth.h
new file mode 100644
index 00000000..257d490c
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuCloth.h
@@ -0,0 +1,216 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#include "foundation/PxTransform.h"
+#include "foundation/PxVec4.h"
+#include "Range.h"
+#include "PhaseConfig.h"
+#include "MovingAverage.h"
+#include "IndexPair.h"
+#include "BoundingBox.h"
+#include "Vec4T.h"
+#include "CuPhaseConfig.h"
+#include "CuPinnedAllocator.h"
+#include "CuContextLock.h"
+#include "CuDeviceVector.h"
+
+namespace physx
+{
+namespace cloth
+{
+
+class CuFabric;
+class CuFactory;
+struct CuClothData;
+
// Double-buffered constraint storage (start and target of the current frame)
// in device memory, plus a pinned host-side staging copy.
struct CuConstraints
{
	// Allocates the device vectors on the given CUDA context; the host copy
	// uses mapped pinned memory (cudaHostAllocMapped) so the device can read
	// it directly.
	CuConstraints(physx::PxCudaContextManager* ctx)
	: mStart(ctx), mTarget(ctx), mHostCopy(CuHostAllocator(ctx, cudaHostAllocMapped))
	{
	}

	// Advance one frame: the previous target buffer becomes the new start and
	// the target is emptied. No-op when no target was set.
	void pop()
	{
		if(!mTarget.empty())
		{
			mStart.swap(mTarget);
			mTarget.resize(0);
		}
	}

	CuDeviceVector<PxVec4> mStart;  // constraints at the start of the frame
	CuDeviceVector<PxVec4> mTarget; // constraints to interpolate towards
	CuPinnedVector<PxVec4>::Type mHostCopy; // pinned host staging buffer
};
+
// CUDA-backed cloth instance: all per-cloth simulation state, split between
// device memory (CuDeviceVector) and pinned host buffers (CuPinnedVector).
// CUDA counterpart of SwCloth. Derives from CuContextLock so the CUDA
// context can be held while device vectors are (de)allocated.
class CuCloth : protected CuContextLock
{
  public:
	// NOTE(review): declared without a visible definition -- presumably
	// intended to disable assignment; confirm against the .cpp.
	CuCloth& operator=(const CuCloth&);
	typedef CuFactory FactoryType;
	typedef CuFabric FabricType;
	typedef CuContextLock ContextLockType;

	// pinned host vectors exposed to the shared ClothImpl code
	typedef CuPinnedVector<PxVec4>::Type& MappedVec4fVectorType;
	typedef CuPinnedVector<IndexPair>::Type& MappedIndexVectorType;

	CuCloth(CuFactory&, CuFabric&, Range<const PxVec4>);
	CuCloth(CuFactory&, const CuCloth&);
	~CuCloth(); // not virtual on purpose

  public:
	// asleep once enough consecutive sleep tests have passed
	bool isSleeping() const
	{
		return mSleepPassCounter >= mSleepAfterCount;
	}
	void wakeUp()
	{
		mSleepPassCounter = 0;
	}

	void notifyChanged();

	bool updateClothData(CuClothData&); // expects acquired context
	uint32_t getSharedMemorySize() const; // without particle data

	// expects transformed configs, doesn't call notifyChanged()
	void setPhaseConfig(Range<const PhaseConfig>);

	// access to the double-buffered motion/separation constraints
	Range<PxVec4> push(CuConstraints&);
	void clear(CuConstraints&);

	// copy particle data across when one side is dirty
	void syncDeviceParticles();
	void syncHostParticles();

	Range<const PxVec3> clampTriangleCount(Range<const PxVec3>, uint32_t);

  public:
	CuFactory& mFactory;
	CuFabric& mFabric;

	bool mClothDataDirty; // device-side CuClothData needs refreshing

	// particle data
	uint32_t mNumParticles;
	CuDeviceVector<PxVec4> mParticles; // cur, prev
	CuPinnedVector<PxVec4>::Type mParticlesHostCopy;
	bool mDeviceParticlesDirty;
	bool mHostParticlesDirty;

	// axis-aligned bounds of the particles, center/half-extent form
	PxVec3 mParticleBoundsCenter;
	PxVec3 mParticleBoundsHalfExtent;

	PxVec3 mGravity;
	PxVec3 mLogDamping;
	PxVec3 mLinearLogDrag;
	PxVec3 mAngularLogDrag;
	PxVec3 mLinearInertia;
	PxVec3 mAngularInertia;
	PxVec3 mCentrifugalInertia;
	float mSolverFrequency;
	float mStiffnessFrequency;

	PxTransform mTargetMotion;
	PxTransform mCurrentMotion;
	PxVec3 mLinearVelocity;
	PxVec3 mAngularVelocity;

	float mPrevIterDt;
	MovingAverage mIterDtAvg;

	CuDeviceVector<CuPhaseConfig> mPhaseConfigs; // transformed!
	Vector<PhaseConfig>::Type mHostPhaseConfigs; // transformed!

	// tether constraints stuff
	float mTetherConstraintLogStiffness;
	float mTetherConstraintScale;

	// motion constraints stuff
	CuConstraints mMotionConstraints;
	float mMotionConstraintScale;
	float mMotionConstraintBias;
	float mMotionConstraintLogStiffness;

	// separation constraints stuff
	CuConstraints mSeparationConstraints;

	// particle acceleration stuff
	CuDeviceVector<PxVec4> mParticleAccelerations;
	CuPinnedVector<PxVec4>::Type mParticleAccelerationsHostCopy;

	// wind
	PxVec3 mWind;
	float mDragLogCoefficient;
	float mLiftLogCoefficient;

	// collision stuff (start/target pairs allow shape interpolation per frame)
	CuPinnedVector<IndexPair>::Type mCapsuleIndices;
	CuPinnedVector<PxVec4>::Type mStartCollisionSpheres;
	CuPinnedVector<PxVec4>::Type mTargetCollisionSpheres;
	CuPinnedVector<uint32_t>::Type mConvexMasks;
	CuPinnedVector<PxVec4>::Type mStartCollisionPlanes;
	CuPinnedVector<PxVec4>::Type mTargetCollisionPlanes;
	CuPinnedVector<PxVec3>::Type mStartCollisionTriangles;
	CuPinnedVector<PxVec3>::Type mTargetCollisionTriangles;
	bool mEnableContinuousCollision;
	float mCollisionMassScale;
	float mFriction;

	// virtual particles
	CuDeviceVector<uint32_t> mVirtualParticleSetSizes;
	CuDeviceVector<Vec4us> mVirtualParticleIndices;
	CuDeviceVector<PxVec4> mVirtualParticleWeights;

	// self collision
	float mSelfCollisionDistance;
	float mSelfCollisionLogStiffness;

	CuDeviceVector<PxVec4> mRestPositions;
	CuDeviceVector<uint32_t> mSelfCollisionIndices;
	Vector<uint32_t>::Type mSelfCollisionIndicesHost;

	// 4 (position) + 2 (key) per particle + cellStart (8322)
	CuDeviceVector<float> mSelfCollisionData;

	// sleeping (see SwCloth for comments)
	uint32_t mSleepTestInterval;
	uint32_t mSleepAfterCount;
	float mSleepThreshold;
	uint32_t mSleepPassCounter;
	uint32_t mSleepTestCounter;

	uint32_t mSharedMemorySize;

	void* mUserData;
};
+}
+}
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/windows/CuClothClone.cpp b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuClothClone.cpp
new file mode 100644
index 00000000..8b234968
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuClothClone.cpp
@@ -0,0 +1,83 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#include "ClothClone.h"
+
+#include "CuFactory.h"
+#include "CuFabric.h"
+#include "CuCloth.h"
+
+namespace physx
+{
+namespace cloth
+{
// Accessor shims over CuCloth state, presumably consumed by the generic
// clone helpers in ClothClone.h -- they expose host-side mirrors only.
Range<const PhaseConfig> getPhaseConfigs(const CuCloth& cloth)
{
	return makeRange(cloth.mHostPhaseConfigs);
}
void setPhaseConfigs(CuCloth& cloth, Range<const PhaseConfig> phaseConfigs)
{
	cloth.setPhaseConfig(phaseConfigs);
}
// returns the pinned host mirror, not the device buffer
Range<const PxVec4> getParticleAccelerations(const CuCloth& cloth)
{
	return makeRange(cloth.mParticleAccelerationsHostCopy);
}
Range<const uint32_t> getSelfCollisionIndices(const CuCloth& cloth)
{
	return makeRange(cloth.mSelfCollisionIndicesHost);
}
+
+template <>
+Cloth* ClothImpl<CuCloth>::clone(Factory& factory) const
+{
+ if(&mCloth.mFactory == &factory)
+ return new ClothImpl<CuCloth>(factory, *this); // copy construct directly
+
+ switch(factory.getPlatform())
+ {
+ case Factory::CPU:
+ return convertCloth(static_cast<SwFactory&>(factory), *this);
+ case Factory::CUDA:
+ return convertCloth(static_cast<CuFactory&>(factory), *this);
+ default:
+ return NULL;
+ }
+}
+
+Cloth* CuFactory::clone(const Cloth& cloth)
+{
+ if(cloth.getFactory().getPlatform() == Factory::CPU)
+ return convertCloth(*this, static_cast<const SwClothImpl&>(cloth));
+
+ return cloth.clone(*this);
+}
+
+} // namespace cloth
+} // namespace physx
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/windows/CuClothData.cpp b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuClothData.cpp
new file mode 100644
index 00000000..5a1485c6
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuClothData.cpp
@@ -0,0 +1,238 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#include "CuClothData.h"
+#include "CuCloth.h"
+#include "CuFabric.h"
+#include "CuCheckSuccess.h"
+#include "CuContextLock.h"
+#include "IterationState.h"
+
+using namespace physx;
+
// Builds the POD parameter block consumed by the CUDA solver kernel from a
// CuCloth: raw device pointers, element counts and scalar coefficients.
// Device pointers are extracted via begin().get(), so the CUDA context is
// presumably expected to be acquired by the caller -- TODO confirm.
cloth::CuClothData::CuClothData(CuCloth& cloth)
{
	mNumParticles = cloth.mNumParticles;
	mParticles = array(*cloth.mParticles.begin().get()); // device PxVec4* viewed as float*

	mParticlesHostCopy = array(*getDevicePointer(cloth.mParticlesHostCopy));

	mNumPhases = uint32_t(cloth.mPhaseConfigs.size());
	mPhaseConfigs = cloth.mPhaseConfigs.begin().get();

	mTethers = cloth.mFabric.mTethers.begin().get();
	mNumTethers = uint32_t(cloth.mFabric.mTethers.size());
	// fold the fabric's tether length scale into the per-cloth scale
	mTetherConstraintScale = cloth.mTetherConstraintScale * cloth.mFabric.mTetherLengthScale;

	mTriangles = cloth.mFabric.mTriangles.begin().get();
	mNumTriangles = uint32_t(cloth.mFabric.mTriangles.size()) / 3; // 3 indices per triangle

	mMotionConstraintScale = cloth.mMotionConstraintScale;
	mMotionConstraintBias = cloth.mMotionConstraintBias;

	mNumSpheres = uint32_t(cloth.mStartCollisionSpheres.size());
	mNumCapsules = uint32_t(cloth.mCapsuleIndices.size());
	mCapsuleIndices = getDevicePointer(cloth.mCapsuleIndices);

	mNumPlanes = uint32_t(cloth.mStartCollisionPlanes.size());
	mNumConvexes = uint32_t(cloth.mConvexMasks.size());
	mConvexMasks = getDevicePointer(cloth.mConvexMasks);

	mNumCollisionTriangles = uint32_t(cloth.mStartCollisionTriangles.size()) / 3; // 3 vertices per triangle

	mVirtualParticleSetSizesBegin = cloth.mVirtualParticleSetSizes.begin().get();
	mVirtualParticleSetSizesEnd = mVirtualParticleSetSizesBegin + cloth.mVirtualParticleSetSizes.size();
	mVirtualParticleIndices = array(*cloth.mVirtualParticleIndices.begin().get());
	mVirtualParticleWeights = array(*cloth.mVirtualParticleWeights.begin().get());

	mEnableContinuousCollision = cloth.mEnableContinuousCollision;
	mCollisionMassScale = cloth.mCollisionMassScale;
	mFrictionScale = cloth.mFriction;

	mSelfCollisionDistance = cloth.mSelfCollisionDistance;
	// a null index pointer means every particle takes part in self collision
	mSelfCollisionIndices = cloth.mSelfCollisionIndices.empty() ? 0 : cloth.mSelfCollisionIndices.begin().get();
	mNumSelfCollisionIndices = mSelfCollisionIndices ? uint32_t(cloth.mSelfCollisionIndices.size()) : mNumParticles;

	if(!cloth.mSelfCollisionData.empty())
	{
		// carve the single float buffer into particle positions, sort keys
		// and the cell-start table (layout documented on mSelfCollisionData)
		uint32_t keySize = 2 * mNumSelfCollisionIndices;
		uint32_t particleSize = 4 * mNumParticles;

		mSelfCollisionParticles = cloth.mSelfCollisionData.begin().get();
		mSelfCollisionKeys = (uint32_t*)(mSelfCollisionParticles + particleSize);
		mSelfCollisionCellStart = (uint16_t*)(mSelfCollisionKeys + keySize);
	}
	else
	{
		mSelfCollisionParticles = 0;
		mSelfCollisionKeys = 0;
		mSelfCollisionCellStart = 0;
	}

	mSleepTestInterval = cloth.mSleepTestInterval;
	mSleepAfterCount = cloth.mSleepAfterCount;
	mSleepThreshold = cloth.mSleepThreshold;
}
+
// Per-frame snapshot handed to the CUDA solver kernel: converts log-space
// per-second stiffness parameters to per-iteration factors and gathers the
// device/host pointers for constraints, accelerations and collision shapes.
cloth::CuFrameData::CuFrameData(CuCloth& cloth, uint32_t numSharedPositions, const IterationState<Simd4f>& state,
                                const CuIterationData* iterationData)
{
	mDeviceParticlesDirty = cloth.mDeviceParticlesDirty;

	mNumSharedPositions = numSharedPositions;

	mIterDt = state.mIterDt;
	mNumIterations = state.mRemainingIterations;
	mIterationData = iterationData;

	// per-iteration stiffness = 1 - 2^(logStiffness * stiffnessFrequency * dt),
	// computed for all three channels at once in one SIMD register
	Simd4f logStiffness = simd4f(0.0f, cloth.mSelfCollisionLogStiffness, cloth.mMotionConstraintLogStiffness,
	                             cloth.mTetherConstraintLogStiffness);
	Simd4f stiffnessExponent = simd4f(cloth.mStiffnessFrequency * mIterDt);
	Simd4f stiffness = gSimd4fOne - exp2(logStiffness * stiffnessExponent);

	mTetherConstraintStiffness = array(stiffness)[3];
	mMotionConstraintStiffness = array(stiffness)[2];
	mSelfCollisionStiffness = array(stiffness)[1];

	// same conversion for the wind drag/lift coefficients
	logStiffness = simd4f(cloth.mDragLogCoefficient, cloth.mLiftLogCoefficient, 0.0f, 0.0f);
	stiffness = gSimd4fOne - exp2(logStiffness * stiffnessExponent);
	mDragCoefficient = array(stiffness)[0];
	mLiftCoefficient = array(stiffness)[1];
	// flatten the 3x3 rotation matrix (3 SIMD rows) into 9 floats
	for(int i = 0; i < 9; ++i)
		mRotation[i] = array(state.mRotationMatrix[i / 3])[i % 3];

	// start/target motion constraints: when no separate target buffer exists,
	// start and target end up pointing at the same buffer (or both stay null)
	mTargetMotionConstraints = 0;
	if(!cloth.mMotionConstraints.mStart.empty())
	{
		mTargetMotionConstraints = array(*cloth.mMotionConstraints.mStart.begin().get());
	}

	mStartMotionConstraints = mTargetMotionConstraints;
	if(!cloth.mMotionConstraints.mTarget.empty())
	{
		mTargetMotionConstraints = array(*cloth.mMotionConstraints.mTarget.begin().get());
	}

	mHostMotionConstraints = array(*getDevicePointer(cloth.mMotionConstraints.mHostCopy));

	// separation constraints follow the same start/target aliasing scheme
	mTargetSeparationConstraints = 0;
	if(!cloth.mSeparationConstraints.mStart.empty())
	{
		mTargetSeparationConstraints = array(*cloth.mSeparationConstraints.mStart.begin().get());
	}

	mStartSeparationConstraints = mTargetSeparationConstraints;
	if(!cloth.mSeparationConstraints.mTarget.empty())
	{
		mTargetSeparationConstraints = array(*cloth.mSeparationConstraints.mTarget.begin().get());
	}

	mHostSeparationConstraints = array(*getDevicePointer(cloth.mSeparationConstraints.mHostCopy));

	mParticleAccelerations = 0;
	if(!cloth.mParticleAccelerations.empty())
	{
		mParticleAccelerations = array(*cloth.mParticleAccelerations.begin().get());
	}

	mHostParticleAccelerations = array(*getDevicePointer(cloth.mParticleAccelerationsHostCopy));

	mRestPositions = 0;
	if(!cloth.mRestPositions.empty())
	{
		mRestPositions = array(*cloth.mRestPositions.begin().get());
	}

	// collision shapes: fall back to the start buffer when no animation
	// target was provided
	mStartCollisionSpheres = array(*getDevicePointer(cloth.mStartCollisionSpheres));
	mTargetCollisionSpheres = array(*getDevicePointer(cloth.mTargetCollisionSpheres));

	if(!mTargetCollisionSpheres)
		mTargetCollisionSpheres = mStartCollisionSpheres;

	mStartCollisionPlanes = array(*getDevicePointer(cloth.mStartCollisionPlanes));
	mTargetCollisionPlanes = array(*getDevicePointer(cloth.mTargetCollisionPlanes));

	if(!mTargetCollisionPlanes)
		mTargetCollisionPlanes = mStartCollisionPlanes;

	mStartCollisionTriangles = array(*getDevicePointer(cloth.mStartCollisionTriangles));
	mTargetCollisionTriangles = array(*getDevicePointer(cloth.mTargetCollisionTriangles));

	if(!mTargetCollisionTriangles)
		mTargetCollisionTriangles = mStartCollisionTriangles;

	// pack bounds as (maxX, -minX, maxY, ...): max = c + r, -min = r - c
	for(uint32_t i = 0; i < 3; ++i)
	{
		float c = cloth.mParticleBoundsCenter[i];
		float r = cloth.mParticleBoundsHalfExtent[i];
		mParticleBounds[i * 2 + 0] = r + c;
		mParticleBounds[i * 2 + 1] = r - c;
	}

	mSleepPassCounter = cloth.mSleepPassCounter;
	mSleepTestCounter = cloth.mSleepTestCounter;

	mStiffnessExponent = cloth.mStiffnessFrequency * mIterDt;
}
+
namespace
{
// Copies a 3x3 matrix stored as three 4-float rows (source stride 4) into a
// densely packed, transposed 3x3 destination (destination stride 3).
void copySquareTransposed(float* dst, const float* src)
{
	for(int col = 0; col < 3; ++col)
		for(int row = 0; row < 3; ++row)
			dst[col * 3 + row] = src[row * 4 + col];
}
}
+
+cloth::CuIterationData::CuIterationData(const IterationState<Simd4f>& state)
+{
+ mIntegrationTrafo[0] = array(state.mPrevBias)[0];
+ mIntegrationTrafo[1] = array(state.mPrevBias)[1];
+ mIntegrationTrafo[2] = array(state.mPrevBias)[2];
+
+ mIntegrationTrafo[3] = array(state.mCurBias)[0];
+ mIntegrationTrafo[4] = array(state.mCurBias)[1];
+ mIntegrationTrafo[5] = array(state.mCurBias)[2];
+
+ copySquareTransposed(mIntegrationTrafo + 6, array(*state.mPrevMatrix));
+ copySquareTransposed(mIntegrationTrafo + 15, array(*state.mCurMatrix));
+
+ mWind[0] = array(state.mWind)[0];
+ mWind[1] = array(state.mWind)[1];
+ mWind[2] = array(state.mWind)[2];
+
+ mIsTurning = state.mIsTurning ? 0x3F800000u : 0; // 1.0f to avoid ftz
+}
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/windows/CuClothData.h b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuClothData.h
new file mode 100644
index 00000000..0be66742
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuClothData.h
@@ -0,0 +1,191 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#include "Types.h"
+#ifndef __CUDACC__
+#include "Simd.h"
+#endif
+
+namespace physx
+{
+namespace cloth
+{
+
+class CuCloth;
+struct CuPhaseConfig;
+template <typename>
+struct IterationState;
+struct IndexPair;
+struct CuIterationData;
+struct CuTether;
+
// reference to cloth instance bulk data (POD)
// should not need frequent updates (stored on device)
struct CuClothData
{
	// intentionally empty: members are left uninitialized; the
	// CuClothData(CuCloth&) overload fills everything in
	CuClothData()
	{
	}
	CuClothData(CuCloth&);

	// particle data
	uint32_t mNumParticles;
	float* mParticles; // device buffer (cur, prev), PxVec4 viewed as float*
	float* mParticlesHostCopy; // pinned host mirror

	// fabric constraints
	uint32_t mNumPhases;
	const CuPhaseConfig* mPhaseConfigs;

	const CuTether* mTethers;
	uint32_t mNumTethers;
	float mTetherConstraintScale; // per-cloth scale times fabric length scale

	const uint16_t* mTriangles;
	uint32_t mNumTriangles; // index count / 3

	// motion constraint data
	float mMotionConstraintScale;
	float mMotionConstraintBias;

	// collision data
	uint32_t mNumSpheres; // don't change this order, it's
	uint32_t mNumCapsules; // needed by mergeAcceleration()
	const IndexPair* mCapsuleIndices;
	uint32_t mNumPlanes;
	uint32_t mNumConvexes;
	const uint32_t* mConvexMasks;
	uint32_t mNumCollisionTriangles; // vertex count / 3

	// virtual particle data
	const uint32_t* mVirtualParticleSetSizesBegin;
	const uint32_t* mVirtualParticleSetSizesEnd;
	const uint16_t* mVirtualParticleIndices;
	const float* mVirtualParticleWeights;

	bool mEnableContinuousCollision;
	float mCollisionMassScale;
	float mFrictionScale;

	// self collision; null mSelfCollisionIndices means all particles collide
	float mSelfCollisionDistance;
	uint32_t mNumSelfCollisionIndices;
	const uint32_t* mSelfCollisionIndices;
	float* mSelfCollisionParticles; // scratch buffer, carved up in the ctor
	uint32_t* mSelfCollisionKeys;
	uint16_t* mSelfCollisionCellStart;

	// sleep data
	uint32_t mSleepTestInterval;
	uint32_t mSleepAfterCount;
	float mSleepThreshold;
};
+
// per-frame data (stored in pinned memory)
struct CuFrameData
{
	CuFrameData()
	{
	} // not initializing pointers to 0!

#ifndef __CUDACC__
	// builds the frame snapshot from a CuCloth; see CuClothData.cpp
	explicit CuFrameData(CuCloth&, uint32_t, const IterationState<Simd4f>&, const CuIterationData*);
#endif

	bool mDeviceParticlesDirty;

	// number of particle copies that fit in shared memory (0, 1, or 2)
	uint32_t mNumSharedPositions;

	// iteration data
	float mIterDt;
	uint32_t mNumIterations;
	const CuIterationData* mIterationData;

	// per-iteration stiffness, converted from log space in the ctor
	float mTetherConstraintStiffness;

	// wind data
	float mDragCoefficient;
	float mLiftCoefficient;
	float mRotation[9]; // 3x3 frame rotation, row-flattened

	// motion constraint data (start may alias target; see ctor)
	const float* mStartMotionConstraints;
	float* mTargetMotionConstraints;
	const float* mHostMotionConstraints;
	float mMotionConstraintStiffness;

	// separation constraint data
	const float* mStartSeparationConstraints;
	float* mTargetSeparationConstraints;
	const float* mHostSeparationConstraints;

	// particle acceleration data
	float* mParticleAccelerations;
	const float* mHostParticleAccelerations;

	// rest positions (null when not used)
	const float* mRestPositions;

	// collision data; target pointers fall back to start when no target set
	const float* mStartCollisionSpheres;
	const float* mTargetCollisionSpheres;
	const float* mStartCollisionPlanes;
	const float* mTargetCollisionPlanes;
	const float* mStartCollisionTriangles;
	const float* mTargetCollisionTriangles;

	float mSelfCollisionStiffness;

	float mParticleBounds[6]; // maxX, -minX, maxY, ...

	uint32_t mSleepPassCounter;
	uint32_t mSleepTestCounter;

	float mStiffnessExponent; // mStiffnessFrequency * mIterDt
};
+
// per-iteration data (stored in pinned memory)
struct CuIterationData
{
	CuIterationData()
	{
	} // not initializing!

#ifndef __CUDACC__
	// built from the solver's IterationState; see CuClothData.cpp
	explicit CuIterationData(const IterationState<Simd4f>&);
#endif

	// packed as [prevBias(3), curBias(3), prevMatrix(9), curMatrix(9)],
	// matrices stored transposed (see copySquareTransposed)
	float mIntegrationTrafo[24];
	float mWind[3];
	uint32_t mIsTurning; // 0x3F800000 (1.0f) when turning, 0 otherwise
};
+}
+}
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/windows/CuCollision.h b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuCollision.h
new file mode 100644
index 00000000..cd28a999
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuCollision.h
@@ -0,0 +1,1505 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#ifndef CU_SOLVER_KERNEL_CU
+#error include CuCollision.h only from CuSolverKernel.cu
+#endif
+
+#include "IndexPair.h"
+
+namespace
+{
+// Capsule/sphere/cone, convex and triangle collision for the CUDA cloth
+// solver. One instance lives in shared memory (gCollideParticles below);
+// all methods are executed cooperatively by the entire thread block.
+struct CuCollision
+{
+	// Per-shape-kind candidate sets: bit i set = shape i may intersect.
+	struct ShapeMask
+	{
+		uint32_t mSpheres;
+		uint32_t mCones;
+
+		// intersect two candidate sets component-wise
+		__device__ friend ShapeMask& operator&=(ShapeMask& left, const ShapeMask& right)
+		{
+			left.mSpheres = left.mSpheres & right.mSpheres;
+			left.mCones = left.mCones & right.mCones;
+			return left;
+		}
+	};
+
+	// Structure-of-arrays shape data in shared memory. The 14 pointers are
+	// assigned by pointer arithmetic in allocate() and the constructor, so
+	// the member order here is load-bearing -- do not reorder.
+	struct CollisionData
+	{
+		Pointer<Shared, float> mSphereX; // sphere centers (x/y/z) ...
+		Pointer<Shared, float> mSphereY;
+		Pointer<Shared, float> mSphereZ;
+		Pointer<Shared, float> mSphereW; // ... and radii
+
+		// tapered-capsule ("cone") data derived from sphere pairs in
+		// generateCones()
+		Pointer<Shared, float> mConeCenterX;
+		Pointer<Shared, float> mConeCenterY;
+		Pointer<Shared, float> mConeCenterZ;
+		Pointer<Shared, float> mConeRadius;
+		Pointer<Shared, float> mConeAxisX;
+		Pointer<Shared, float> mConeAxisY;
+		Pointer<Shared, float> mConeAxisZ;
+		Pointer<Shared, float> mConeSlope;
+		Pointer<Shared, float> mConeSqrCosine;
+		Pointer<Shared, float> mConeHalfLength;
+	};
+
+  public:
+	// cooperatively partitions 'scratchPtr' into indices/masks/shape data
+	__device__ CuCollision(Pointer<Shared, uint32_t>);
+
+	// run one collision iteration; alpha interpolates start->target shapes
+	template <typename CurrentT, typename PreviousT>
+	__device__ void operator()(CurrentT& current, PreviousT& previous, float alpha);
+
+  private:
+	// build per-axis cell occupancy masks of the acceleration grid
+	__device__ void buildSphereAcceleration(const CollisionData&);
+	__device__ void buildConeAcceleration();
+	__device__ void mergeAcceleration();
+
+	// returns false when there are no spheres or bounds don't intersect
+	template <typename CurrentT>
+	__device__ bool buildAcceleration(const CurrentT&, float);
+
+	// grid lookups returning candidate shape sets for a point / a segment
+	__device__ static ShapeMask readShapeMask(const float&, Pointer<Shared, const uint32_t>);
+	template <typename CurPos>
+	__device__ ShapeMask getShapeMask(const CurPos&) const;
+	template <typename PrevPos, typename CurPos>
+	__device__ ShapeMask getShapeMask(const PrevPos&, const CurPos&) const;
+
+	// per-particle discrete / continuous capsule tests (delta, velocity out)
+	template <typename CurPos>
+	__device__ int32_t collideCapsules(const CurPos&, float3&, float3&) const;
+	template <typename PrevPos, typename CurPos>
+	__device__ int32_t collideCapsules(const PrevPos&, CurPos&, float3&, float3&) const;
+
+	// block-wide drivers iterating over all particles
+	template <typename CurrentT, typename PreviousT>
+	__device__ void collideCapsules(CurrentT& current, PreviousT& previous) const;
+	template <typename CurrentT, typename PreviousT>
+	__device__ void collideVirtualCapsules(CurrentT& current, PreviousT& previous) const;
+	template <typename CurrentT, typename PreviousT>
+	__device__ void collideContinuousCapsules(CurrentT& current, PreviousT& previous) const;
+
+	template <typename CurrentT, typename PreviousT>
+	__device__ void collideConvexes(CurrentT& current, PreviousT& previous, float alpha);
+	template <typename CurPos>
+	__device__ int32_t collideConvexes(const CurPos&, float3&) const;
+
+	template <typename CurrentT>
+	__device__ void collideTriangles(CurrentT& current, float alpha);
+	template <typename CurrentT>
+	__device__ void collideTriangles(CurrentT& current, int32_t i);
+
+  public:
+	Pointer<Shared, uint32_t> mCapsuleIndices; // 2 sphere indices per capsule
+	Pointer<Shared, uint32_t> mCapsuleMasks;   // per capsule: [first-sphere bit, both-spheres bits]
+	Pointer<Shared, uint32_t> mConvexMasks;    // per-convex mask copied from CuClothData
+
+	// shape data of the previous and current iteration; laid out
+	// back-to-back in shared memory (operator() copies cur -> prev linearly)
+	CollisionData mPrevData;
+	CollisionData mCurData;
+
+	// acceleration structure
+	// 12 rows of sGridSize cell masks: sphere max x/y/z, sphere min x/y/z,
+	// cone max x/y/z, cone min x/y/z (see build*Acceleration / merge).
+	// NOTE: mGridScale/mGridBias are also reused back-to-back as a 6-float
+	// scratch area (maxX,-minX,maxY,...) in buildAcceleration().
+	Pointer<Shared, uint32_t> mShapeGrid;
+	float mGridScale[3]; // cell = pos * scale + bias, per axis
+	float mGridBias[3];
+	static const uint32_t sGridSize = 8;
+};
+
+// Exchange the values of a and b (device-side swap helper used instead of
+// std::swap in this kernel code).
+template <typename T>
+__device__ void swap(T& a, T& b)
+{
+	T tmp(b);
+	b = a;
+	a = tmp;
+}
+}
+
+__shared__ uninitialized<CuCollision> gCollideParticles;
+
+namespace
+{
+// Assign the 15 consecutive shared-memory array base pointers starting at
+// data.mSphereX: the first 4 arrays (sphere x/y/z/w) get mNumSpheres
+// entries each, the following cone arrays mNumCapsules entries each.
+// initializes one pointer past data! (the 15th pointer is the struct member
+// following the CollisionData instance -- see the ctor's mShapeGrid note)
+__device__ void allocate(CuCollision::CollisionData& data)
+{
+	if(threadIdx.x < 15)
+	{
+		// treat the 14 Pointer members (+1 past) as an array; thread i
+		// computes the running offset of array i
+		Pointer<Shared, float>* ptr = &data.mSphereX;
+		ptr[threadIdx.x] = *ptr + threadIdx.x * gClothData.mNumCapsules +
+		                   min(threadIdx.x, 4) * (gClothData.mNumSpheres - gClothData.mNumCapsules);
+	}
+}
+
+// Interpolate the collision spheres between start and target pose by alpha
+// and store them transposed (AoS xyzw -> SoA x..x y..y z..z w..w) into
+// data.mSphereX; one thread per scalar component.
+__device__ void generateSpheres(CuCollision::CollisionData& data, float alpha)
+{
+	// interpolate spheres and transpose
+	if(threadIdx.x < gClothData.mNumSpheres * 4)
+	{
+		// __ldg: read-only cached loads of the host-provided sphere data
+		float start = __ldg(gFrameData.mStartCollisionSpheres + threadIdx.x);
+		float target = __ldg(gFrameData.mTargetCollisionSpheres + threadIdx.x);
+		float value = start + (target - start) * alpha;
+		// every 4th component is the radius; clamp to non-negative
+		if(threadIdx.x % 4 == 3)
+			value = max(value, 0.0f);
+		// transposed destination index within the SoA layout
+		int32_t j = threadIdx.x % 4 * gClothData.mNumSpheres + threadIdx.x / 4;
+		data.mSphereX[j] = value;
+	}
+
+	__syncthreads(); // make the sphere data visible to the whole block
+}
+
+// Build one tapered capsule ("cone") per capsule from its two end spheres:
+// center, unit axis, half length, radius at center, slope (radius change
+// per unit length) and squared cosine of the taper angle. iIt points at the
+// two sphere indices of this thread's capsule.
+__device__ void generateCones(CuCollision::CollisionData& data, Pointer<Shared, const uint32_t> iIt)
+{
+	// generate cones
+	if(threadIdx.x < gClothData.mNumCapsules)
+	{
+		uint32_t firstIndex = iIt[0];
+		uint32_t secondIndex = iIt[1];
+
+		float firstX = data.mSphereX[firstIndex];
+		float firstY = data.mSphereY[firstIndex];
+		float firstZ = data.mSphereZ[firstIndex];
+		float firstW = data.mSphereW[firstIndex];
+
+		float secondX = data.mSphereX[secondIndex];
+		float secondY = data.mSphereY[secondIndex];
+		float secondZ = data.mSphereZ[secondIndex];
+		float secondW = data.mSphereW[secondIndex];
+
+		// half difference of centers and radii: axis (x/y/z) + radius slope (w)
+		float axisX = (secondX - firstX) * 0.5f;
+		float axisY = (secondY - firstY) * 0.5f;
+		float axisZ = (secondZ - firstZ) * 0.5f;
+		float axisW = (secondW - firstW) * 0.5f;
+
+		float sqrAxisLength = axisX * axisX + axisY * axisY + axisZ * axisZ;
+		// negative/zero when one sphere contains the other (no cone surface)
+		float sqrConeLength = sqrAxisLength - axisW * axisW;
+
+		float invAxisLength = rsqrtf(sqrAxisLength);
+		float invConeLength = rsqrtf(sqrConeLength);
+
+		// degenerate capsule: zero everything so the cone is culled later
+		// (buildConeAcceleration skips zero-radius cones)
+		if(sqrConeLength <= 0.0f)
+			invAxisLength = invConeLength = 0.0f;
+
+		float axisLength = sqrAxisLength * invAxisLength;
+
+		data.mConeCenterX[threadIdx.x] = (secondX + firstX) * 0.5f;
+		data.mConeCenterY[threadIdx.x] = (secondY + firstY) * 0.5f;
+		data.mConeCenterZ[threadIdx.x] = (secondZ + firstZ) * 0.5f;
+		data.mConeRadius[threadIdx.x] = (axisW + firstW) * invConeLength * axisLength;
+
+		data.mConeAxisX[threadIdx.x] = axisX * invAxisLength;
+		data.mConeAxisY[threadIdx.x] = axisY * invAxisLength;
+		data.mConeAxisZ[threadIdx.x] = axisZ * invAxisLength;
+		data.mConeSlope[threadIdx.x] = axisW * invConeLength;
+
+		// cos^2 of the taper angle from sin = radius delta / axis length
+		float sine = axisW * invAxisLength;
+		data.mConeSqrCosine[threadIdx.x] = 1 - sine * sine;
+		data.mConeHalfLength[threadIdx.x] = axisLength;
+	}
+
+	__syncthreads(); // make the cone data visible to the whole block
+}
+}
+
+// Cooperatively partition the shared-memory scratch area and copy static
+// collision data. Layout: [capsule indices | capsule masks | convex masks |
+// prev CollisionData arrays | cur CollisionData arrays | shape grid].
+__device__ CuCollision::CuCollision(Pointer<Shared, uint32_t> scratchPtr)
+{
+	int32_t numCapsules2 = 2 * gClothData.mNumCapsules;
+	int32_t numCapsules4 = 4 * gClothData.mNumCapsules;
+	int32_t numConvexes = gClothData.mNumConvexes;
+
+	if(threadIdx.x < 3)
+	{
+		// mCapsuleIndices/mCapsuleMasks/mConvexMasks are consecutive members:
+		// give each its scratch sub-range of numCapsules2 words
+		(&mCapsuleIndices)[threadIdx.x] = scratchPtr + threadIdx.x * numCapsules2;
+		// pointer-layout trick: CollisionData holds 14 pointers, so
+		// mShapeGrid[-14] is mCurData.mSphereX and mShapeGrid[-28] is
+		// mPrevData.mSphereX; this seeds all three bases with the start of
+		// the shape-data region, allocate() then spreads the rest
+		(&mShapeGrid)[-14 * int32_t(threadIdx.x)] = scratchPtr + numCapsules4 + numConvexes;
+	}
+
+	Pointer<Shared, uint32_t> indexPtr = scratchPtr + threadIdx.x;
+	if(threadIdx.x < numCapsules2)
+	{
+		// copy the two sphere indices of each capsule
+		uint32_t index = (&gClothData.mCapsuleIndices->first)[threadIdx.x];
+		*indexPtr = index;
+
+		// even mask entry: bit of the capsule's first sphere; odd entry: bits
+		// of both spheres (odd lanes OR in the even neighbour's write --
+		// the pair (2k, 2k+1) is always in one warp; volatile to make the
+		// neighbour's store observable, warp-synchronous style)
+		volatile uint32_t* maskPtr = generic(indexPtr + numCapsules2);
+		*maskPtr = 1u << index;
+		*maskPtr |= maskPtr[-int32_t(threadIdx.x & 1)];
+	}
+	indexPtr += numCapsules4;
+
+	// copy per-convex masks
+	if(threadIdx.x < numConvexes)
+		*indexPtr = gClothData.mConvexMasks[threadIdx.x];
+
+	if(gClothData.mEnableContinuousCollision || gClothData.mFrictionScale > 0.0f)
+	{
+		// ccd/friction need last iteration's shapes: prebuild them from the
+		// start pose (alpha = 0)
+		allocate(mPrevData);
+
+		__syncthreads(); // mPrevData raw hazard
+
+		generateSpheres(mPrevData, 0.0f);
+		generateCones(mPrevData, mCapsuleIndices + 2 * threadIdx.x);
+	}
+
+	allocate(mCurData); // also initializes mShapeGrid (!)
+}
+
+// Run one collision iteration for the whole block: convex and triangle
+// collision, then (if the acceleration grid is non-empty) discrete or
+// continuous capsule collision, finally save current shape data for the
+// next iteration's ccd/friction velocities.
+template <typename CurrentT, typename PreviousT>
+__device__ void CuCollision::operator()(CurrentT& current, PreviousT& previous, float alpha)
+{
+	// if(current.w > 0) current.w = previous.w (see SwSolverKernel::computeBounds())
+	for(int32_t i = threadIdx.x; i < gClothData.mNumParticles; i += blockDim.x)
+	{
+		if(current(i, 3) > 0.0f)
+			current(i, 3) = previous(i, 3);
+	}
+
+	collideConvexes(current, previous, alpha);
+	collideTriangles(current, alpha);
+
+	// false when there are no spheres or shape/particle bounds don't overlap
+	if(buildAcceleration(current, alpha))
+	{
+		if(gClothData.mEnableContinuousCollision)
+			collideContinuousCapsules(current, previous);
+		else
+			collideCapsules(current, previous);
+
+		collideVirtualCapsules(current, previous);
+	}
+
+	// sync otherwise first threads overwrite sphere data before
+	// remaining ones have had a chance to use it leading to incorrect
+	// velocity calculation for friction / ccd
+
+	__syncthreads();
+
+	if(gClothData.mEnableContinuousCollision || gClothData.mFrictionScale > 0.0f)
+	{
+		// store current collision data for next iteration; prev and cur
+		// arrays are contiguous, so one strided copy covers all 14 arrays
+		Pointer<Shared, float> dstIt = mPrevData.mSphereX + threadIdx.x;
+		Pointer<Shared, const float> srcIt = mCurData.mSphereX + threadIdx.x;
+		for(; dstIt < mCurData.mSphereX; dstIt += blockDim.x, srcIt += blockDim.x)
+			*dstIt = *srcIt;
+	}
+
+	// __syncthreads() called in updateSleepState()
+}
+
+// build per-axis mask arrays of spheres on the right/left of grid cell
+__device__ void CuCollision::buildSphereAcceleration(const CollisionData& data)
+{
+ if(threadIdx.x >= 192)
+ return;
+
+ int32_t sphereIdx = threadIdx.x & 31;
+ int32_t axisIdx = threadIdx.x >> 6; // coordinate index (x, y, or z)
+ int32_t signi = threadIdx.x << 26 & 0x80000000; // sign bit (min or max)
+
+ float signf = copysignf(1.0f, reinterpret_cast<const float&>(signi));
+ float pos = signf * data.mSphereW[sphereIdx] + data.mSphereX[sphereIdx + gClothData.mNumSpheres * axisIdx];
+
+ // use overflow so we can test for non-positive
+ uint32_t index = signi - uint32_t(floorf(pos * mGridScale[axisIdx] + mGridBias[axisIdx]));
+
+ axisIdx += (uint32_t(signi) >> 31) * 3;
+ Pointer<Shared, uint32_t> dst = mShapeGrid + sGridSize * axisIdx;
+ // #pragma unroll
+ for(int32_t i = 0; i < sGridSize; ++i, ++index)
+ dst[i] |= __ballot(int32_t(index) <= 0);
+}
+
+// generate cone masks from sphere masks
+__device__ void CuCollision::buildConeAcceleration()
+{
+ if(threadIdx.x >= 192)
+ return;
+
+ int32_t coneIdx = threadIdx.x & 31;
+
+ uint32_t sphereMask =
+ mCurData.mConeRadius[coneIdx] && coneIdx < gClothData.mNumCapsules ? mCapsuleMasks[2 * coneIdx + 1] : 0;
+
+ int32_t offset = threadIdx.x / 32 * sGridSize;
+ Pointer<Shared, uint32_t> src = mShapeGrid + offset;
+ Pointer<Shared, uint32_t> dst = src + 6 * sGridSize;
+
+ // #pragma unroll
+ for(int32_t i = 0; i < sGridSize; ++i)
+ dst[i] |= __ballot(src[i] & sphereMask);
+}
+
+// convert right/left mask arrays into single overlap array
+__device__ void CuCollision::mergeAcceleration()
+{
+ if(threadIdx.x < sGridSize * 12)
+ {
+ Pointer<Shared, uint32_t> dst = mShapeGrid + threadIdx.x;
+ if(!(gClothData.mEnableContinuousCollision || threadIdx.x * 43 & 1024))
+ *dst &= dst[sGridSize * 3]; // above is same as 'threadIdx.x/24 & 1'
+
+ // mask garbage bits from build*Acceleration
+ int32_t shapeIdx = threadIdx.x >= sGridSize * 6; // spheres=0, cones=1
+ *dst &= (1 << (&gClothData.mNumSpheres)[shapeIdx]) - 1;
+ }
+}
+
+namespace
+{
+#if __CUDA_ARCH__ >= 300
+// Max-reduce the 32 values in the calling warp's buffer slots using warp
+// shuffles (no shared-memory traffic); the final maximum ends up in lane 0,
+// higher lanes return partial results.
+__device__ float mergeBounds(Pointer<Shared, float> buffer)
+{
+	float value = *buffer;
+	value = max(value, __shfl_down(value, 1));
+	value = max(value, __shfl_down(value, 2));
+	value = max(value, __shfl_down(value, 4));
+	value = max(value, __shfl_down(value, 8));
+	return max(value, __shfl_down(value, 16));
+}
+#else
+// Pre-sm_30 fallback: warp-synchronous tree reduction through volatile
+// shared memory; lane 0's slot accumulates the warp maximum.
+__device__ float mergeBounds(Pointer<Shared, float> buffer)
+{
+	// ensure that writes to buffer are visible to all threads
+	__threadfence_block();
+
+	volatile float* ptr = generic(buffer);
+	*ptr = max(*ptr, ptr[16]);
+	*ptr = max(*ptr, ptr[8]);
+	*ptr = max(*ptr, ptr[4]);
+	*ptr = max(*ptr, ptr[2]);
+	return max(*ptr, ptr[1]);
+}
+#endif
+// computes maxX, -minX, maxY, ... with a stride of 32, threadIdx.x must be < 192
+__device__ float computeSphereBounds(const CuCollision::CollisionData& data, Pointer<Shared, float> buffer)
+{
+ assert(threadIdx.x < 192);
+
+ int32_t sphereIdx = min(threadIdx.x & 31, gClothData.mNumSpheres - 1); // sphere index
+ int32_t axisIdx = threadIdx.x >> 6; // coordinate index (x, y, or z)
+ int32_t signi = threadIdx.x << 26; // sign bit (min or max)
+ float signf = copysignf(1.0f, reinterpret_cast<const float&>(signi));
+
+ *buffer = data.mSphereW[sphereIdx] + signf * data.mSphereX[sphereIdx + gClothData.mNumSpheres * axisIdx];
+
+ return mergeBounds(buffer);
+}
+
+#if __CUDA_ARCH__ >= 300
+// Compute the particle bounds as maxX, -minX, maxY, ... (one value per
+// 32-thread slot for threadIdx.x < 192, like computeSphereBounds). Threads
+// split into 3 warp-aligned groups, one per axis; each group strides over
+// all particles, shuffle-reduces per warp, parks one partial max/min per
+// warp in 'buffer', and the first 6 warps then reduce those partials.
+template <typename CurrentT>
+__device__ float computeParticleBounds(const CurrentT& current, Pointer<Shared, float> buffer)
+{
+	// * 342 >> 10 is / 3, then rounded down to a multiple of 32
+	int32_t numThreadsPerAxis = blockDim.x * 342 >> 10 & ~31; // same as / 3
+	int32_t axis = (threadIdx.x >= numThreadsPerAxis) + (threadIdx.x >= 2 * numThreadsPerAxis);
+	int32_t threadIdxInAxis = threadIdx.x - axis * numThreadsPerAxis;
+	int laneIdx = threadIdx.x & 31;
+
+	if(threadIdxInAxis < numThreadsPerAxis)
+	{
+		typename CurrentT::ConstPointerType posIt = current[axis];
+		// clamp so spare threads re-read the last particle (harmless for max/min)
+		int32_t i = min(threadIdxInAxis, gClothData.mNumParticles - 1);
+		float minX = posIt[i], maxX = minX;
+		while(i += numThreadsPerAxis, i < gClothData.mNumParticles)
+		{
+			float posX = posIt[i];
+			minX = min(minX, posX);
+			maxX = max(maxX, posX);
+		}
+
+		// warp-wide min/max reduction; lane 0 holds the result
+		minX = min(minX, __shfl_down(minX, 1));
+		maxX = max(maxX, __shfl_down(maxX, 1));
+		minX = min(minX, __shfl_down(minX, 2));
+		maxX = max(maxX, __shfl_down(maxX, 2));
+		minX = min(minX, __shfl_down(minX, 4));
+		maxX = max(maxX, __shfl_down(maxX, 4));
+		minX = min(minX, __shfl_down(minX, 8));
+		maxX = max(maxX, __shfl_down(maxX, 8));
+		minX = min(minX, __shfl_down(minX, 16));
+		maxX = max(maxX, __shfl_down(maxX, 16));
+
+		if(!laneIdx)
+		{
+			// park one partial per warp: per axis 64 slots, max partials at
+			// [0..15], negated min partials at [32..47]
+			Pointer<Shared, float> dst = buffer - threadIdx.x + (threadIdxInAxis >> 5) + (axis << 6);
+			dst[0] = maxX;
+			dst[32] = -minX;
+		}
+	}
+
+	__syncthreads();
+
+	if(threadIdx.x >= 192)
+		return 0.0f;
+
+	// warp w of the first 6 reduces one (axis, side) group of partials;
+	// lanes beyond the number of warps per axis are neutralized
+	float value = *buffer;
+	if(laneIdx >= (numThreadsPerAxis >> 5))
+		value = -FLT_MAX;
+
+	// blockDim.x <= 3*512, increase to 3*1024 by adding a shfl by 16
+	assert(numThreadsPerAxis <= 16 * 32);
+
+	value = max(value, __shfl_down(value, 1));
+	value = max(value, __shfl_down(value, 2));
+	value = max(value, __shfl_down(value, 4));
+	return max(value, __shfl_down(value, 8));
+}
+#else
+// Pre-sm_30 fallback: 6-warp layout as computeSphereBounds; each lane
+// strides over the particles with stride 32, accumulating the signed max in
+// its buffer slot, then the warp reduces via mergeBounds.
+template <typename CurrentT>
+__device__ float computeParticleBounds(const CurrentT& current, Pointer<Shared, float> buffer)
+{
+	if(threadIdx.x >= 192)
+		return 0.0f;
+
+	int32_t axisIdx = threadIdx.x >> 6; // x, y, or z
+	int32_t signi = threadIdx.x << 26; // sign bit (min or max)
+	float signf = copysignf(1.0f, reinterpret_cast<const float&>(signi));
+
+	typename CurrentT::ConstPointerType pIt = current[axisIdx];
+	typename CurrentT::ConstPointerType pEnd = pIt + gClothData.mNumParticles;
+	pIt += min(threadIdx.x & 31, gClothData.mNumParticles - 1);
+
+	*buffer = *pIt * signf;
+	while(pIt += 32, pIt < pEnd)
+		*buffer = max(*buffer, *pIt * signf);
+
+	return mergeBounds(buffer);
+}
+#endif
+}
+
+// build mask of spheres/cones touching a regular grid along each axis
+template <typename CurrentT>
+__device__ bool CuCollision::buildAcceleration(const CurrentT& current, float alpha)
+{
+ // use still unused cone data as buffer for bounds computation
+ Pointer<Shared, float> buffer = mCurData.mConeCenterX + threadIdx.x;
+ float curParticleBounds = computeParticleBounds(current, buffer);
+ int32_t warpIdx = threadIdx.x >> 5;
+
+ if(!gClothData.mNumSpheres)
+ {
+ if(threadIdx.x < 192 && !(threadIdx.x & 31))
+ gFrameData.mParticleBounds[warpIdx] = curParticleBounds;
+ return false;
+ }
+
+ generateSpheres(mCurData, alpha);
+
+ if(threadIdx.x < 192)
+ {
+ float sphereBounds = computeSphereBounds(mCurData, buffer);
+ float particleBounds = curParticleBounds;
+ if(gClothData.mEnableContinuousCollision)
+ {
+ sphereBounds = max(sphereBounds, computeSphereBounds(mPrevData, buffer));
+ float prevParticleBounds = gFrameData.mParticleBounds[warpIdx];
+ particleBounds = max(particleBounds, prevParticleBounds);
+ }
+
+ float bounds = min(sphereBounds, particleBounds);
+ float expandedBounds = bounds + abs(bounds) * 1e-4f;
+
+ // store bounds data in shared memory
+ if(!(threadIdx.x & 31))
+ {
+ mGridScale[warpIdx] = expandedBounds;
+ gFrameData.mParticleBounds[warpIdx] = curParticleBounds;
+ }
+ }
+
+ __syncthreads(); // mGridScale raw hazard
+
+ if(threadIdx.x < 3)
+ {
+ float negativeLower = mGridScale[threadIdx.x * 2 + 1];
+ float edgeLength = mGridScale[threadIdx.x * 2] + negativeLower;
+ float divisor = max(edgeLength, FLT_EPSILON);
+ mGridScale[threadIdx.x] = __fdividef(sGridSize - 1e-3, divisor);
+ mGridBias[threadIdx.x] = negativeLower * mGridScale[threadIdx.x];
+ if(edgeLength < 0.0f)
+ mGridScale[0] = 0.0f; // mark empty intersection
+ }
+
+ // initialize sphere *and* cone grid to 0
+ if(threadIdx.x < 2 * 6 * sGridSize)
+ mShapeGrid[threadIdx.x] = 0;
+
+ __syncthreads(); // mGridScale raw hazard
+
+ // generate cones even if test below fails because
+ // continuous collision might need it in next iteration
+ generateCones(mCurData, mCapsuleIndices + 2 * threadIdx.x);
+
+ if(mGridScale[0] == 0.0f)
+ return false; // early out for empty intersection
+
+ if(gClothData.mEnableContinuousCollision)
+ buildSphereAcceleration(mPrevData);
+ buildSphereAcceleration(mCurData);
+ __syncthreads(); // mCurData raw hazard
+
+ buildConeAcceleration();
+ __syncthreads(); // mShapeGrid raw hazard
+
+ mergeAcceleration();
+ __syncthreads(); // mShapeGrid raw hazard
+
+ return true;
+}
+
+// Look up the sphere/cone masks of the grid cell containing 'position'
+// (already in grid coordinates). Cells at or past the upper end produce an
+// empty mask via outMask; '& (sGridSize - 1)' wraps the index so the shared
+// memory read stays in bounds either way. The cone row of the same axis
+// sits 6 rows (sGridSize * 6 words) after the sphere row.
+__device__ CuCollision::ShapeMask CuCollision::readShapeMask(const float& position,
+                                                             Pointer<Shared, const uint32_t> sphereGrid)
+{
+	ShapeMask result;
+	int32_t index = int32_t(floorf(position));
+	// all-ones when index >= sGridSize, zero otherwise
+	uint32_t outMask = (index < sGridSize) - 1;
+
+	Pointer<Shared, const uint32_t> gridPtr = sphereGrid + (index & sGridSize - 1);
+	result.mSpheres = gridPtr[0] & ~outMask;
+	result.mCones = gridPtr[sGridSize * 6] & ~outMask;
+
+	return result;
+}
+
+// lookup acceleration structure and return mask of potential intersectors
+template <typename CurPos>
+__device__ CuCollision::ShapeMask CuCollision::getShapeMask(const CurPos& positions) const
+{
+ ShapeMask result;
+
+ result = readShapeMask(positions.x * mGridScale[0] + mGridBias[0], mShapeGrid);
+ result &= readShapeMask(positions.y * mGridScale[1] + mGridBias[1], mShapeGrid + 8);
+ result &= readShapeMask(positions.z * mGridScale[2] + mGridBias[2], mShapeGrid + 16);
+
+ return result;
+}
+
+// Segment query for continuous collision: candidates are shapes whose cells
+// overlap the axis-aligned box spanned by prevPos and curPos. The box's max
+// corner (clamped to the last cell) is tested against the max-side rows
+// (offsets 0/8/16) and the min corner (clamped to cell 0) against the
+// min-side rows (offsets 24/32/40), which ccd keeps unmerged.
+template <typename PrevPos, typename CurPos>
+__device__ CuCollision::ShapeMask CuCollision::getShapeMask(const PrevPos& prevPos, const CurPos& curPos) const
+{
+	ShapeMask result;
+
+	float prevX = prevPos.x * mGridScale[0] + mGridBias[0];
+	float prevY = prevPos.y * mGridScale[1] + mGridBias[1];
+	float prevZ = prevPos.z * mGridScale[2] + mGridBias[2];
+
+	float curX = curPos.x * mGridScale[0] + mGridBias[0];
+	float curY = curPos.y * mGridScale[1] + mGridBias[1];
+	float curZ = curPos.z * mGridScale[2] + mGridBias[2];
+
+	float maxX = min(max(prevX, curX), 7.0f);
+	float maxY = min(max(prevY, curY), 7.0f);
+	float maxZ = min(max(prevZ, curZ), 7.0f);
+
+	result = readShapeMask(maxX, mShapeGrid);
+	result &= readShapeMask(maxY, mShapeGrid + 8);
+	result &= readShapeMask(maxZ, mShapeGrid + 16);
+
+	float minX = max(min(prevX, curX), 0.0f);
+	float minY = max(min(prevY, curY), 0.0f);
+	float minZ = max(min(prevZ, curZ), 0.0f);
+
+	result &= readShapeMask(minX, mShapeGrid + 24);
+	result &= readShapeMask(minY, mShapeGrid + 32);
+	result &= readShapeMask(minZ, mShapeGrid + 40);
+
+	return result;
+}
+
+// Discrete capsule collision for one particle position. Accumulates the
+// push-out correction in 'delta' and, when friction is enabled, the summed
+// shape velocity (cur - prev sphere positions) in 'velocity'; returns the
+// number of shapes that produced a contact. Cones are tested first and
+// prune the sphere set: end spheres are only tested where the cone test
+// does not cover them.
+template <typename CurPos>
+__device__ int32_t CuCollision::collideCapsules(const CurPos& positions, float3& delta, float3& velocity) const
+{
+	ShapeMask shapeMask = getShapeMask(positions);
+
+	delta.x = delta.y = delta.z = 0.0f;
+	velocity.x = velocity.y = velocity.z = 0.0f;
+
+	int32_t numCollisions = 0;
+
+	bool frictionEnabled = gClothData.mFrictionScale > 0.0f;
+
+	// cone collision -- iterate set bits, clearing the lowest each pass
+	for(; shapeMask.mCones; shapeMask.mCones &= shapeMask.mCones - 1)
+	{
+		int32_t j = __ffs(shapeMask.mCones) - 1;
+
+		float deltaX = positions.x - mCurData.mConeCenterX[j];
+		float deltaY = positions.y - mCurData.mConeCenterY[j];
+		float deltaZ = positions.z - mCurData.mConeCenterZ[j];
+
+		float axisX = mCurData.mConeAxisX[j];
+		float axisY = mCurData.mConeAxisY[j];
+		float axisZ = mCurData.mConeAxisZ[j];
+		float slope = mCurData.mConeSlope[j];
+
+		// dot = signed distance along the axis; radius = cone radius there;
+		// sqrDistance = squared distance from the axis
+		float dot = deltaX * axisX + deltaY * axisY + deltaZ * axisZ;
+		float radius = max(dot * slope + mCurData.mConeRadius[j], 0.0f);
+		float sqrDistance = deltaX * deltaX + deltaY * deltaY + deltaZ * deltaZ - dot * dot;
+
+		Pointer<Shared, const uint32_t> mIt = mCapsuleMasks + 2 * j;
+		uint32_t bothMask = mIt[1];
+
+		// outside the cone's surface: neither end sphere can collide either
+		if(sqrDistance > radius * radius)
+		{
+			shapeMask.mSpheres &= ~bothMask;
+			continue;
+		}
+
+		sqrDistance = max(sqrDistance, FLT_EPSILON);
+		float invDistance = rsqrtf(sqrDistance);
+
+		// base = axial position of the contact on the slanted surface
+		float base = dot + slope * sqrDistance * invDistance;
+
+		float halfLength = mCurData.mConeHalfLength[j];
+		uint32_t leftMask = base < -halfLength;
+		uint32_t rightMask = base > halfLength;
+
+		uint32_t firstMask = mIt[0];
+		uint32_t secondMask = firstMask ^ bothMask;
+
+		// 'mask - 1' is all-ones when the contact is NOT past that end, in
+		// which case the cone handles it and the end sphere is culled;
+		// past an end the sphere test below takes over
+		shapeMask.mSpheres &= ~(firstMask & leftMask - 1);
+		shapeMask.mSpheres &= ~(secondMask & rightMask - 1);
+
+		if(!leftMask && !rightMask)
+		{
+			// radial component of the offset from the axis
+			deltaX = deltaX - base * axisX;
+			deltaY = deltaY - base * axisY;
+			deltaZ = deltaZ - base * axisZ;
+
+			// push out radially; cos^2 factors in the slanted surface normal
+			float sqrCosine = mCurData.mConeSqrCosine[j];
+			float scale = radius * invDistance * sqrCosine - sqrCosine;
+
+			delta.x = delta.x + deltaX * scale;
+			delta.y = delta.y + deltaY * scale;
+			delta.z = delta.z + deltaZ * scale;
+
+			if(frictionEnabled)
+			{
+				int32_t s0 = mCapsuleIndices[2 * j];
+				int32_t s1 = mCapsuleIndices[2 * j + 1];
+
+				// load previous sphere pos
+				float s0vx = mCurData.mSphereX[s0] - mPrevData.mSphereX[s0];
+				float s0vy = mCurData.mSphereY[s0] - mPrevData.mSphereY[s0];
+				float s0vz = mCurData.mSphereZ[s0] - mPrevData.mSphereZ[s0];
+
+				float s1vx = mCurData.mSphereX[s1] - mPrevData.mSphereX[s1];
+				float s1vy = mCurData.mSphereY[s1] - mPrevData.mSphereY[s1];
+				float s1vz = mCurData.mSphereZ[s1] - mPrevData.mSphereZ[s1];
+
+				// interpolate velocity between the two spheres
+				float t = dot * 0.5f + 0.5f;
+
+				velocity.x += s0vx + t * (s1vx - s0vx);
+				velocity.y += s0vy + t * (s1vy - s0vy);
+				velocity.z += s0vz + t * (s1vz - s0vz);
+			}
+
+			++numCollisions;
+		}
+	}
+
+	// sphere collision
+	for(; shapeMask.mSpheres; shapeMask.mSpheres &= shapeMask.mSpheres - 1)
+	{
+		int32_t j = __ffs(shapeMask.mSpheres) - 1;
+
+		float deltaX = positions.x - mCurData.mSphereX[j];
+		float deltaY = positions.y - mCurData.mSphereY[j];
+		float deltaZ = positions.z - mCurData.mSphereZ[j];
+
+		// FLT_EPSILON keeps rsqrtf finite at the sphere center
+		float sqrDistance = FLT_EPSILON + deltaX * deltaX + deltaY * deltaY + deltaZ * deltaZ;
+		float relDistance = rsqrtf(sqrDistance) * mCurData.mSphereW[j];
+
+		// relDistance = radius / distance; > 1 means the particle is inside
+		if(relDistance > 1.0f)
+		{
+			// push the particle out to the surface along the center offset
+			float scale = relDistance - 1.0f;
+
+			delta.x = delta.x + deltaX * scale;
+			delta.y = delta.y + deltaY * scale;
+			delta.z = delta.z + deltaZ * scale;
+
+			if(frictionEnabled)
+			{
+				velocity.x += mCurData.mSphereX[j] - mPrevData.mSphereX[j];
+				velocity.y += mCurData.mSphereY[j] - mPrevData.mSphereY[j];
+				velocity.z += mCurData.mSphereZ[j] - mPrevData.mSphereZ[j];
+			}
+
+			++numCollisions;
+		}
+	}
+
+	return numCollisions;
+}
+
+// ccd skin threshold: (1 - w)^2 - 1 with w = 0.2, i.e. the relative squared
+// radius change of the outer "skin" -- trajectories that only touch this
+// skin skip continuous collision (see the discriminant tests below)
+static const __device__ float gSkeletonWidth = (1 - 0.2f) * (1 - 0.2f) - 1;
+
+template <typename PrevPos, typename CurPos>
+__device__ int32_t
+CuCollision::collideCapsules(const PrevPos& prevPos, CurPos& curPos, float3& delta, float3& velocity) const
+{
+ ShapeMask shapeMask = getShapeMask(prevPos, curPos);
+
+ delta.x = delta.y = delta.z = 0.0f;
+ velocity.x = velocity.y = velocity.z = 0.0f;
+
+ int32_t numCollisions = 0;
+ bool frictionEnabled = gClothData.mFrictionScale > 0.0f;
+
+ // cone collision
+ for(; shapeMask.mCones; shapeMask.mCones &= shapeMask.mCones - 1)
+ {
+ int32_t j = __ffs(shapeMask.mCones) - 1;
+
+ float prevAxisX = mPrevData.mConeAxisX[j];
+ float prevAxisY = mPrevData.mConeAxisY[j];
+ float prevAxisZ = mPrevData.mConeAxisZ[j];
+ float prevSlope = mPrevData.mConeSlope[j];
+
+ float prevX = prevPos.x - mPrevData.mConeCenterX[j];
+ float prevY = prevPos.y - mPrevData.mConeCenterY[j];
+ float prevZ = prevPos.z - mPrevData.mConeCenterZ[j];
+ float prevT = prevY * prevAxisZ - prevZ * prevAxisY;
+ float prevU = prevZ * prevAxisX - prevX * prevAxisZ;
+ float prevV = prevX * prevAxisY - prevY * prevAxisX;
+ float prevDot = prevX * prevAxisX + prevY * prevAxisY + prevZ * prevAxisZ;
+ float prevRadius = max(prevDot * prevSlope + mCurData.mConeRadius[j], 0.0f);
+
+ float curAxisX = mCurData.mConeAxisX[j];
+ float curAxisY = mCurData.mConeAxisY[j];
+ float curAxisZ = mCurData.mConeAxisZ[j];
+ float curSlope = mCurData.mConeSlope[j];
+
+ float curX = curPos.x - mCurData.mConeCenterX[j];
+ float curY = curPos.y - mCurData.mConeCenterY[j];
+ float curZ = curPos.z - mCurData.mConeCenterZ[j];
+ float curT = curY * curAxisZ - curZ * curAxisY;
+ float curU = curZ * curAxisX - curX * curAxisZ;
+ float curV = curX * curAxisY - curY * curAxisX;
+ float curDot = curX * curAxisX + curY * curAxisY + curZ * curAxisZ;
+ float curRadius = max(curDot * curSlope + mCurData.mConeRadius[j], 0.0f);
+
+ float curSqrDistance = FLT_EPSILON + curT * curT + curU * curU + curV * curV;
+
+ float dotPrevPrev = prevT * prevT + prevU * prevU + prevV * prevV - prevRadius * prevRadius;
+ float dotPrevCur = prevT * curT + prevU * curU + prevV * curV - prevRadius * curRadius;
+ float dotCurCur = curSqrDistance - curRadius * curRadius;
+
+ float discriminant = dotPrevCur * dotPrevCur - dotCurCur * dotPrevPrev;
+ float sqrtD = sqrtf(discriminant);
+ float halfB = dotPrevCur - dotPrevPrev;
+ float minusA = dotPrevCur - dotCurCur + halfB;
+
+ // time of impact or 0 if prevPos inside cone
+ float toi = __fdividef(min(0.0f, halfB + sqrtD), minusA);
+ bool hasCollision = toi < 1.0f && halfB < sqrtD;
+
+ // skip continuous collision if the (un-clamped) particle
+ // trajectory only touches the outer skin of the cone.
+ float rMin = prevRadius + halfB * minusA * (curRadius - prevRadius);
+ hasCollision = hasCollision && (discriminant > minusA * rMin * rMin * gSkeletonWidth);
+
+ // a is negative when one cone is contained in the other,
+ // which is already handled by discrete collision.
+ hasCollision = hasCollision && minusA < -FLT_EPSILON;
+
+ if(hasCollision)
+ {
+ float deltaX = prevX - curX;
+ float deltaY = prevY - curY;
+ float deltaZ = prevZ - curZ;
+
+ // interpolate delta at toi
+ float posX = prevX - deltaX * toi;
+ float posY = prevY - deltaY * toi;
+ float posZ = prevZ - deltaZ * toi;
+
+ float curHalfLength = mCurData.mConeHalfLength[j];
+ float curScaledAxisX = curAxisX * curHalfLength;
+ float curScaledAxisY = curAxisY * curHalfLength;
+ float curScaledAxisZ = curAxisZ * curHalfLength;
+
+ float prevHalfLength = mPrevData.mConeHalfLength[j];
+ float deltaScaledAxisX = curScaledAxisX - prevAxisX * prevHalfLength;
+ float deltaScaledAxisY = curScaledAxisY - prevAxisY * prevHalfLength;
+ float deltaScaledAxisZ = curScaledAxisZ - prevAxisZ * prevHalfLength;
+
+ float oneMinusToi = 1.0f - toi;
+
+ // interpolate axis at toi
+ float axisX = curScaledAxisX - deltaScaledAxisX * oneMinusToi;
+ float axisY = curScaledAxisY - deltaScaledAxisY * oneMinusToi;
+ float axisZ = curScaledAxisZ - deltaScaledAxisZ * oneMinusToi;
+ float slope = prevSlope * oneMinusToi + curSlope * toi;
+
+ float sqrHalfLength = axisX * axisX + axisY * axisY + axisZ * axisZ;
+ float invHalfLength = rsqrtf(sqrHalfLength);
+ float dot = (posX * axisX + posY * axisY + posZ * axisZ) * invHalfLength;
+
+ float sqrDistance = posX * posX + posY * posY + posZ * posZ - dot * dot;
+ float invDistance = sqrDistance > 0.0f ? rsqrtf(sqrDistance) : 0.0f;
+
+ float base = dot + slope * sqrDistance * invDistance;
+ float scale = base * invHalfLength;
+
+ if(abs(scale) < 1.0f)
+ {
+ deltaX = deltaX + deltaScaledAxisX * scale;
+ deltaY = deltaY + deltaScaledAxisY * scale;
+ deltaZ = deltaZ + deltaScaledAxisZ * scale;
+
+ // reduce ccd impulse if (clamped) particle trajectory stays in cone skin,
+ // i.e. scale by exp2(-k) or 1/(1+k) with k = (tmin - toi) / (1 - toi)
+ float minusK = __fdividef(sqrtD, minusA * oneMinusToi);
+ oneMinusToi = __fdividef(oneMinusToi, 1 - minusK);
+
+ curX = curX + deltaX * oneMinusToi;
+ curY = curY + deltaY * oneMinusToi;
+ curZ = curZ + deltaZ * oneMinusToi;
+
+ curDot = curX * curAxisX + curY * curAxisY + curZ * curAxisZ;
+ curRadius = max(curDot * curSlope + mCurData.mConeRadius[j], 0.0f);
+ curSqrDistance = curX * curX + curY * curY + curZ * curZ - curDot * curDot;
+
+ curPos.x = mCurData.mConeCenterX[j] + curX;
+ curPos.y = mCurData.mConeCenterY[j] + curY;
+ curPos.z = mCurData.mConeCenterZ[j] + curZ;
+ }
+ }
+
+ // curPos inside cone (discrete collision)
+ bool hasContact = curRadius * curRadius > curSqrDistance;
+
+ Pointer<Shared, const uint32_t> mIt = mCapsuleMasks + 2 * j;
+ uint32_t bothMask = mIt[1];
+
+ uint32_t cullMask = bothMask & (hasCollision | hasContact) - 1;
+ shapeMask.mSpheres &= ~cullMask;
+
+ if(!hasContact)
+ continue;
+
+ float invDistance = curSqrDistance > 0.0f ? rsqrtf(curSqrDistance) : 0.0f;
+ float base = curDot + curSlope * curSqrDistance * invDistance;
+
+ float halfLength = mCurData.mConeHalfLength[j];
+ uint32_t leftMask = base < -halfLength;
+ uint32_t rightMask = base > halfLength;
+
+ // can only skip continuous sphere collision if post-ccd position
+ // is on code side *and* particle had cone-ccd collision.
+ uint32_t firstMask = mIt[0];
+ uint32_t secondMask = firstMask ^ bothMask;
+ cullMask = (firstMask & leftMask - 1) | (secondMask & rightMask - 1);
+ shapeMask.mSpheres &= ~cullMask | hasCollision - 1;
+
+ if(!leftMask && !rightMask)
+ {
+ float deltaX = curX - base * curAxisX;
+ float deltaY = curY - base * curAxisY;
+ float deltaZ = curZ - base * curAxisZ;
+
+ float sqrCosine = mCurData.mConeSqrCosine[j];
+ float scale = curRadius * invDistance * sqrCosine - sqrCosine;
+
+ delta.x = delta.x + deltaX * scale;
+ delta.y = delta.y + deltaY * scale;
+ delta.z = delta.z + deltaZ * scale;
+
+ if(frictionEnabled)
+ {
+ int32_t s0 = mCapsuleIndices[2 * j];
+ int32_t s1 = mCapsuleIndices[2 * j + 1];
+
+ // load previous sphere pos
+ float s0vx = mCurData.mSphereX[s0] - mPrevData.mSphereX[s0];
+ float s0vy = mCurData.mSphereY[s0] - mPrevData.mSphereY[s0];
+ float s0vz = mCurData.mSphereZ[s0] - mPrevData.mSphereZ[s0];
+
+ float s1vx = mCurData.mSphereX[s1] - mPrevData.mSphereX[s1];
+ float s1vy = mCurData.mSphereY[s1] - mPrevData.mSphereY[s1];
+ float s1vz = mCurData.mSphereZ[s1] - mPrevData.mSphereZ[s1];
+
+ // interpolate velocity between the two spheres
+ float t = curDot * 0.5f + 0.5f;
+
+ velocity.x += s0vx + t * (s1vx - s0vx);
+ velocity.y += s0vy + t * (s1vy - s0vy);
+ velocity.z += s0vz + t * (s1vz - s0vz);
+ }
+
+ ++numCollisions;
+ }
+ }
+
+ // sphere collision
+ for(; shapeMask.mSpheres; shapeMask.mSpheres &= shapeMask.mSpheres - 1)
+ {
+ int32_t j = __ffs(shapeMask.mSpheres) - 1;
+
+ float prevX = prevPos.x - mPrevData.mSphereX[j];
+ float prevY = prevPos.y - mPrevData.mSphereY[j];
+ float prevZ = prevPos.z - mPrevData.mSphereZ[j];
+ float prevRadius = mPrevData.mSphereW[j];
+
+ float curX = curPos.x - mCurData.mSphereX[j];
+ float curY = curPos.y - mCurData.mSphereY[j];
+ float curZ = curPos.z - mCurData.mSphereZ[j];
+ float curRadius = mCurData.mSphereW[j];
+
+ float sqrDistance = FLT_EPSILON + curX * curX + curY * curY + curZ * curZ;
+
+ float dotPrevPrev = prevX * prevX + prevY * prevY + prevZ * prevZ - prevRadius * prevRadius;
+ float dotPrevCur = prevX * curX + prevY * curY + prevZ * curZ - prevRadius * curRadius;
+ float dotCurCur = sqrDistance - curRadius * curRadius;
+
+ float discriminant = dotPrevCur * dotPrevCur - dotCurCur * dotPrevPrev;
+ float sqrtD = sqrtf(discriminant);
+ float halfB = dotPrevCur - dotPrevPrev;
+ float minusA = dotPrevCur - dotCurCur + halfB;
+
+ // time of impact or 0 if prevPos inside sphere
+ float toi = __fdividef(min(0.0f, halfB + sqrtD), minusA);
+ bool hasCollision = toi < 1.0f && halfB < sqrtD;
+
+ // skip continuous collision if the (un-clamped) particle
+ // trajectory only touches the outer skin of the cone.
+ float rMin = prevRadius + halfB * minusA * (curRadius - prevRadius);
+ hasCollision = hasCollision && (discriminant > minusA * rMin * rMin * gSkeletonWidth);
+
+ // a is negative when one cone is contained in the other,
+ // which is already handled by discrete collision.
+ hasCollision = hasCollision && minusA < -FLT_EPSILON;
+
+ if(hasCollision)
+ {
+ float deltaX = prevX - curX;
+ float deltaY = prevY - curY;
+ float deltaZ = prevZ - curZ;
+
+ float oneMinusToi = 1.0f - toi;
+
+ // reduce ccd impulse if (clamped) particle trajectory stays in cone skin,
+ // i.e. scale by exp2(-k) or 1/(1+k) with k = (tmin - toi) / (1 - toi)
+ float minusK = __fdividef(sqrtD, minusA * oneMinusToi);
+ oneMinusToi = __fdividef(oneMinusToi, 1 - minusK);
+
+ curX = curX + deltaX * oneMinusToi;
+ curY = curY + deltaY * oneMinusToi;
+ curZ = curZ + deltaZ * oneMinusToi;
+
+ curPos.x = mCurData.mSphereX[j] + curX;
+ curPos.y = mCurData.mSphereY[j] + curY;
+ curPos.z = mCurData.mSphereZ[j] + curZ;
+
+ sqrDistance = FLT_EPSILON + curX * curX + curY * curY + curZ * curZ;
+ }
+
+ float relDistance = rsqrtf(sqrDistance) * curRadius;
+
+ if(relDistance > 1.0f)
+ {
+ float scale = relDistance - 1.0f;
+
+ delta.x = delta.x + curX * scale;
+ delta.y = delta.y + curY * scale;
+ delta.z = delta.z + curZ * scale;
+
+ if(frictionEnabled)
+ {
+ velocity.x += mCurData.mSphereX[j] - mPrevData.mSphereX[j];
+ velocity.y += mCurData.mSphereY[j] - mPrevData.mSphereY[j];
+ velocity.z += mCurData.mSphereZ[j] - mPrevData.mSphereZ[j];
+ }
+
+ ++numCollisions;
+ }
+ }
+
+ return numCollisions;
+}
+
+namespace
+{
+// Coulomb-style friction response: given the accumulated collision impulse
+// (positional correction) for a particle, computes an impulse opposing the
+// particle's motion tangential to the contact relative to the colliding
+// shape(s).
+// prevPos/curPos: particle position before/after this iteration
+// shapeVelocity: accumulated per-frame shape translation (averaged via 'scale')
+// scale: 1/numCollisions, averages accumulated shape velocity and impulse
+// collisionImpulse: accumulated positional correction; its direction serves as
+// the contact normal and its magnitude limits the friction impulse
+// Returns the tangential impulse; callers subtract it from the previous
+// position, which effectively adds a velocity along the friction direction.
+template <typename PrevPos, typename CurPos>
+__device__ inline float3 calcFrictionImpulse(const PrevPos& prevPos, const CurPos& curPos, const float3& shapeVelocity,
+                                             float scale, const float3& collisionImpulse)
+{
+    const float frictionScale = gClothData.mFrictionScale;
+
+    // calculate collision normal
+    float deltaSq = collisionImpulse.x * collisionImpulse.x + collisionImpulse.y * collisionImpulse.y +
+                    collisionImpulse.z * collisionImpulse.z;
+
+    // FLT_EPSILON guards against 0/0 when the accumulated impulse is zero
+    float rcpDelta = rsqrtf(deltaSq + FLT_EPSILON);
+
+    float nx = collisionImpulse.x * rcpDelta;
+    float ny = collisionImpulse.y * rcpDelta;
+    float nz = collisionImpulse.z * rcpDelta;
+
+    // calculate relative velocity scaled by number of collision
+    float rvx = curPos.x - prevPos.x - shapeVelocity.x * scale;
+    float rvy = curPos.y - prevPos.y - shapeVelocity.y * scale;
+    float rvz = curPos.z - prevPos.z - shapeVelocity.z * scale;
+
+    // calculate magnitude of relative normal velocity
+    float rvn = rvx * nx + rvy * ny + rvz * nz;
+
+    // calculate relative tangential velocity
+    float rvtx = rvx - rvn * nx;
+    float rvty = rvy - rvn * ny;
+    float rvtz = rvz - rvn * nz;
+
+    // calculate magnitude of vt
+    float rcpVt = rsqrtf(rvtx * rvtx + rvty * rvty + rvtz * rvtz + FLT_EPSILON);
+
+    // magnitude of friction impulse (cannot be larger than -|vt|)
+    // note deltaSq * rcpDelta == |collisionImpulse|
+    float j = max(-frictionScale * deltaSq * rcpDelta * scale * rcpVt, -1.0f);
+
+    return make_float3(rvtx * j, rvty * j, rvtz * j);
+}
+}
+
+// Discrete capsule collision for regular particles. Block-stride loop: each
+// thread handles particles threadIdx.x, threadIdx.x + blockDim.x, ...
+// The per-particle collideCapsules() overload accumulates a positional
+// correction ('delta') and the colliding shapes' velocity; this wrapper
+// averages them, applies optional friction to the previous position, and
+// applies optional collision mass scaling to particle component 3
+// (presumably the inverse mass -- confirm against the solver).
+template <typename CurrentT, typename PreviousT>
+__device__ void CuCollision::collideCapsules(CurrentT& current, PreviousT& previous) const
+{
+    bool frictionEnabled = gClothData.mFrictionScale > 0.0f;
+    bool massScaleEnabled = gClothData.mCollisionMassScale > 0.0f;
+
+    for(int32_t i = threadIdx.x; i < gClothData.mNumParticles; i += blockDim.x)
+    {
+        typename CurrentT::VectorType curPos = current(i);
+
+        float3 delta, velocity;
+        if(int32_t numCollisions = collideCapsules(curPos, delta, velocity))
+        {
+            // average accumulated correction/velocity over collision count
+            float scale = __fdividef(1.0f, numCollisions);
+
+            if(frictionEnabled)
+            {
+                typename PreviousT::VectorType prevPos = previous(i);
+                float3 frictionImpulse = calcFrictionImpulse(prevPos, curPos, velocity, scale, delta);
+
+                // moving the previous position changes the particle's velocity
+                prevPos.x -= frictionImpulse.x;
+                prevPos.y -= frictionImpulse.y;
+                prevPos.z -= frictionImpulse.z;
+
+                previous(i) = prevPos;
+            }
+
+            curPos.x += delta.x * scale;
+            curPos.y += delta.y * scale;
+            curPos.z += delta.z * scale;
+
+            current(i) = curPos;
+
+            if(massScaleEnabled)
+            {
+                // larger corrections yield a larger mass scale
+                float deltaLengthSq = delta.x * delta.x + delta.y * delta.y + delta.z * delta.z;
+                float massScale = 1.0f + gClothData.mCollisionMassScale * deltaLengthSq;
+                current(i, 3) = __fdividef(current(i, 3), massScale);
+            }
+        }
+    }
+}
+
+namespace
+{
+// Interpolates one coordinate of a virtual particle from its three anchor
+// particles: weighted sum of pos[indices.x/y/z] using weights.x/y/z.
+template <typename PointerT>
+__device__ float lerp(PointerT pos, const int4& indices, const float4& weights)
+{
+    return pos[indices.x] * weights.x + pos[indices.y] * weights.y + pos[indices.z] * weights.z;
+}
+
+// Distributes a scalar correction back onto the three anchor particles,
+// weighted by the same weights used in lerp().
+template <typename PointerT>
+__device__ void apply(PointerT pos, const int4& indices, const float4& weights, float delta)
+{
+    pos[indices.x] += delta * weights.x;
+    pos[indices.y] += delta * weights.y;
+    pos[indices.z] += delta * weights.z;
+}
+}
+
+// Collides virtual particles (sample points interpolated from three regular
+// particles) against capsules and distributes the corrections back to the
+// anchoring particles. Early-outs if the cloth has no virtual particles.
+template <typename CurrentT, typename PreviousT>
+__device__ void CuCollision::collideVirtualCapsules(CurrentT& current, PreviousT& previous) const
+{
+    const uint32_t* __restrict setSizeIt = gClothData.mVirtualParticleSetSizesBegin;
+
+    if(!setSizeIt)
+        return;
+
+    if(gClothData.mEnableContinuousCollision)
+    {
+        // copied from mergeAcceleration
+        // NOTE(review): merges two halves of the shape grid masks so the ccd
+        // tests below see the combined masks; the 'threadIdx.x * 43 & 1024'
+        // predicate selects the same thread subset as mergeAcceleration --
+        // confirm against that kernel before changing.
+        Pointer<Shared, uint32_t> dst = mShapeGrid + threadIdx.x;
+        if(!(threadIdx.x * 43 & 1024) && threadIdx.x < sGridSize * 12)
+            *dst &= dst[sGridSize * 3];
+        __syncthreads(); // mShapeGrid raw hazard
+    }
+
+    const uint32_t* __restrict setSizeEnd = gClothData.mVirtualParticleSetSizesEnd;
+    const uint16_t* __restrict indicesEnd = gClothData.mVirtualParticleIndices;
+    const float4* __restrict weightsIt = reinterpret_cast<const float4*>(gClothData.mVirtualParticleWeights);
+
+    bool frictionEnabled = gClothData.mFrictionScale > 0.0f;
+    bool massScaleEnabled = gClothData.mCollisionMassScale > 0.0f;
+
+    // one set at a time, with a barrier between sets; presumably sets group
+    // virtual particles so that no two entries of one set share an anchor
+    // (which would make the unsynchronized += below race) -- confirm against
+    // the set construction code.
+    for(; setSizeIt != setSizeEnd; ++setSizeIt)
+    {
+        __syncthreads();
+
+        // 4 uint16 per virtual particle: 3 anchor particle indices plus an
+        // index into the weights table
+        const uint16_t* __restrict indicesIt = indicesEnd + threadIdx.x * 4;
+        for(indicesEnd += *setSizeIt * 4; indicesIt < indicesEnd; indicesIt += blockDim.x * 4)
+        {
+            int4 indices = make_int4(indicesIt[0], indicesIt[1], indicesIt[2], indicesIt[3]);
+
+            float4 weights = weightsIt[indices.w];
+
+            // interpolate the virtual particle position from its anchors
+            float3 curPos;
+            curPos.x = lerp(current[0], indices, weights);
+            curPos.y = lerp(current[1], indices, weights);
+            curPos.z = lerp(current[2], indices, weights);
+
+            float3 delta, velocity;
+            if(int32_t numCollisions = collideCapsules(curPos, delta, velocity))
+            {
+                float scale = __fdividef(1.0f, numCollisions);
+                // weights.w rescales the correction so the *interpolated*
+                // position moves by the requested amount
+                float wscale = weights.w * scale;
+
+                apply(current[0], indices, weights, delta.x * wscale);
+                apply(current[1], indices, weights, delta.y * wscale);
+                apply(current[2], indices, weights, delta.z * wscale);
+
+                if(frictionEnabled)
+                {
+                    float3 prevPos;
+                    prevPos.x = lerp(previous[0], indices, weights);
+                    prevPos.y = lerp(previous[1], indices, weights);
+                    prevPos.z = lerp(previous[2], indices, weights);
+
+                    float3 frictionImpulse = calcFrictionImpulse(prevPos, curPos, velocity, scale, delta);
+
+                    apply(previous[0], indices, weights, frictionImpulse.x * -weights.w);
+                    apply(previous[1], indices, weights, frictionImpulse.y * -weights.w);
+                    apply(previous[2], indices, weights, frictionImpulse.z * -weights.w);
+                }
+
+                if(massScaleEnabled)
+                {
+                    float deltaLengthSq = (delta.x * delta.x + delta.y * delta.y + delta.z * delta.z) * scale * scale;
+                    float invMassScale = __fdividef(1.0f, 1.0f + gClothData.mCollisionMassScale * deltaLengthSq);
+
+                    // not multiplying by weights[3] here because unlike applying velocity
+                    // deltas where we want the interpolated position to obtain a particular
+                    // value, we instead just require that the total change is equal to invMassScale
+                    invMassScale = invMassScale - 1.0f;
+                    current(indices.x, 3) *= 1.0f + weights.x * invMassScale;
+                    current(indices.y, 3) *= 1.0f + weights.y * invMassScale;
+                    current(indices.z, 3) *= 1.0f + weights.z * invMassScale;
+                }
+            }
+        }
+    }
+}
+
+// Continuous (ccd) capsule collision: like the discrete collideCapsules
+// wrapper, but the per-particle overload also receives the previous position
+// so it can sweep the particle trajectory against the moving capsules.
+template <typename CurrentT, typename PreviousT>
+__device__ void CuCollision::collideContinuousCapsules(CurrentT& current, PreviousT& previous) const
+{
+    bool frictionEnabled = gClothData.mFrictionScale > 0.0f;
+    bool massScaleEnabled = gClothData.mCollisionMassScale > 0.0f;
+
+    for(int32_t i = threadIdx.x; i < gClothData.mNumParticles; i += blockDim.x)
+    {
+        typename PreviousT::VectorType prevPos = previous(i);
+        typename CurrentT::VectorType curPos = current(i);
+
+        float3 delta, velocity;
+        if(int32_t numCollisions = collideCapsules(prevPos, curPos, delta, velocity))
+        {
+            // average accumulated correction/velocity over collision count
+            float scale = __fdividef(1.0f, numCollisions);
+
+            if(frictionEnabled)
+            {
+                float3 frictionImpulse = calcFrictionImpulse(prevPos, curPos, velocity, scale, delta);
+
+                // moving the previous position changes the particle's velocity
+                prevPos.x -= frictionImpulse.x;
+                prevPos.y -= frictionImpulse.y;
+                prevPos.z -= frictionImpulse.z;
+
+                previous(i) = prevPos;
+            }
+
+            curPos.x += delta.x * scale;
+            curPos.y += delta.y * scale;
+            curPos.z += delta.z * scale;
+
+            current(i) = curPos;
+
+            if(massScaleEnabled)
+            {
+                // component 3 scaled down in proportion to the squared
+                // correction (presumably inverse mass -- confirm in solver)
+                float deltaLengthSq = delta.x * delta.x + delta.y * delta.y + delta.z * delta.z;
+                float massScale = 1.0f + gClothData.mCollisionMassScale * deltaLengthSq;
+                current(i, 3) = __fdividef(current(i, 3), massScale);
+            }
+        }
+    }
+}
+
+// Tests one particle against every convex collision volume.
+// Plane equations (signed distance = w + dot((x,y,z), p)) have been
+// interpolated and transposed into shared memory by the kernel-level
+// overload, reusing mCurData.mSphereX as scratch: four consecutive arrays of
+// mNumPlanes floats (x, y, z, w components).
+// Each convex is a bitmask over those planes; the particle is inside the
+// convex iff its signed distance to every masked plane is negative. The scan
+// tracks the maximum (least negative) distance -- the nearest face -- and
+// exits early once any distance is non-negative.
+// Accumulates the separating translation into 'delta' and returns the number
+// of convexes penetrated.
+template <typename CurPos>
+__device__ int32_t CuCollision::collideConvexes(const CurPos& positions, float3& delta) const
+{
+    delta.x = delta.y = delta.z = 0.0f;
+
+    Pointer<Shared, const float> planeX = mCurData.mSphereX;
+    Pointer<Shared, const float> planeY = planeX + gClothData.mNumPlanes;
+    Pointer<Shared, const float> planeZ = planeY + gClothData.mNumPlanes;
+    Pointer<Shared, const float> planeW = planeZ + gClothData.mNumPlanes;
+
+    int32_t numCollisions = 0;
+    Pointer<Shared, const uint32_t> cIt = mConvexMasks;
+    Pointer<Shared, const uint32_t> cEnd = cIt + gClothData.mNumConvexes;
+    for(; cIt != cEnd; ++cIt)
+    {
+        uint32_t mask = *cIt;
+
+        // start with the first plane of the mask
+        int32_t maxIndex = __ffs(mask) - 1;
+        float maxDist = planeW[maxIndex] + positions.z * planeZ[maxIndex] + positions.y * planeY[maxIndex] +
+                        positions.x * planeX[maxIndex];
+
+        // scan remaining planes while still inside (mask &= mask - 1 clears
+        // the lowest set bit each iteration)
+        while((maxDist < 0.0f) && (mask &= mask - 1))
+        {
+            int32_t i = __ffs(mask) - 1;
+            float dist = planeW[i] + positions.z * planeZ[i] + positions.y * planeY[i] + positions.x * planeX[i];
+            if(dist > maxDist)
+                maxDist = dist, maxIndex = i;
+        }
+
+        if(maxDist < 0.0f)
+        {
+            // inside: push out along the nearest face's normal
+            delta.x -= planeX[maxIndex] * maxDist;
+            delta.y -= planeY[maxIndex] * maxDist;
+            delta.z -= planeZ[maxIndex] * maxDist;
+
+            ++numCollisions;
+        }
+    }
+
+    return numCollisions;
+}
+
+// Kernel-level convex collision: interpolates the frame's collision planes
+// into shared memory, then resolves particle-vs-convex contacts.
+template <typename CurrentT, typename PreviousT>
+__device__ void CuCollision::collideConvexes(CurrentT& current, PreviousT& previous, float alpha)
+{
+    if(!gClothData.mNumConvexes)
+        return;
+
+    // interpolate planes and transpose
+    // input is interleaved xyzw per plane; 'j' scatters component
+    // (threadIdx.x % 4) of plane (threadIdx.x / 4) into structure-of-arrays
+    // layout, reusing mCurData.mSphereX as scratch storage
+    if(threadIdx.x < gClothData.mNumPlanes * 4)
+    {
+        float start = gFrameData.mStartCollisionPlanes[threadIdx.x];
+        float target = gFrameData.mTargetCollisionPlanes[threadIdx.x];
+        int32_t j = threadIdx.x % 4 * gClothData.mNumPlanes + threadIdx.x / 4;
+        mCurData.mSphereX[j] = start + (target - start) * alpha;
+    }
+
+    __syncthreads();
+
+    bool frictionEnabled = gClothData.mFrictionScale > 0.0f;
+
+    for(int32_t i = threadIdx.x; i < gClothData.mNumParticles; i += blockDim.x)
+    {
+        typename CurrentT::VectorType curPos = current(i);
+
+        float3 delta;
+        if(int32_t numCollisions = collideConvexes(curPos, delta))
+        {
+            float scale = __fdividef(1.0f, numCollisions);
+
+            if(frictionEnabled)
+            {
+                typename PreviousT::VectorType prevPos = previous(i);
+
+                // planes are treated as static within the iteration, hence
+                // the zero shape velocity
+                float3 frictionImpulse =
+                    calcFrictionImpulse(prevPos, curPos, make_float3(0.0f, 0.0f, 0.0f), scale, delta);
+
+                prevPos.x -= frictionImpulse.x;
+                prevPos.y -= frictionImpulse.y;
+                prevPos.z -= frictionImpulse.z;
+
+                previous(i) = prevPos;
+            }
+
+            curPos.x += delta.x * scale;
+            curPos.y += delta.y * scale;
+            curPos.z += delta.z * scale;
+
+            current(i) = curPos;
+        }
+    }
+
+    __syncthreads(); // scratch (mCurData.mSphereX) may be reused afterwards
+}
+
+namespace
+{
+// Precomputed per-triangle quantities for the closest-point-on-triangle
+// queries in CuCollision::collideTriangles. The first 9 floats are filled
+// with the three vertex positions by the caller; initialize() then converts
+// them in place to base vertex + two edge vectors and derives the rest.
+struct TriangleData
+{
+    float baseX, baseY, baseZ;
+    float edge0X, edge0Y, edge0Z;
+    float edge1X, edge1Y, edge1Z;
+    float normalX, normalY, normalZ;
+
+    float edge0DotEdge1;
+    float edge0SqrLength;
+    float edge1SqrLength;
+
+    float det;   // reciprocal of the Gram determinant of (edge0, edge1)
+    float denom; // reciprocal of |edge0 - edge1|^2, used on the s+t=1 edge
+
+    float edge0InvSqrLength;
+    float edge1InvSqrLength;
+
+    // initialize struct after vertices have been stored in first 9 members
+    __device__ void initialize()
+    {
+        // turn vertices 1 and 2 into edges relative to vertex 0
+        edge0X -= baseX, edge0Y -= baseY, edge0Z -= baseZ;
+        edge1X -= baseX, edge1Y -= baseY, edge1Z -= baseZ;
+
+        // unit normal = normalize(cross(edge0, edge1))
+        normalX = edge0Y * edge1Z - edge0Z * edge1Y;
+        normalY = edge0Z * edge1X - edge0X * edge1Z;
+        normalZ = edge0X * edge1Y - edge0Y * edge1X;
+
+        float normalInvLength = rsqrtf(normalX * normalX + normalY * normalY + normalZ * normalZ);
+        normalX *= normalInvLength;
+        normalY *= normalInvLength;
+        normalZ *= normalInvLength;
+
+        edge0DotEdge1 = edge0X * edge1X + edge0Y * edge1Y + edge0Z * edge1Z;
+        edge0SqrLength = edge0X * edge0X + edge0Y * edge0Y + edge0Z * edge0Z;
+        edge1SqrLength = edge1X * edge1X + edge1Y * edge1Y + edge1Z * edge1Z;
+
+        det = __fdividef(1.0f, edge0SqrLength * edge1SqrLength - edge0DotEdge1 * edge0DotEdge1);
+        denom = __fdividef(1.0f, edge0SqrLength + edge1SqrLength - edge0DotEdge1 - edge0DotEdge1);
+
+        edge0InvSqrLength = __fdividef(1.0f, edge0SqrLength);
+        edge1InvSqrLength = __fdividef(1.0f, edge1SqrLength);
+    }
+};
+}
+
+// Closest-point-on-triangle collision for a single particle i: finds the
+// collision triangle nearest to the particle, then projects the particle
+// onto that triangle's plane along the normal if it lies on the back side.
+template <typename CurrentT>
+__device__ void CuCollision::collideTriangles(CurrentT& current, int32_t i)
+{
+    float posX = current(i, 0);
+    float posY = current(i, 1);
+    float posZ = current(i, 2);
+
+    // triangle data was staged in shared memory by the kernel-level overload
+    const TriangleData* __restrict tIt = reinterpret_cast<const TriangleData*>(generic(mCurData.mSphereX));
+    const TriangleData* __restrict tEnd = tIt + gClothData.mNumCollisionTriangles;
+
+    // normalX/Y/Z are only read when some triangle has assigned them together
+    // with a negative normalD below
+    float normalX, normalY, normalZ, normalD = 0.0f;
+    float minSqrLength = FLT_MAX;
+
+    for(; tIt != tEnd; ++tIt)
+    {
+        float dx = posX - tIt->baseX;
+        float dy = posY - tIt->baseY;
+        float dz = posZ - tIt->baseZ;
+
+        float deltaDotEdge0 = dx * tIt->edge0X + dy * tIt->edge0Y + dz * tIt->edge0Z;
+        float deltaDotEdge1 = dx * tIt->edge1X + dy * tIt->edge1Y + dz * tIt->edge1Z;
+        float deltaDotNormal = dx * tIt->normalX + dy * tIt->normalY + dz * tIt->normalZ;
+
+        // unnormalized barycentric coordinates of the projected point
+        float s = tIt->edge1SqrLength * deltaDotEdge0 - tIt->edge0DotEdge1 * deltaDotEdge1;
+        float t = tIt->edge0SqrLength * deltaDotEdge1 - tIt->edge0DotEdge1 * deltaDotEdge0;
+
+        // region selection; the second line deliberately tests the s already
+        // updated by the first -- statement order matters here
+        s = t > 0.0f ? s * tIt->det : deltaDotEdge0 * tIt->edge0InvSqrLength;
+        t = s > 0.0f ? t * tIt->det : deltaDotEdge1 * tIt->edge1InvSqrLength;
+
+        if(s + t > 1.0f)
+        {
+            // project onto the s+t=1 edge
+            s = (tIt->edge1SqrLength - tIt->edge0DotEdge1 + deltaDotEdge0 - deltaDotEdge1) * tIt->denom;
+        }
+
+        s = fmaxf(0.0f, fminf(1.0f, s));
+        t = fmaxf(0.0f, fminf(1.0f - s, t));
+
+        // vector from the closest point on the triangle to the particle
+        dx = dx - tIt->edge0X * s - tIt->edge1X * t;
+        dy = dy - tIt->edge0Y * s - tIt->edge1Y * t;
+        dz = dz - tIt->edge0Z * s - tIt->edge1Z * t;
+
+        float sqrLength = dx * dx + dy * dy + dz * dz;
+
+        // slightly penalize back-facing candidates so the front side wins ties
+        if(0.0f > deltaDotNormal)
+            sqrLength *= 1.0001f;
+
+        if(sqrLength < minSqrLength)
+        {
+            normalX = tIt->normalX;
+            normalY = tIt->normalY;
+            normalZ = tIt->normalZ;
+            normalD = deltaDotNormal;
+            minSqrLength = sqrLength;
+        }
+    }
+
+    // particle behind the nearest triangle: push it back onto the plane
+    if(normalD < 0.0f)
+    {
+        current(i, 0) = posX - normalX * normalD;
+        current(i, 1) = posY - normalY * normalD;
+        current(i, 2) = posZ - normalZ * normalD;
+    }
+}
+
+namespace
+{
+// Number of derived floats in TriangleData beyond the 9 floats (3 vertices)
+// that are streamed in before TriangleData::initialize() runs; used to lay
+// out triangles with the proper stride in shared memory.
+static const int32_t sTrianglePadding = sizeof(TriangleData) / sizeof(float) - 9;
+}
+
+// Kernel-level triangle collision: lerps the frame's triangle vertices into
+// shared memory, derives the per-triangle data, then collides every particle.
+template <typename CurrentT>
+__device__ void CuCollision::collideTriangles(CurrentT& current, float alpha)
+{
+    if(!gClothData.mNumCollisionTriangles)
+        return;
+
+    // interpolate triangle vertices and store in shared memory
+    for(int32_t i = threadIdx.x, n = gClothData.mNumCollisionTriangles * 9; i < n; i += blockDim.x)
+    {
+        float start = gFrameData.mStartCollisionTriangles[i];
+        float target = gFrameData.mTargetCollisionTriangles[i];
+        int32_t idx = i * 7282 >> 16; // same as i/9
+        // leave room for the derived TriangleData members after each
+        // triangle's 9 vertex floats
+        int32_t offset = i + idx * sTrianglePadding;
+        mCurData.mSphereX[offset] = start + (target - start) * alpha;
+    }
+
+    __syncthreads();
+
+    // fill in the derived members (edges, normal, reciprocals) in place
+    for(int32_t i = threadIdx.x; i < gClothData.mNumCollisionTriangles; i += blockDim.x)
+    {
+        reinterpret_cast<TriangleData*>(generic(mCurData.mSphereX))[i].initialize();
+    }
+
+    __syncthreads();
+
+    for(int32_t i = threadIdx.x; i < gClothData.mNumParticles; i += blockDim.x)
+        collideTriangles(current, i);
+
+    __syncthreads();
+}
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/windows/CuContextLock.cpp b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuContextLock.cpp
new file mode 100644
index 00000000..2ccc3db9
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuContextLock.cpp
@@ -0,0 +1,54 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#include "cudamanager/PxCudaContextManager.h"
+#include "CuContextLock.h"
+#include "CuFactory.h"
+
+using namespace physx;
+
+// Acquires the factory's CUDA context for the lifetime of this object (RAII).
+cloth::CuContextLock::CuContextLock(const CuFactory& factory) : mFactory(factory)
+{
+    acquire();
+}
+
+// Releases the CUDA context acquired in the constructor.
+cloth::CuContextLock::~CuContextLock()
+{
+    release();
+}
+
+// Makes the factory's CUDA context current on the calling thread; may be
+// called to re-acquire after an explicit release().
+void cloth::CuContextLock::acquire()
+{
+    mFactory.mContextManager->acquireContext();
+}
+
+// Releases the factory's CUDA context; must be balanced by a later acquire()
+// before the destructor runs (the destructor releases unconditionally).
+void cloth::CuContextLock::release()
+{
+    mFactory.mContextManager->releaseContext();
+}
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/windows/CuContextLock.h b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuContextLock.h
new file mode 100644
index 00000000..50e48b49
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuContextLock.h
@@ -0,0 +1,57 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+namespace physx
+{
+
+namespace cloth
+{
+
+class CuFactory;
+
+// acquires cuda context for the lifetime of the instance
+// (RAII guard around the factory's PxCudaContextManager; acquire()/release()
+// allow temporarily dropping the context during the object's lifetime)
+class CuContextLock
+{
+  protected:
+    // copy operations declared but not defined: non-copyable (pre-C++11
+    // idiom; copying would unbalance acquire/release calls)
+    CuContextLock(const CuContextLock&);
+    CuContextLock& operator=(const CuContextLock&);
+
+  public:
+    CuContextLock(const CuFactory&);
+    ~CuContextLock();
+
+    void acquire();
+    void release();
+
+    const CuFactory& mFactory;
+};
+}
+}
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/windows/CuDevicePointer.h b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuDevicePointer.h
new file mode 100644
index 00000000..cb37b39d
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuDevicePointer.h
@@ -0,0 +1,216 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#include <cuda.h>
+#include "CuCheckSuccess.h"
+
+namespace physx
+{
+
+namespace cloth
+{
+
+// Minimal metafunction stripping top-level const from a type
+// (pre-C++11 stand-in for std::remove_const).
+template <typename T>
+struct RemoveConst
+{
+    typedef T Type;
+};
+template <typename T>
+struct RemoveConst<const T>
+{
+    typedef T Type;
+};
+
+template <typename>
+class CuDeviceReference; // forward declare
+
+// pointer to POD type in CUDA device memory
+// Typed wrapper around a raw device address that supports the usual pointer
+// arithmetic on the host side. Dereferencing yields a CuDeviceReference
+// proxy, which performs the actual device memory transfers.
+template <typename T>
+class CuDevicePointer
+{
+    template <typename>
+    friend class CuDevicePointer;
+
+    typedef typename RemoveConst<T>::Type ValueType;
+
+  public:
+    // c'tors
+    CuDevicePointer() : mPointer(0)
+    {
+    }
+    template <class U>
+    explicit CuDevicePointer(U* ptr)
+    : mPointer(ptr)
+    {
+    }
+    CuDevicePointer(const CuDevicePointer<ValueType>& ptr) : mPointer(ptr.get())
+    {
+    }
+
+    // conversion to a pointer of (convertible) element type U
+    template <typename U>
+    operator CuDevicePointer<U>(void) const
+    {
+        U* converted = static_cast<U*>(mPointer);
+        return CuDevicePointer<U>(converted);
+    }
+    // raw typed pointer (still a device address; do not dereference on host)
+    T* get() const
+    {
+        return mPointer;
+    }
+    // address in the form expected by the CUDA driver API
+    CUdeviceptr dev() const
+    {
+        return reinterpret_cast<CUdeviceptr>(mPointer);
+    }
+
+    // pointer arithmetic, mirroring raw pointer semantics
+    CuDevicePointer operator+(const ptrdiff_t& offset) const
+    {
+        CuDevicePointer result(*this);
+        result.mPointer += offset;
+        return result;
+    }
+    CuDevicePointer operator-(const ptrdiff_t& offset) const
+    {
+        CuDevicePointer result(*this);
+        result.mPointer -= offset;
+        return result;
+    }
+    CuDevicePointer& operator++(void)
+    {
+        mPointer += 1;
+        return *this;
+    }
+    CuDevicePointer operator++(int)
+    {
+        CuDevicePointer previous(*this);
+        mPointer += 1;
+        return previous;
+    }
+    CuDevicePointer& operator--(void)
+    {
+        mPointer -= 1;
+        return *this;
+    }
+    CuDevicePointer operator--(int)
+    {
+        CuDevicePointer previous(*this);
+        mPointer -= 1;
+        return previous;
+    }
+    CuDevicePointer& operator+=(ptrdiff_t offset)
+    {
+        mPointer = mPointer + offset;
+        return *this;
+    }
+    CuDevicePointer& operator-=(ptrdiff_t offset)
+    {
+        mPointer = mPointer - offset;
+        return *this;
+    }
+    // element distance between two device pointers
+    ptrdiff_t operator-(const CuDevicePointer& rhs) const
+    {
+        return mPointer - rhs.mPointer;
+    }
+
+    template <typename U>
+    bool operator==(const CuDevicePointer<U>& other) const
+    {
+        return mPointer == other.mPointer;
+    }
+    template <typename U>
+    bool operator!=(const CuDevicePointer<U>& other) const
+    {
+        return !(mPointer == other.mPointer);
+    }
+
+    // dereference: yields a proxy, not a T& (the element lives on the device)
+    CuDeviceReference<T> operator[](const ptrdiff_t&) const; // (implemented below)
+    CuDeviceReference<T> operator*(void) const
+    {
+        return (*this)[0];
+    }
+
+  private:
+    T* mPointer;
+};
+
+// Proxy for a T stored in CUDA device memory. Assignment writes through to
+// the device, conversion to ValueType reads back to the host; each access
+// issues its own synchronous driver-API copy, so this is a convenience for
+// scalar accesses rather than a bulk-transfer path.
+template <typename T>
+class CuDeviceReference
+{
+    template <typename>
+    friend class CuDeviceReference;
+    template <typename>
+    friend class CuDevicePointer;
+
+    typedef typename RemoveConst<T>::Type ValueType;
+
+    // private: only CuDevicePointer (a friend) creates references from
+    // device pointers
+    template <typename U>
+    CuDeviceReference(CuDevicePointer<U> pointer)
+    : mPointer(static_cast<T*>(pointer.get()))
+    {
+    }
+
+  public:
+    template <typename U>
+    CuDeviceReference(CuDeviceReference<U> reference)
+    : mPointer(static_cast<T*>(reference.mPointer))
+    {
+    }
+
+    // address-of recovers the typed device pointer
+    CuDevicePointer<T> operator&() const
+    {
+        return CuDevicePointer<T>(mPointer);
+    }
+
+    // host -> device write
+    CuDeviceReference& operator=(const T& v)
+    {
+        checkSuccess(cuMemcpyHtoD(CUdeviceptr(mPointer), &v, sizeof(T)));
+        return *this;
+    }
+    // device -> device copy (no host round trip)
+    CuDeviceReference& operator=(const CuDeviceReference& ref)
+    {
+        checkSuccess(cuMemcpyDtoD(CUdeviceptr(mPointer), CUdeviceptr(ref.mPointer), sizeof(T)));
+        return *this;
+    }
+    // device -> host read
+    operator ValueType() const
+    {
+        ValueType result;
+        checkSuccess(cuMemcpyDtoH(&result, CUdeviceptr(mPointer), sizeof(T)));
+        return result;
+    }
+
+  private:
+    T* mPointer;
+};
+}
+
+// Dereference at offset i: builds a proxy reference to device element i.
+template <typename T>
+cloth::CuDeviceReference<T> cloth::CuDevicePointer<T>::operator[](const ptrdiff_t& i) const
+{
+    return CuDeviceReference<T>(*this + i);
+}
+}
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/windows/CuDeviceVector.h b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuDeviceVector.h
new file mode 100644
index 00000000..e3997d26
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuDeviceVector.h
@@ -0,0 +1,258 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#include "foundation/PxMath.h" // for swap
+#include "cudamanager/PxCudaMemoryManager.h"
+#include "cudamanager/PxCudaContextManager.h"
+#include "CuDevicePointer.h"
+#include "PsArray.h"
+#include "PsUtilities.h"
+
+namespace physx
+{
+#if PX_VC
+#pragma warning(push)
+#pragma warning(disable : 4365) // 'action' : conversion from 'type_1' to 'type_2', signed/unsigned mismatch
+#endif
+
+namespace cloth
+{
+
+// STL-style vector that holds POD types in CUDA device memory. The interface
+// is not complete, add whatever you need from the std::vector interface.
+// std::vector-like container whose storage lives in CUDA device memory,
+// allocated through PxCudaMemoryManager. Only suitable for POD element types:
+// no constructors/destructors are run, growth and assignment use raw cuMemcpy.
+// Callers are responsible for holding the CUDA context when methods that touch
+// device memory are invoked.
+template <typename T>
+class CuDeviceVector
+{
+ public:
+ typedef CuDevicePointer<T> iterator;
+ typedef CuDevicePointer<const T> const_iterator;
+
+ // empty vector bound to the context's memory manager; device pointers rely
+ // on CuDevicePointer default-construction for their initial (null) value
+ CuDeviceVector(physx::PxCudaContextManager* ctx) : mManager(0)
+ {
+ PX_ASSERT(ctx);
+
+ if(ctx)
+ mManager = ctx->getMemoryManager();
+ }
+
+ // deep copy (device-to-device) sharing the source's memory manager
+ CuDeviceVector(const CuDeviceVector& other) : mManager(other.getMemoryManager())
+ {
+ PX_ASSERT(mManager);
+
+ operator=(other);
+ }
+
+ // construct from a host range [first, last); uploads via assign()
+ CuDeviceVector(physx::PxCudaContextManager* ctx, const T* first, const T* last) : mManager(0)
+ {
+ PX_ASSERT(ctx);
+
+ if(ctx)
+ {
+ mManager = ctx->getMemoryManager();
+ assign(first, last);
+ }
+ }
+
+ // NOTE(review): mManager is never initialized in this overload, yet
+ // operator= -> resize -> reserve dereferences it to allocate. Looks like a
+ // latent bug unless no caller ever uses this constructor — TODO confirm.
+ template <typename Alloc>
+ CuDeviceVector(const shdfnd::Array<T, Alloc>& other)
+ {
+ operator=(other);
+ }
+
+ // releases the device buffer; presumably mManager->free tolerates a null
+ // pointer for never-allocated vectors — verify against PxCudaMemoryManager
+ ~CuDeviceVector()
+ {
+ PX_ASSERT(mManager);
+
+ mManager->free(physx::PxCudaBufferMemorySpace::T_GPU, mFirst.dev());
+ }
+
+ // deep copy: resize to match, then device-to-device memcpy
+ CuDeviceVector& operator=(const CuDeviceVector& other)
+ {
+ resize(other.size());
+ checkSuccess(cuMemcpyDtoD(mFirst.dev(), other.mFirst.dev(), other.size() * sizeof(T)));
+ return *this;
+ }
+
+ // upload from a host-side shdfnd::Array
+ template <typename Alloc>
+ CuDeviceVector& operator=(const shdfnd::Array<T, Alloc>& other)
+ {
+ const T* first = other.empty() ? 0 : &other.front();
+ assign(first, first + other.size());
+ return *this;
+ }
+
+ bool empty() const
+ {
+ return mLast == mFirst;
+ }
+ size_t size() const
+ {
+ return size_t(mLast - mFirst);
+ }
+ size_t capacity() const
+ {
+ return mEnd - mFirst;
+ }
+
+ iterator begin()
+ {
+ return mFirst;
+ }
+ iterator end()
+ {
+ return mLast;
+ }
+ const_iterator begin() const
+ {
+ return mFirst;
+ }
+ const_iterator end() const
+ {
+ return mLast;
+ }
+
+ // append one element; the write goes through CuDeviceReference::operator=,
+ // i.e. one cuMemcpyHtoD per call — fine for rare use, slow in bulk
+ void push_back(const T& v)
+ {
+ if(mLast == mEnd)
+ reserve(PxMax<size_t>(1, capacity() * 2));
+
+ *mLast++ = v;
+ }
+
+ // bulk append of a host range in a single host-to-device copy
+ void push_back(const T* first, const T* last)
+ {
+ if(mEnd - mLast < last - first)
+ reserve(PxMax<size_t>(2 * capacity(), mLast - mFirst + last - first));
+
+ if(first != last)
+ checkSuccess(cuMemcpyHtoD(mLast.dev(), first, sizeof(T) * (last - first)));
+
+ mLast += last - first;
+ }
+
+ // remove element at 'it', shifting the tail down by one. Uses a temporary
+ // device buffer because the source/destination ranges of the shift overlap
+ // and cuMemcpyDtoD does not support overlapping copies.
+ void erase(iterator it)
+ {
+ size_t byteSize = (mLast - it - 1) * sizeof(T);
+ if(byteSize)
+ {
+ CUdeviceptr tmp = 0, dst = it.dev();
+
+ PX_ASSERT(mManager);
+
+ tmp = mManager->alloc(physx::PxCudaBufferMemorySpace::T_GPU, byteSize,
+ PX_ALLOC_INFO("cloth::CuDeviceVector::T_GPU", CLOTH));
+ checkSuccess(cuMemcpyDtoD(tmp, dst + sizeof(T), byteSize));
+ checkSuccess(cuMemcpyDtoD(dst, tmp, byteSize));
+ mManager->free(physx::PxCudaBufferMemorySpace::T_GPU, tmp);
+ }
+ --mLast;
+ }
+
+ // grow capacity to at least n; copies existing contents to the new buffer.
+ // Note: mLast is recomputed from size() before mFirst is overwritten, so
+ // the statement order below matters.
+ void reserve(size_t n)
+ {
+ if(n <= capacity())
+ return;
+
+ CUdeviceptr newFirst = 0, oldFirst = mFirst.dev();
+
+ PX_ASSERT(mManager);
+
+ newFirst = mManager->alloc(physx::PxCudaBufferMemorySpace::T_GPU, sizeof(T) * n,
+ PX_ALLOC_INFO("cloth::CuDeviceVector::T_GPU", CLOTH));
+ checkSuccess(cuMemcpyDtoD(newFirst, oldFirst, sizeof(T) * size()));
+ mManager->free(physx::PxCudaBufferMemorySpace::T_GPU, oldFirst);
+
+ iterator first(reinterpret_cast<T*>(newFirst));
+ mEnd = first + n;
+ mLast = first + size();
+ mFirst = first;
+ }
+
+ // change logical size; new elements (if any) are left uninitialized
+ void resize(size_t n)
+ {
+ if(capacity() < n)
+ reserve(PxMax(n, capacity() * 2));
+
+ mLast = mFirst + n;
+ }
+
+ // replace contents with host range [first, last) in one upload
+ void assign(const T* first, const T* last)
+ {
+ size_t n = last - first;
+ resize(n);
+ checkSuccess(cuMemcpyHtoD(mFirst.dev(), first, n * sizeof(T)));
+ }
+
+ // O(1) pointer swap; note mManager is deliberately NOT swapped — both
+ // vectors are presumably bound to the same context's manager (TODO confirm)
+ void swap(CuDeviceVector& other)
+ {
+ shdfnd::swap(mFirst, other.mFirst);
+ shdfnd::swap(mLast, other.mLast);
+ shdfnd::swap(mEnd, other.mEnd);
+ }
+
+ // match PxArray interface
+ void remove(size_t i)
+ {
+ erase(begin() + i);
+ }
+ void pushBack(const T& v)
+ {
+ push_back(v);
+ }
+
+ physx::PxCudaMemoryManager* getMemoryManager() const
+ {
+ return mManager;
+ }
+
+ private:
+ iterator mFirst, mLast, mEnd; // [mFirst, mLast) live, [mFirst, mEnd) allocated
+ physx::PxCudaMemoryManager* mManager;
+};
+
+} // namespace cloth
+} // namespace physx
+
+#if PX_VC
+#pragma warning(pop)
+#endif
+
+namespace physx
+{
+namespace shdfnd
+{
+// shdfnd::swap overload so generic code swapping CuDeviceVectors performs an
+// O(1) pointer swap instead of three deep device copies.
+template <typename T>
+void swap(physx::cloth::CuDeviceVector<T>& first, physx::cloth::CuDeviceVector<T>& second)
+{
+ first.swap(second);
+}
+}
+}
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/windows/CuFabric.cpp b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuFabric.cpp
new file mode 100644
index 00000000..7f8326fe
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuFabric.cpp
@@ -0,0 +1,197 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#include "CuFabric.h"
+#include "CuContextLock.h"
+#include "CuFactory.h"
+
+#if PX_VC
+#pragma warning(disable : 4365) // 'action' : conversion from 'type_1' to 'type_2', signed/unsigned mismatch
+#endif
+
+using namespace physx;
+using namespace shdfnd;
+
+// Pack a tether record: particle anchor index plus length quantized to 16 bits
+// (scale factor lives in CuFabric::mTetherLengthScale).
+cloth::CuTether::CuTether(uint16_t anchor, uint16_t length) : mAnchor(anchor), mLength(length)
+{
+}
+
+// Build a device-resident fabric: uploads constraint phases/sets/restvalues,
+// narrows 32-bit particle indices to 16 bits, quantizes tether lengths, and
+// registers the fabric with its factory. The CuContextLock base acquires the
+// CUDA context for the duration of the uploads; it is released explicitly at
+// the end of the body.
+cloth::CuFabric::CuFabric(CuFactory& factory, uint32_t numParticles, Range<const uint32_t> phases,
+ Range<const uint32_t> sets, Range<const float> restvalues, Range<const uint32_t> indices,
+ Range<const uint32_t> anchors, Range<const float> tetherLengths,
+ Range<const uint32_t> triangles, uint32_t id)
+: CuContextLock(factory)
+, mFactory(factory)
+, mNumParticles(numParticles)
+, mPhases(mFactory.mContextManager, phases.begin(), phases.end())
+, mSets(mFactory.mContextManager)
+, mRestvalues(mFactory.mContextManager, restvalues.begin(), restvalues.end())
+, mIndices(mFactory.mContextManager)
+, mTethers(mFactory.mContextManager)
+, mTriangles(mFactory.mContextManager)
+, mId(id)
+{
+ // should no longer be prefixed with 0
+ PX_ASSERT(sets.front() != 0);
+
+ PX_ASSERT(sets.back() == restvalues.size());
+ PX_ASSERT(restvalues.size() * 2 == indices.size());
+ PX_ASSERT(mNumParticles > *maxElement(indices.begin(), indices.end()));
+
+ // copy to device, add leading zero
+ mSets.reserve(sets.size() + 1);
+ mSets.push_back(0);
+ mSets.push_back(sets.begin(), sets.end());
+
+ // manually convert uint32_t indices to uint16_t in temp memory
+ // (asserted above that all indices fit: index < mNumParticles <= 64k)
+ Vector<uint16_t>::Type hostIndices;
+ hostIndices.resizeUninitialized(indices.size());
+ Vector<uint16_t>::Type::Iterator dIt = hostIndices.begin();
+
+ const uint32_t* it = indices.begin();
+ const uint32_t* end = indices.end();
+ for(; it != end; ++it, ++dIt)
+ *dIt = uint16_t(*it);
+
+ // copy to device vector in one go
+ mIndices.assign(hostIndices.begin(), hostIndices.end());
+
+ // gather data per phase: constraint count plus device pointers into the
+ // restvalue/index buffers (2 indices per constraint, hence the factor 2)
+ mNumConstraintsInPhase.reserve(phases.size());
+ CuDevicePointer<const float> devRestvalues = mRestvalues.begin();
+ CuDevicePointer<const uint16_t> devIndices = mIndices.begin();
+ for(const uint32_t* pIt = phases.begin(); pIt != phases.end(); ++pIt)
+ {
+ uint32_t setIndex = *pIt;
+ uint32_t firstIndex = setIndex ? sets[setIndex - 1] : 0;
+ uint32_t lastIndex = sets[setIndex];
+ mNumConstraintsInPhase.pushBack(lastIndex - firstIndex);
+ mRestvaluesInPhase.pushBack(devRestvalues + firstIndex);
+ mIndicesInPhase.pushBack(devIndices + 2 * firstIndex);
+ }
+
+ // tethers: quantize lengths to uint16_t, storing the dequantization scale
+ // in mTetherLengthScale; FLT_EPSILON guards against division by zero
+ PX_ASSERT(anchors.size() == tetherLengths.size());
+ mTetherLengthScale =
+ tetherLengths.empty() ? 1.0f : *maxElement(tetherLengths.begin(), tetherLengths.end()) / USHRT_MAX;
+ float inverseScale = 1 / (mTetherLengthScale + FLT_EPSILON);
+ Vector<CuTether>::Type tethers;
+ tethers.reserve(anchors.size());
+ for(; !anchors.empty(); anchors.popFront(), tetherLengths.popFront())
+ {
+ tethers.pushBack(CuTether(uint16_t(anchors.front()), uint16_t(tetherLengths.front() * inverseScale + 0.5f)));
+ }
+ mTethers.assign(tethers.begin(), tethers.end());
+
+ // triangles: same 32-bit to 16-bit narrowing as the constraint indices
+ hostIndices.resizeUninitialized(triangles.size());
+ dIt = hostIndices.begin();
+
+ it = triangles.begin();
+ end = triangles.end();
+ for(; it != end; ++it, ++dIt)
+ *dIt = uint16_t(*it);
+
+ mTriangles.assign(hostIndices.begin(), hostIndices.end());
+
+ CuContextLock::release();
+
+ // add to factory
+ mFactory.mFabrics.pushBack(this);
+}
+
+// Re-acquires the CUDA context (released at the end of the constructor) so the
+// CuDeviceVector members can free their device buffers, then unregisters this
+// fabric from the factory via order-breaking replaceWithLast removal.
+cloth::CuFabric::~CuFabric()
+{
+ CuContextLock::acquire();
+
+ Vector<CuFabric*>::Type::Iterator fIt = mFactory.mFabrics.find(this);
+
+ PX_ASSERT(fIt != mFactory.mFabrics.end());
+ mFactory.mFabrics.replaceWithLast(fIt);
+}
+
+// --- Fabric interface accessors ---------------------------------------------
+
+cloth::Factory& physx::cloth::CuFabric::getFactory() const
+{
+ return mFactory;
+}
+
+uint32_t cloth::CuFabric::getNumPhases() const
+{
+ return uint32_t(mPhases.size());
+}
+
+uint32_t cloth::CuFabric::getNumRestvalues() const
+{
+ return uint32_t(mRestvalues.size());
+}
+
+// -1 compensates for the leading zero prepended to mSets in the constructor
+uint32_t cloth::CuFabric::getNumSets() const
+{
+ return uint32_t(mSets.size() - 1);
+}
+
+uint32_t cloth::CuFabric::getNumIndices() const
+{
+ return uint32_t(mIndices.size());
+}
+
+uint32_t cloth::CuFabric::getNumParticles() const
+{
+ return mNumParticles;
+}
+
+uint32_t physx::cloth::CuFabric::getNumTethers() const
+{
+ return uint32_t(mTethers.size());
+}
+
+// mTriangles stores three vertex indices per triangle
+uint32_t physx::cloth::CuFabric::getNumTriangles() const
+{
+ return uint32_t(mTriangles.size()) / 3;
+}
+
+// Scale all constraint rest lengths by 'scale'. Round-trips the data:
+// download to host, multiply, re-upload (device vector assignment).
+void physx::cloth::CuFabric::scaleRestvalues(float scale)
+{
+ CuContextLock contextLock(mFactory);
+
+ Vector<float>::Type restvalues(uint32_t(mRestvalues.size()));
+ mFactory.copyToHost(mRestvalues.begin().get(), mRestvalues.end().get(), restvalues.begin());
+
+ Vector<float>::Type::Iterator rIt, rEnd = restvalues.end();
+ for(rIt = restvalues.begin(); rIt != rEnd; ++rIt)
+ *rIt *= scale;
+
+ mRestvalues = restvalues;
+}
+
+// Scale tether lengths by adjusting only the host-side dequantization factor;
+// the quantized device data stays untouched.
+void physx::cloth::CuFabric::scaleTetherLengths(float scale)
+{
+ // cloth instances won't pick this up until CuClothData is dirty!
+ mTetherLengthScale *= scale;
+}
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/windows/CuFabric.h b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuFabric.h
new file mode 100644
index 00000000..93f787f8
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuFabric.h
@@ -0,0 +1,102 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#include "Fabric.h"
+#include "Range.h"
+#include "Types.h"
+#include "Allocator.h"
+#include "CuContextLock.h"
+#include "CuDeviceVector.h"
+
+namespace physx
+{
+
+namespace cloth
+{
+
+// Compact device-side tether record: anchor particle index plus tether length
+// quantized to 16 bits (dequantized via CuFabric::mTetherLengthScale).
+struct CuTether
+{
+ CuTether(uint16_t, uint16_t);
+ uint16_t mAnchor;
+ uint16_t mLength;
+};
+
+// CUDA implementation of the Fabric interface: immutable cloth topology
+// (constraint phases, rest values, indices, tethers, triangles) stored in
+// device memory and shared between cloth instances. Private CuContextLock
+// inheritance gives constructor/destructor access to the CUDA context.
+class CuFabric : public UserAllocated, private CuContextLock, public Fabric
+{
+ PX_NOCOPY(CuFabric)
+ public:
+ CuFabric(CuFactory& factory, uint32_t numParticles, Range<const uint32_t> phases, Range<const uint32_t> sets,
+ Range<const float> restvalues, Range<const uint32_t> indices, Range<const uint32_t> anchors,
+ Range<const float> tetherLengths, Range<const uint32_t> triangles, uint32_t id);
+
+ virtual ~CuFabric();
+
+ virtual Factory& getFactory() const;
+
+ virtual uint32_t getNumPhases() const;
+ virtual uint32_t getNumRestvalues() const;
+
+ virtual uint32_t getNumSets() const;
+ virtual uint32_t getNumIndices() const;
+
+ virtual uint32_t getNumParticles() const;
+
+ virtual uint32_t getNumTethers() const;
+
+ virtual uint32_t getNumTriangles() const;
+
+ virtual void scaleRestvalues(float);
+ virtual void scaleTetherLengths(float);
+
+ public:
+ CuFactory& mFactory;
+
+ uint32_t mNumParticles;
+
+ CuDeviceVector<uint32_t> mPhases; // index of set to use
+ CuDeviceVector<uint32_t> mSets; // offset of first restvalue, with 0 prefix
+
+ CuDeviceVector<float> mRestvalues;
+ CuDeviceVector<uint16_t> mIndices; // 2 particle indices per constraint
+
+ CuDeviceVector<CuTether> mTethers;
+ float mTetherLengthScale; // dequantization factor for CuTether::mLength
+
+ CuDeviceVector<uint16_t> mTriangles; // 3 indices per triangle
+
+ // host-side per-phase views into the device buffers above
+ Vector<uint32_t>::Type mNumConstraintsInPhase;
+ Vector<CuDevicePointer<const float> >::Type mRestvaluesInPhase;
+ Vector<CuDevicePointer<const uint16_t> >::Type mIndicesInPhase;
+
+ uint32_t mId;
+};
+}
+}
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/windows/CuFactory.cpp b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuFactory.cpp
new file mode 100644
index 00000000..8847780e
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuFactory.cpp
@@ -0,0 +1,398 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#include "foundation/PxMemory.h"
+#include "CuFactory.h"
+#include "CuFabric.h"
+#include "CuCloth.h"
+#include "CuSolver.h"
+#include "ClothImpl.h"
+#include "CuCheckSuccess.h"
+#include "CuContextLock.h"
+#include "PsAllocator.h"
+#include "Array.h"
+#include "PsFoundation.h"
+#include <cuda.h>
+
+#if PX_VC
+#pragma warning(disable : 4061 4062) // enumerator 'identifier' in switch of enum 'enumeration' is not handled
+#endif
+
+using namespace physx;
+using namespace shdfnd;
+
+namespace physx
+{
+namespace cloth
+{
+// defined in Factory.cpp
+uint32_t getNextFabricId();
+
+typedef Vec4T<uint32_t> Vec4u;
+}
+}
+
+// Translate a failing CUresult into its enumerator name and report it through
+// the foundation error callback. No-op on CUDA_SUCCESS.
+void cloth::checkSuccessImpl(CUresult err, const char* file, const int line)
+{
+ if(err != CUDA_SUCCESS)
+ {
+ const char* code = "Unknown";
+ switch(err)
+ {
+#define ADD_CASE(X) \
+ case X: \
+ code = #X; \
+ break
+ ADD_CASE(CUDA_ERROR_INVALID_VALUE);
+ ADD_CASE(CUDA_ERROR_OUT_OF_MEMORY);
+ ADD_CASE(CUDA_ERROR_NOT_INITIALIZED);
+ ADD_CASE(CUDA_ERROR_DEINITIALIZED);
+ ADD_CASE(CUDA_ERROR_NO_DEVICE);
+ ADD_CASE(CUDA_ERROR_INVALID_DEVICE);
+ ADD_CASE(CUDA_ERROR_INVALID_IMAGE);
+ ADD_CASE(CUDA_ERROR_INVALID_CONTEXT);
+ ADD_CASE(CUDA_ERROR_MAP_FAILED);
+ ADD_CASE(CUDA_ERROR_UNMAP_FAILED);
+ ADD_CASE(CUDA_ERROR_ARRAY_IS_MAPPED);
+ ADD_CASE(CUDA_ERROR_ALREADY_MAPPED);
+ ADD_CASE(CUDA_ERROR_NO_BINARY_FOR_GPU);
+ ADD_CASE(CUDA_ERROR_ALREADY_ACQUIRED);
+ ADD_CASE(CUDA_ERROR_NOT_MAPPED);
+ ADD_CASE(CUDA_ERROR_NOT_MAPPED_AS_ARRAY);
+ ADD_CASE(CUDA_ERROR_NOT_MAPPED_AS_POINTER);
+ ADD_CASE(CUDA_ERROR_ECC_UNCORRECTABLE);
+ ADD_CASE(CUDA_ERROR_UNSUPPORTED_LIMIT);
+ ADD_CASE(CUDA_ERROR_INVALID_SOURCE);
+ ADD_CASE(CUDA_ERROR_FILE_NOT_FOUND);
+ ADD_CASE(CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND);
+ ADD_CASE(CUDA_ERROR_SHARED_OBJECT_INIT_FAILED);
+ ADD_CASE(CUDA_ERROR_OPERATING_SYSTEM);
+ ADD_CASE(CUDA_ERROR_INVALID_HANDLE);
+ ADD_CASE(CUDA_ERROR_NOT_FOUND);
+ ADD_CASE(CUDA_ERROR_NOT_READY);
+ ADD_CASE(CUDA_ERROR_LAUNCH_FAILED);
+ ADD_CASE(CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES);
+ ADD_CASE(CUDA_ERROR_LAUNCH_TIMEOUT);
+ ADD_CASE(CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING);
+ // intentional: default falls into the CUDA_ERROR_UNKNOWN case label,
+ // so unlisted error codes are reported as CUDA_ERROR_UNKNOWN
+ default:
+ ADD_CASE(CUDA_ERROR_UNKNOWN);
+#undef ADD_CASE
+ }
+ shdfnd::getFoundation().error(PxErrorCode::eINTERNAL_ERROR, file, line, "CUDA error: %s", code);
+ }
+}
+
+namespace
+{
+// returns max threads as specified by launch bounds in CuSolverKernel.cu
+// (must stay in sync with the kernel's __launch_bounds__ per architecture)
+uint32_t getMaxThreadsPerBlock(const physx::PxCudaContextManager& contextManager)
+{
+ if(contextManager.supportsArchSM30())
+ return 1024;
+
+ if(contextManager.supportsArchSM20())
+ return 512;
+
+ // pre-Fermi fallback
+ return 192;
+}
+}
+
+// Bind the factory to a CUDA context and cache the kernel's per-architecture
+// thread-block limit. Note: contextManager is dereferenced unconditionally,
+// so a null context manager is not supported here.
+cloth::CuFactory::CuFactory(physx::PxCudaContextManager* contextManager)
+: Factory(CUDA)
+, mContextManager(contextManager)
+, mNumThreadsPerBlock(getMaxThreadsPerBlock(*contextManager))
+, mMaxThreadsPerBlock(mNumThreadsPerBlock)
+{
+}
+
+// Nothing to release here; fabrics/cloths own their device resources.
+cloth::CuFactory::~CuFactory()
+{
+}
+
+// Create a device-resident fabric; the CuFabric constructor uploads the data
+// and registers itself with this factory. Caller owns the returned pointer.
+cloth::Fabric* cloth::CuFactory::createFabric(uint32_t numParticles, Range<const uint32_t> phases,
+ Range<const uint32_t> sets, Range<const float> restvalues,
+ Range<const uint32_t> indices, Range<const uint32_t> anchors,
+ Range<const float> tetherLengths, Range<const uint32_t> triangles)
+{
+ return new CuFabric(*this, numParticles, phases, sets, restvalues, indices, anchors, tetherLengths, triangles,
+ getNextFabricId());
+}
+
+// Create a cloth instance over the given fabric; caller owns the result.
+cloth::Cloth* cloth::CuFactory::createCloth(Range<const PxVec4> particles, Fabric& fabric)
+{
+ return new CuClothImpl(*this, fabric, particles);
+}
+
+// Create a CUDA solver; the PxTaskManager parameter is unused by this backend.
+// Returns NULL (after cleanup) if solver construction failed, e.g. when the
+// kernel module could not be loaded.
+cloth::Solver* cloth::CuFactory::createSolver(physx::PxTaskManager*)
+{
+ CuSolver* solver = new CuSolver(*this);
+
+ if(solver->hasError())
+ {
+ delete solver;
+ return NULL;
+ }
+
+ return solver;
+}
+
+// CuFactory::clone() implemented in CuClothClone.cpp
+
+// Synchronous device-to-host copy of the byte range [srcIt, srcEnd), holding
+// the CUDA context for the duration of the transfer.
+void cloth::CuFactory::copyToHost(const void* srcIt, const void* srcEnd, void* dstIt) const
+{
+ CuContextLock contextLock(*this);
+
+ checkSuccess(cuMemcpyDtoH(dstIt, CUdeviceptr(srcIt), size_t(intptr_t(srcEnd) - intptr_t(srcIt))));
+}
+
+// Download fabric topology back to host buffers. Empty output ranges are
+// skipped; non-empty ranges must exactly match the stored sizes (asserted).
+void cloth::CuFactory::extractFabricData(const Fabric& fabric, Range<uint32_t> phases, Range<uint32_t> sets,
+ Range<float> restvalues, Range<uint32_t> indices, Range<uint32_t> anchors,
+ Range<float> tetherLengths, Range<uint32_t> triangles) const
+{
+ CuContextLock contextLock(*this);
+
+ const CuFabric& cuFabric = static_cast<const CuFabric&>(fabric);
+
+ if(!phases.empty())
+ {
+ PX_ASSERT(phases.size() == cuFabric.mPhases.size());
+ const uint32_t* devicePhases = cuFabric.mPhases.begin().get();
+ copyToHost(devicePhases, devicePhases + cuFabric.mPhases.size(), phases.begin());
+ }
+
+ if(!restvalues.empty())
+ {
+ PX_ASSERT(restvalues.size() == cuFabric.mRestvalues.size());
+ const float* deviceRestvalues = cuFabric.mRestvalues.begin().get();
+ copyToHost(deviceRestvalues, deviceRestvalues + cuFabric.mRestvalues.size(), restvalues.begin());
+ }
+
+ if(!sets.empty())
+ {
+ // +1 skips the zero prefix that the CuFabric constructor prepended
+ PX_ASSERT(sets.size() == cuFabric.mSets.size() - 1);
+ const uint32_t* deviceSets = cuFabric.mSets.begin().get();
+ copyToHost(deviceSets + 1, deviceSets + cuFabric.mSets.size(), sets.begin());
+ }
+
+ if(!indices.empty())
+ {
+ PX_ASSERT(indices.size() == cuFabric.mIndices.size());
+ const uint16_t* deviceIndices = cuFabric.mIndices.begin().get();
+ // download the 16-bit indices into the front half of the caller's
+ // 32-bit buffer, then widen in place
+ uint16_t* hostIndices = reinterpret_cast<uint16_t*>(indices.begin());
+ copyToHost(deviceIndices, deviceIndices + cuFabric.mIndices.size(), hostIndices);
+
+ // convert from 16bit to 32bit indices
+ // (backwards, so the widened values don't clobber unread 16-bit data)
+ for(uint32_t i = indices.size(); 0 < i--;)
+ indices[i] = hostIndices[i];
+ }
+
+ if(!anchors.empty() || !tetherLengths.empty())
+ {
+ // dequantize tether lengths using the fabric's stored scale factor
+ uint32_t numTethers = uint32_t(cuFabric.mTethers.size());
+ Vector<CuTether>::Type tethers(numTethers, CuTether(0, 0));
+ const CuTether* deviceTethers = cuFabric.mTethers.begin().get();
+ copyToHost(deviceTethers, deviceTethers + numTethers, tethers.begin());
+
+ PX_ASSERT(anchors.empty() || anchors.size() == tethers.size());
+ for(uint32_t i = 0; !anchors.empty(); ++i, anchors.popFront())
+ anchors.front() = tethers[i].mAnchor;
+
+ PX_ASSERT(tetherLengths.empty() || tetherLengths.size() == tethers.size());
+ for(uint32_t i = 0; !tetherLengths.empty(); ++i, tetherLengths.popFront())
+ tetherLengths.front() = tethers[i].mLength * cuFabric.mTetherLengthScale;
+ }
+
+ if(!triangles.empty())
+ {
+ // todo triangles
+ }
+}
+
+// Copy a cloth's collision shapes (spheres, capsules, planes, convex masks,
+// triangles) into host buffers. All source arrays are in pinned host memory,
+// so plain memcpy suffices — no CUDA context needed.
+void cloth::CuFactory::extractCollisionData(const Cloth& cloth, Range<PxVec4> spheres, Range<uint32_t> capsules,
+ Range<PxVec4> planes, Range<uint32_t> convexes, Range<PxVec3> triangles) const
+{
+ PX_ASSERT(&cloth.getFactory() == this);
+
+ const CuCloth& cuCloth = static_cast<const CuClothImpl&>(cloth).mCloth;
+
+ PX_ASSERT(spheres.empty() || spheres.size() == cuCloth.mStartCollisionSpheres.size());
+ PX_ASSERT(capsules.empty() || capsules.size() == cuCloth.mCapsuleIndices.size() * 2);
+ PX_ASSERT(planes.empty() || planes.size() == cuCloth.mStartCollisionPlanes.size());
+ PX_ASSERT(convexes.empty() || convexes.size() == cuCloth.mConvexMasks.size());
+ PX_ASSERT(triangles.empty() || triangles.size() == cuCloth.mStartCollisionTriangles.size());
+
+ // collision spheres are in pinned memory, so memcpy directly
+ if(!cuCloth.mStartCollisionSpheres.empty() && !spheres.empty())
+ memcpy(spheres.begin(), &cuCloth.mStartCollisionSpheres.front(),
+ cuCloth.mStartCollisionSpheres.size() * sizeof(PxVec4));
+
+ // each capsule is an IndexPair, i.e. two uint32_t sphere indices
+ if(!cuCloth.mCapsuleIndices.empty() && !capsules.empty())
+ memcpy(capsules.begin(), &cuCloth.mCapsuleIndices.front(), cuCloth.mCapsuleIndices.size() * sizeof(IndexPair));
+
+ if(!cuCloth.mStartCollisionPlanes.empty() && !planes.empty())
+ memcpy(planes.begin(), &cuCloth.mStartCollisionPlanes.front(),
+ cuCloth.mStartCollisionPlanes.size() * sizeof(PxVec4));
+
+ if(!cuCloth.mConvexMasks.empty() && !convexes.empty())
+ memcpy(convexes.begin(), &cuCloth.mConvexMasks.front(), cuCloth.mConvexMasks.size() * sizeof(uint32_t));
+
+ if(!cuCloth.mStartCollisionTriangles.empty() && !triangles.empty())
+ memcpy(triangles.begin(), &cuCloth.mStartCollisionTriangles.front(),
+ cuCloth.mStartCollisionTriangles.size() * sizeof(PxVec3));
+}
+
+// Download a cloth's motion constraints. Prefers the host-side mirror if
+// present; otherwise reads from device memory, taking the interpolation
+// target buffer when one is set, else the start buffer.
+// NOTE(review): structurally identical to extractSeparationConstraints below.
+void cloth::CuFactory::extractMotionConstraints(const Cloth& cloth, Range<PxVec4> destConstraints) const
+{
+ PX_ASSERT(&cloth.getFactory() == this);
+
+ const CuCloth& cuCloth = static_cast<const CuClothImpl&>(cloth).mCloth;
+
+ if(cuCloth.mMotionConstraints.mHostCopy.size())
+ {
+ PX_ASSERT(destConstraints.size() == cuCloth.mMotionConstraints.mHostCopy.size());
+
+ PxMemCopy(destConstraints.begin(), cuCloth.mMotionConstraints.mHostCopy.begin(),
+ sizeof(PxVec4) * cuCloth.mMotionConstraints.mHostCopy.size());
+ }
+ else
+ {
+ CuContextLock contextLock(*this);
+
+ CuDeviceVector<PxVec4> const& srcConstraints = !cuCloth.mMotionConstraints.mTarget.empty()
+ ? cuCloth.mMotionConstraints.mTarget
+ : cuCloth.mMotionConstraints.mStart;
+
+ PX_ASSERT(destConstraints.size() == srcConstraints.size());
+
+ copyToHost(srcConstraints.begin().get(), srcConstraints.end().get(), destConstraints.begin());
+ }
+}
+
+// Download a cloth's separation constraints; same host-mirror-else-device
+// strategy as extractMotionConstraints.
+void cloth::CuFactory::extractSeparationConstraints(const Cloth& cloth, Range<PxVec4> destConstraints) const
+{
+ PX_ASSERT(&cloth.getFactory() == this);
+
+ const CuCloth& cuCloth = static_cast<const CuClothImpl&>(cloth).mCloth;
+
+ if(cuCloth.mSeparationConstraints.mHostCopy.size())
+ {
+ PX_ASSERT(destConstraints.size() == cuCloth.mSeparationConstraints.mHostCopy.size());
+
+ PxMemCopy(destConstraints.begin(), cuCloth.mSeparationConstraints.mHostCopy.begin(),
+ sizeof(PxVec4) * cuCloth.mSeparationConstraints.mHostCopy.size());
+ }
+ else
+ {
+ CuContextLock contextLock(*this);
+
+ CuDeviceVector<PxVec4> const& srcConstraints = !cuCloth.mSeparationConstraints.mTarget.empty()
+ ? cuCloth.mSeparationConstraints.mTarget
+ : cuCloth.mSeparationConstraints.mStart;
+
+ PX_ASSERT(destConstraints.size() == srcConstraints.size());
+
+ copyToHost(srcConstraints.begin().get(), srcConstraints.end().get(), destConstraints.begin());
+ }
+}
+
+// Copy particle accelerations from the cloth's host-side mirror. Silently a
+// no-op when no host copy exists — unlike the constraint extractors above,
+// there is no device-side fallback here.
+void cloth::CuFactory::extractParticleAccelerations(const Cloth& cloth, Range<PxVec4> destAccelerations) const
+{
+ PX_ASSERT(&cloth.getFactory() == this);
+
+ const CuCloth& cuCloth = static_cast<const CuClothImpl&>(cloth).mCloth;
+
+ if(cuCloth.mParticleAccelerationsHostCopy.size())
+ {
+ PX_ASSERT(destAccelerations.size() == cuCloth.mParticleAccelerationsHostCopy.size());
+
+ PxMemCopy(destAccelerations.begin(), cuCloth.mParticleAccelerationsHostCopy.begin(),
+ sizeof(PxVec4) * cuCloth.mParticleAccelerationsHostCopy.size());
+ }
+}
+
+// Download virtual-particle data, converting from the device storage formats:
+// weights PxVec4 -> PxVec3 (4th component dropped), indices Vec4us (16-bit)
+// -> 32-bit quadruples.
+void cloth::CuFactory::extractVirtualParticles(const Cloth& cloth, Range<uint32_t[4]> destIndices,
+ Range<PxVec3> destWeights) const
+{
+ PX_ASSERT(&cloth.getFactory() == this);
+
+ CuContextLock contextLock(*this);
+
+ const CuCloth& cuCloth = static_cast<const CuClothImpl&>(cloth).mCloth;
+
+ if(destWeights.size() > 0)
+ {
+ uint32_t numWeights = cloth.getNumVirtualParticleWeights();
+
+ Vector<PxVec4>::Type hostWeights(numWeights, PxVec4(0.0f));
+ copyToHost(cuCloth.mVirtualParticleWeights.begin().get(), cuCloth.mVirtualParticleWeights.end().get(),
+ &hostWeights.front());
+
+ // convert weights to Vec3f
+ PxVec3* destIt = reinterpret_cast<PxVec3*>(destWeights.begin());
+ Vector<PxVec4>::Type::ConstIterator srcIt = hostWeights.begin();
+ Vector<PxVec4>::Type::ConstIterator srcEnd = srcIt + numWeights;
+ for(; srcIt != srcEnd; ++srcIt, ++destIt)
+ *destIt = reinterpret_cast<const PxVec3&>(*srcIt);
+
+ PX_ASSERT(destIt <= destWeights.end());
+ }
+
+ if(destIndices.size() > 0)
+ {
+ uint32_t numIndices = cloth.getNumVirtualParticles();
+
+ Vector<Vec4us>::Type hostIndices(numIndices);
+ copyToHost(cuCloth.mVirtualParticleIndices.begin().get(), cuCloth.mVirtualParticleIndices.end().get(),
+ &hostIndices.front());
+
+ // convert indices to 32 bit
+ Vec4u* destIt = reinterpret_cast<Vec4u*>(destIndices.begin());
+ Vector<Vec4us>::Type::ConstIterator srcIt = hostIndices.begin();
+ Vector<Vec4us>::Type::ConstIterator srcEnd = srcIt + numIndices;
+ for(; srcIt != srcEnd; ++srcIt, ++destIt)
+ *destIt = Vec4u(*srcIt);
+
+ PX_ASSERT(&array(*destIt) <= destIndices.end());
+ }
+}
+
+// Download the cloth's self-collision particle indices from device memory.
+// Note: unlike the other extractors, no CuContextLock is taken here —
+// copyToHost acquires the context itself for the single transfer.
+void cloth::CuFactory::extractSelfCollisionIndices(const Cloth& cloth, Range<uint32_t> destIndices) const
+{
+ const CuCloth& cuCloth = static_cast<const CuClothImpl&>(cloth).mCloth;
+ PX_ASSERT(destIndices.size() == cuCloth.mSelfCollisionIndices.size());
+ copyToHost(cuCloth.mSelfCollisionIndices.begin().get(), cuCloth.mSelfCollisionIndices.end().get(),
+ destIndices.begin());
+}
+
+// Download the cloth's rest positions from device memory (context acquired
+// inside copyToHost).
+void cloth::CuFactory::extractRestPositions(const Cloth& cloth, Range<PxVec4> destRestPositions) const
+{
+ const CuCloth& cuCloth = static_cast<const CuClothImpl&>(cloth).mCloth;
+ PX_ASSERT(destRestPositions.size() == cuCloth.mRestPositions.size());
+ copyToHost(cuCloth.mRestPositions.begin().get(), cuCloth.mRestPositions.end().get(), destRestPositions.begin());
+}
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/windows/CuFactory.h b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuFactory.h
new file mode 100644
index 00000000..e868034f
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuFactory.h
@@ -0,0 +1,107 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#include "Factory.h"
+#include "Allocator.h"
+
+namespace physx
+{
+class PxCudaContextManager;
+}
+
+namespace physx
+{
+
+namespace cloth
+{
+
+class CuFabric;
+class CuCloth;
+template <typename>
+class ClothImpl;
+
+// CUDA-backed implementation of the cloth Factory interface. Creates
+// fabrics, cloths and solvers whose simulation data lives on the GPU,
+// and provides extract*() methods that copy that data back to the host.
+class CuFactory : public UserAllocated, public Factory
+{
+  protected:
+	// non-copyable: assignment declared but intentionally not defined
+	CuFactory& operator=(const CuFactory&);
+
+  public:
+	typedef CuFabric FabricType;
+	typedef ClothImpl<CuCloth> ImplType;
+
+	// requires a valid CUDA context manager for all device allocations
+	CuFactory(physx::PxCudaContextManager*);
+	virtual ~CuFactory();
+
+	// uploads the given constraint topology to the device and returns a CuFabric
+	virtual Fabric* createFabric(uint32_t numParticles, Range<const uint32_t> phases, Range<const uint32_t> sets,
+	                             Range<const float> restvalues, Range<const uint32_t> indices,
+	                             Range<const uint32_t> anchors, Range<const float> tetherLengths,
+	                             Range<const uint32_t> triangles);
+
+	virtual Cloth* createCloth(Range<const PxVec4> particles, Fabric& fabric);
+
+	virtual Solver* createSolver(physx::PxTaskManager* taskMgr);
+
+	virtual Cloth* clone(const Cloth& cloth);
+
+	// the extract* methods below copy device-resident data into caller-provided
+	// host ranges; each destination range must be sized to match the source
+	virtual void extractFabricData(const Fabric& fabric, Range<uint32_t> phases, Range<uint32_t> sets,
+	                               Range<float> restvalues, Range<uint32_t> indices, Range<uint32_t> anchors,
+	                               Range<float> tetherLengths, Range<uint32_t> triangles) const;
+
+	virtual void extractCollisionData(const Cloth& cloth, Range<PxVec4> spheres, Range<uint32_t> capsules,
+	                                  Range<PxVec4> planes, Range<uint32_t> convexes, Range<PxVec3> triangles) const;
+
+	virtual void extractMotionConstraints(const Cloth& cloth, Range<PxVec4> destConstraints) const;
+
+	virtual void extractSeparationConstraints(const Cloth& cloth, Range<PxVec4> destConstraints) const;
+
+	virtual void extractParticleAccelerations(const Cloth& cloth, Range<PxVec4> destAccelerations) const;
+
+	virtual void extractVirtualParticles(const Cloth& cloth, Range<uint32_t[4]> destIndices,
+	                                     Range<PxVec3> destWeights) const;
+
+	virtual void extractSelfCollisionIndices(const Cloth& cloth, Range<uint32_t> destIndices) const;
+
+	virtual void extractRestPositions(const Cloth& cloth, Range<PxVec4> destRestPositions) const;
+
+  public:
+	// synchronous device-to-host memcpy of the byte range [srcIt, srcEnd)
+	void copyToHost(const void* srcIt, const void* srcEnd, void* dstIt) const;
+
+  public:
+	// all fabrics created by this factory (used for bookkeeping/cloning)
+	Vector<CuFabric*>::Type mFabrics;
+
+	physx::PxCudaContextManager* mContextManager;
+
+	// current kernel launch configuration; bounded by mMaxThreadsPerBlock
+	uint32_t mNumThreadsPerBlock;
+
+	const uint32_t mMaxThreadsPerBlock;
+};
+}
+}
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/windows/CuPhaseConfig.h b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuPhaseConfig.h
new file mode 100644
index 00000000..74470bde
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuPhaseConfig.h
@@ -0,0 +1,51 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#include "Types.h"
+
+namespace physx
+{
+namespace cloth
+{
+
+// Per-phase constraint solver configuration, laid out for direct consumption
+// by the CUDA solver kernel. The pointers reference device memory owned by
+// the fabric — NOTE(review): presumed device pointers; confirm against CuFabric.
+struct CuPhaseConfig
+{
+	float mStiffness;
+	float mStiffnessMultiplier;
+	float mCompressionLimit;
+	float mStretchLimit;
+
+	// number of constraints in this phase; mRestvalues/mIndices are the
+	// per-constraint rest lengths and particle index pairs
+	uint32_t mNumConstraints;
+	const float* mRestvalues;
+	const uint16_t* mIndices;
+};
+}
+}
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/windows/CuPinnedAllocator.h b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuPinnedAllocator.h
new file mode 100644
index 00000000..57dd6731
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuPinnedAllocator.h
@@ -0,0 +1,132 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#include "cudamanager/PxCudaContextManager.h"
+#include "cudamanager/PxCudaMemoryManager.h"
+#include "Allocator.h"
+#include "CuCheckSuccess.h"
+#include <cuda.h>
+
+namespace physx
+{
+
+namespace cloth
+{
+
+// Allocator for host memory that the GPU can access, backed by the context
+// manager's PxCudaMemoryManager. The cudaHostAlloc* flags select the memory
+// space: write-combined, pinned+mapped, or plain host memory. For mapped
+// allocations the matching device pointer of the last allocation is cached
+// in mDevicePtr (see getDevicePointer below).
+struct CuHostAllocator
+{
+	CuHostAllocator(physx::PxCudaContextManager* ctx = NULL, unsigned int flags = cudaHostAllocDefault)
+	: mDevicePtr(0), mFlags(flags), mManager(0)
+	{
+		PX_ASSERT(ctx);
+
+		if(ctx)
+			mManager = ctx->getMemoryManager();
+	}
+
+	// allocates n bytes from the memory space selected by mFlags;
+	// the (ignored) name/line parameters match the shdfnd allocator contract
+	void* allocate(size_t n, const char*, int)
+	{
+		physx::PxCudaBufferPtr bufferPtr;
+
+		PX_ASSERT(mManager);
+
+		if(mFlags & cudaHostAllocWriteCombined)
+			bufferPtr = mManager->alloc(physx::PxCudaBufferMemorySpace::T_WRITE_COMBINED, n,
+			                            PX_ALLOC_INFO("cloth::CuHostAllocator::T_WRITE_COMBINED", CLOTH));
+		else if(mFlags & cudaHostAllocMapped)
+			bufferPtr = mManager->alloc(physx::PxCudaBufferMemorySpace::T_PINNED_HOST, n,
+			                            PX_ALLOC_INFO("cloth::CuHostAllocator::T_PINNED_HOST", CLOTH));
+		else
+			bufferPtr = mManager->alloc(physx::PxCudaBufferMemorySpace::T_HOST, n,
+			                            PX_ALLOC_INFO("cloth::CuHostAllocator::T_HOST", CLOTH));
+
+		// for mapped allocations, cache the device-side alias of the host pointer
+		if(mFlags & cudaHostAllocMapped)
+			checkSuccess(cuMemHostGetDevicePointer(&mDevicePtr, reinterpret_cast<void*>(bufferPtr), 0));
+
+		return reinterpret_cast<void*>(bufferPtr);
+	}
+
+	// returns p to the same memory space it was allocated from
+	void deallocate(void* p)
+	{
+		PX_ASSERT(mManager);
+
+		if(mFlags & cudaHostAllocWriteCombined)
+			mManager->free(physx::PxCudaBufferMemorySpace::T_WRITE_COMBINED, physx::PxCudaBufferPtr(p));
+		else if(mFlags & cudaHostAllocMapped)
+			mManager->free(physx::PxCudaBufferMemorySpace::T_PINNED_HOST, physx::PxCudaBufferPtr(p));
+		else
+			mManager->free(physx::PxCudaBufferMemorySpace::T_HOST, physx::PxCudaBufferPtr(p));
+
+		// don't reset mDevicePtr because Array::recreate deallocates last
+	}
+
+	CUdeviceptr mDevicePtr; // device pointer of last allocation
+	unsigned int mFlags;
+	physx::PxCudaMemoryManager* mManager;
+};
+
+// Convenience factory for a mapped + write-combined host allocator.
+// The template parameter T is unused here; it only mirrors the element
+// type of the container the allocator will be used with.
+template <typename T>
+CuHostAllocator getMappedAllocator(physx::PxCudaContextManager* ctx)
+{
+	return CuHostAllocator(ctx, cudaHostAllocMapped | cudaHostAllocWriteCombined);
+}
+
+// Typedef helper: an shdfnd::Array whose storage comes from CuHostAllocator,
+// i.e. pinned host memory that the device can address directly.
+template <typename T>
+struct CuPinnedVector
+{
+	// note: always use shdfnd::swap() instead of Array::swap()
+	// in order to keep cached device pointer consistent
+	typedef shdfnd::Array<T, typename physx::cloth::CuHostAllocator> Type;
+};
+
+// Returns the device-side alias of a pinned vector's storage, or null for an
+// empty vector (the allocator's cached mDevicePtr is stale once the last
+// allocation has been freed).
+template <typename T>
+T* getDevicePointer(shdfnd::Array<T, typename physx::cloth::CuHostAllocator>& vector)
+{
+	// cached device pointer only valid if non-empty
+	return vector.empty() ? 0 : reinterpret_cast<T*>(vector.getAllocator().mDevicePtr);
+}
+
+} // namespace cloth
+
+} // namespace physx
+
+namespace physx
+{
+namespace shdfnd
+{
+// swap overload for pinned vectors: swaps the allocators first so the cached
+// device pointers (CuHostAllocator::mDevicePtr) travel with their buffers.
+template <typename T>
+void swap(Array<T, typename physx::cloth::CuHostAllocator>& left, Array<T, typename physx::cloth::CuHostAllocator>& right)
+{
+	swap(left.getAllocator(), right.getAllocator());
+	left.swap(right);
+}
+}
+}
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/windows/CuSelfCollision.h b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuSelfCollision.h
new file mode 100644
index 00000000..fb0fd7af
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuSelfCollision.h
@@ -0,0 +1,472 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#ifndef CU_SOLVER_KERNEL_CU
+#error include CuSelfCollision.h only from CuSolverKernel.cu
+#endif
+
+#ifndef UINT16_MAX
+#define UINT16_MAX 0xffff
+#endif
+
+namespace
+{
+// In-place inclusive prefix sum of one int32 per lane across a warp.
+// On sm_30+ the shuffle-based PTX variant is used (register exchange, no
+// shared memory traffic); the template stride parameter is then unused.
+#if __CUDA_ARCH__ >= 300
+template <int>
+__device__ void scanWarp(Pointer<Shared, int32_t> counts)
+{
+	// Kogge-Stone scan: shfl.up by 1,2,4,8,16 with predicated add; lanes
+	// whose source lane would be negative leave their value unchanged.
+	asm volatile("{"
+	             "	.reg .s32 tmp;"
+	             "	.reg .pred p;"
+	             "	shfl.up.b32 tmp|p, %0, 0x01, 0x0;"
+	             "@p	add.s32 %0, tmp, %0;"
+	             "	shfl.up.b32 tmp|p, %0, 0x02, 0x0;"
+	             "@p	add.s32 %0, tmp, %0;"
+	             "	shfl.up.b32 tmp|p, %0, 0x04, 0x0;"
+	             "@p	add.s32 %0, tmp, %0;"
+	             "	shfl.up.b32 tmp|p, %0, 0x08, 0x0;"
+	             "@p	add.s32 %0, tmp, %0;"
+	             "	shfl.up.b32 tmp|p, %0, 0x10, 0x0;"
+	             "@p	add.s32 %0, tmp, %0;"
+	             "}"
+	             : "+r"(*generic(counts))
+	             :);
+}
+#else
+// Pre-Kepler fallback: same scan through shared memory, with the lane's
+// neighbors found 'stride' elements apart. Relies on warp-synchronous
+// execution (volatile accesses, no explicit sync within the warp).
+template <int stride>
+__device__ void scanWarp(Pointer<Shared, int32_t> counts)
+{
+	volatile int32_t* ptr = generic(counts);
+	const int32_t laneIdx = threadIdx.x & warpSize - 1;
+	if(laneIdx >= 1)
+		*ptr += ptr[-stride];
+	if(laneIdx >= 2)
+		*ptr += ptr[-2 * stride];
+	if(laneIdx >= 4)
+		*ptr += ptr[-4 * stride];
+	if(laneIdx >= 8)
+		*ptr += ptr[-8 * stride];
+	if(laneIdx >= 16)
+		*ptr += ptr[-16 * stride];
+}
+#endif
+
+// sorts array by upper 16bits
+// [keys] must be at least 2*n in length, in/out in first n elements
+// [histogram] must be at least 34*16 = 544 in length
+//
+// Block-wide LSD radix sort over the upper 16 bits, four passes of 4-bit
+// digits (p = 16, 20, 24, 28), ping-ponging between the two halves of
+// [keys]. After four (even) passes the result is back in the first half.
+// Per-warp digit histograms are built with __ballot/__popc, scanned with
+// scanWarp, and combined into global digit offsets.
+__device__ void radixSort(int32_t* keys, int32_t n, Pointer<Shared, int32_t> histogram)
+{
+	const int32_t numWarps = blockDim.x >> 5;
+	const int32_t warpIdx = threadIdx.x >> 5;
+	const int32_t laneIdx = threadIdx.x & warpSize - 1;
+
+	// masks used to select, via XOR against ballots, the lanes whose current
+	// digit matches this thread's lane index interpreted as a digit value
+	const uint32_t laneMask = (1u << laneIdx) - 1;
+	const uint32_t mask1 = (threadIdx.x & 1) - 1;
+	const uint32_t mask2 = !!(threadIdx.x & 2) - 1;
+	const uint32_t mask4 = !!(threadIdx.x & 4) - 1;
+	const uint32_t mask8 = !!(threadIdx.x & 8) - 1;
+
+	// each warp processes a contiguous slice of tn*32 elements
+	const int32_t tn = (n + blockDim.x - 1) / blockDim.x;
+	const int32_t startIndex = tn * (threadIdx.x - laneIdx) + laneIdx;
+	const int32_t endIndex = min(startIndex + tn * warpSize, n + 31 & ~31); // full warps for ballot
+
+	int32_t* srcKeys = keys;
+	int32_t* dstKeys = keys + n;
+
+	// hIt: this warp's 16 digit counters; pIt: column of one digit across
+	// warps (stride 16); tIt: per-digit block totals after the warp scans
+	Pointer<Shared, int32_t> hIt = histogram + 16 * warpIdx;
+	Pointer<Shared, int32_t> pIt = histogram + 16 * laneIdx + 16;
+	Pointer<Shared, int32_t> tIt = histogram + 16 * numWarps + laneIdx;
+
+	for(int32_t p = 16; p < 32; p += 4) // radix passes (4 bits each)
+	{
+		// gather bucket histograms per warp
+		int32_t warpCount = 0;
+		for(int32_t i = startIndex; i < endIndex; i += 32)
+		{
+			// out-of-range slots read digit 15 so they sort to the end
+			int32_t key = i < n ? srcKeys[i] >> p : 15;
+			uint32_t ballot1 = __ballot(key & 1);
+			uint32_t ballot2 = __ballot(key & 2);
+			uint32_t ballot4 = __ballot(key & 4);
+			uint32_t ballot8 = __ballot(key & 8);
+			// count lanes whose digit equals (laneIdx & 15), i.e. lanes 16..31
+			// accumulate the counts for digits 0..15
+			warpCount += __popc((mask1 ^ ballot1) & (mask2 ^ ballot2) & (mask4 ^ ballot4) & (mask8 ^ ballot8));
+		}
+
+		if(laneIdx >= 16)
+			hIt[laneIdx] = warpCount;
+
+		__syncthreads();
+
+		// prefix sum of histogram buckets
+		for(int32_t i = warpIdx; i < 16; i += numWarps)
+			scanWarp<16>(pIt + i);
+
+		__syncthreads();
+
+		// prefix sum of bucket totals (exclusive)
+		if(threadIdx.x < 16)
+		{
+			// thread 0 writes 0, others copy the previous digit's total
+			*tIt = tIt[-1] & !threadIdx.x - 1;
+			scanWarp<1>(tIt);
+			hIt[threadIdx.x] = 0;
+		}
+
+		__syncthreads();
+
+		if(laneIdx < 16)
+			hIt[laneIdx] += *tIt;
+
+		// split indices
+		for(int32_t i = startIndex; i < endIndex; i += 32)
+		{
+			int32_t key = i < n ? srcKeys[i] >> p : 15;
+			uint32_t ballot1 = __ballot(key & 1);
+			uint32_t ballot2 = __ballot(key & 2);
+			uint32_t ballot4 = __ballot(key & 4);
+			uint32_t ballot8 = __ballot(key & 8);
+			// bits = lanes in this warp carrying the same digit as this lane;
+			// rank within them (popc of lower lanes) gives a stable offset
+			uint32_t bits = ((key & 1) - 1 ^ ballot1) & (!!(key & 2) - 1 ^ ballot2) & (!!(key & 4) - 1 ^ ballot4) &
+			                (!!(key & 8) - 1 ^ ballot8);
+			int32_t index = hIt[key & 15] + __popc(bits & laneMask);
+
+			if(i < n)
+				dstKeys[index] = srcKeys[i];
+
+			// advance this warp's digit offsets by the counts just consumed
+			if(laneIdx < 16)
+				hIt[laneIdx] += __popc((mask1 ^ ballot1) & (mask2 ^ ballot2) & (mask4 ^ ballot4) & (mask8 ^ ballot8));
+		}
+
+		__syncthreads();
+
+		::swap(srcKeys, dstKeys);
+	}
+
+#ifndef NDEBUG
+	// debug: verify the first n elements are sorted by their upper 16 bits
+	for(int32_t i = threadIdx.x; i < n; i += blockDim.x)
+		assert(!i || keys[i - 1] >> 16 <= keys[i] >> 16);
+#endif
+}
+}
+
+namespace
+{
+// Self-collision pass for one cloth: hashes particles into a 2D cell grid
+// over the two longest bounding-box axes, then pushes nearby particle pairs
+// apart. Lives in shared memory (see gSelfCollideParticles below).
+struct CuSelfCollision
+{
+	template <typename CurrentT>
+	__device__ void operator()(CurrentT& current);
+
+  private:
+	// computes cell keys, sorts them, and records per-cell start indices
+	template <typename CurrentT>
+	__device__ void buildAcceleration(const CurrentT& current);
+	// resolves particle-particle collisions; useRestPositions filters out
+	// pairs that are already close in the rest configuration
+	template <bool useRestPositions, typename CurrentT>
+	__device__ void collideParticles(CurrentT& current) const;
+
+  public:
+	// axis-sorted grid transform: cellIndex = pos * mPosScale + mPosBias,
+	// with mPosPtr[k] pointing at the particle coordinate array of axis k
+	// (axis order chosen so the shortest axis ends up in slot 0)
+	float mPosBias[3];
+	float mPosScale[3];
+	const float* mPosPtr[3];
+};
+}
+
+__shared__ uninitialized<CuSelfCollision> gSelfCollideParticles;
+
+// Entry point for the self-collision pass on the particle set 'current'.
+template <typename CurrentT>
+__device__ void CuSelfCollision::operator()(CurrentT& current)
+{
+	// disabled if either the collision distance or the stiffness is <= 0
+	if(min(gClothData.mSelfCollisionDistance, gFrameData.mSelfCollisionStiffness) <= 0.0f)
+		return;
+
+	// threads 0..2 each set up the grid transform for one coordinate axis
+	if(threadIdx.x < 3)
+	{
+		// mParticleBounds stores (upper, -lower) per axis
+		float upper = gFrameData.mParticleBounds[threadIdx.x * 2];
+		float negativeLower = gFrameData.mParticleBounds[threadIdx.x * 2 + 1];
+
+		// expand bounds
+		float eps = (upper + negativeLower) * 1e-4f;
+		float expandedUpper = upper + eps;
+		float expandedNegativeLower = negativeLower + eps;
+		float expandedEdgeLength = expandedUpper + expandedNegativeLower;
+
+		float* edgeLength = mPosBias; // use as temp
+		edgeLength[threadIdx.x] = expandedEdgeLength;
+
+		// make the three per-axis writes above visible to the other threads
+		__threadfence_block();
+
+		// calculate shortest axis
+		int32_t shortestAxis = edgeLength[0] > edgeLength[1];
+		if(edgeLength[shortestAxis] > edgeLength[2])
+			shortestAxis = 2;
+
+		// writeAxis = (threadIdx.x - shortestAxis) mod 3: the unsigned
+		// underflow plus the >>30 correction rotates the axes so that the
+		// shortest axis lands in slot 0 (slots 1 and 2 form the grid)
+		uint32_t writeAxis = threadIdx.x - shortestAxis;
+		writeAxis += writeAxis >> 30;
+
+		// cell size: at most the collision distance, but no more than 127
+		// cells along the box edge (grid is 128x128 including sentinels)
+		float maxInvCellSize = __fdividef(127.0f, expandedEdgeLength);
+		float invCollisionDistance = __fdividef(1.0f, gClothData.mSelfCollisionDistance);
+		float invCellSize = min(maxInvCellSize, invCollisionDistance);
+
+		mPosScale[writeAxis] = invCellSize;
+		mPosBias[writeAxis] = invCellSize * expandedNegativeLower;
+		mPosPtr[writeAxis] = generic(current[threadIdx.x]);
+	}
+
+	__syncthreads();
+
+	buildAcceleration(current);
+
+	// compile-time branch on whether rest positions are available this frame
+	if(gFrameData.mRestPositions)
+		collideParticles<true>(current);
+	else
+		collideParticles<false>(current);
+}
+
+// Builds the acceleration structure: one (cellKey, particleIndex) pair per
+// self-collision particle, sorted by cell key, plus a cellStart table that
+// records where each cell's run begins in the sorted array.
+template <typename CurrentT>
+__device__ void CuSelfCollision::buildAcceleration(const CurrentT& current)
+{
+	int32_t numIndices = gClothData.mNumSelfCollisionIndices;
+	const int32_t* indices = reinterpret_cast<const int32_t*>(gClothData.mSelfCollisionIndices);
+	int32_t* sortedKeys = reinterpret_cast<int32_t*>(gClothData.mSelfCollisionKeys);
+	int16_t* cellStart = reinterpret_cast<int16_t*>(gClothData.mSelfCollisionCellStart);
+
+	// axes 1 and 2 (the two longest) span the 128x128 grid
+	typedef typename CurrentT::ConstPointerType ConstPointerType;
+	ConstPointerType rowPtr = ConstPointerType(mPosPtr[1]);
+	ConstPointerType colPtr = ConstPointerType(mPosPtr[2]);
+
+	float rowScale = mPosScale[1], rowBias = mPosBias[1];
+	float colScale = mPosScale[2], colBias = mPosBias[2];
+
+	// calculate keys
+	for(int32_t i = threadIdx.x; i < numIndices; i += blockDim.x)
+	{
+		// null indices means all particles participate in self collision
+		int32_t index = indices ? indices[i] : i;
+		assert(index < gClothData.mNumParticles);
+
+		int32_t rowIndex = int32_t(max(0.0f, min(rowPtr[index] * rowScale + rowBias, 127.5f)));
+		int32_t colIndex = int32_t(max(0.0f, min(colPtr[index] * colScale + colBias, 127.5f)));
+		assert(rowIndex >= 0 && rowIndex < 128 && colIndex >= 0 && colIndex < 128);
+
+		int32_t key = (colIndex << 7 | rowIndex) + 129; // + row and column sentinel
+		assert(key <= 0x4080);
+
+		sortedKeys[i] = key << 16 | index; // (key, index) pair in a single int32_t
+	}
+	__syncthreads();
+
+	// get scratch shared mem buffer used for radix sort(histogram)
+	Pointer<Shared, int32_t> buffer =
+	    reinterpret_cast<Pointer<Shared, int32_t> const&>(gCollideParticles.get().mCurData.mSphereX);
+
+	// sort keys (__synchthreads inside radix sort)
+	radixSort(sortedKeys, numIndices, buffer);
+
+	// mark cell start if keys are different between neighboring threads
+	for(int32_t i = threadIdx.x; i < numIndices; i += blockDim.x)
+	{
+		int32_t key = sortedKeys[i] >> 16;
+		// treat element 0 as a key change so the first run is always marked
+		int32_t prevKey = i ? sortedKeys[i - 1] >> 16 : key - 1;
+		if(key != prevKey)
+		{
+			// cellStart[key] = first element of the run, cellStart[key+1]
+			// for the previous key doubles as that run's end marker
+			cellStart[key] = i;
+			cellStart[prevKey + 1] = i;
+		}
+	}
+	__syncthreads();
+}
+
+// Resolves self-collisions using the sorted keys/cellStart table built by
+// buildAcceleration. Particles are temporarily reordered in shared memory in
+// sorted-key order so that neighbors in a cell are adjacent; impulses are
+// accumulated into a global-memory copy and then written back in original
+// order. When useRestPositions is set, pairs that are already within the
+// collision distance in the rest configuration are skipped.
+template <bool useRestPositions, typename CurrentT>
+__device__ void CuSelfCollision::collideParticles(CurrentT& current) const
+{
+	const int32_t* sortedKeys = reinterpret_cast<const int32_t*>(gClothData.mSelfCollisionKeys);
+	float* sortedParticles = gClothData.mSelfCollisionParticles;
+	int16_t* cellStart = reinterpret_cast<int16_t*>(gClothData.mSelfCollisionCellStart);
+
+	const float cdist = gClothData.mSelfCollisionDistance;
+	const float cdistSq = cdist * cdist;
+
+	const int32_t numIndices = gClothData.mNumSelfCollisionIndices;
+	const int32_t numParticles = gClothData.mNumParticles;
+
+	// point to particle copied in device memory that is being updated
+	float* xPtr = sortedParticles;
+	float* yPtr = sortedParticles + numParticles;
+	float* zPtr = sortedParticles + 2 * numParticles;
+	float* wPtr = sortedParticles + 3 * numParticles;
+
+	// copy current particles to temporary array
+	for(int32_t i = threadIdx.x; i < numParticles; i += blockDim.x)
+	{
+		xPtr[i] = current(i, 0);
+		yPtr[i] = current(i, 1);
+		zPtr[i] = current(i, 2);
+		wPtr[i] = current(i, 3);
+	}
+	__syncthreads();
+
+	// copy only sorted (indexed) particles to shared mem
+	// (current(i, ...) now holds the particle at sorted position i)
+	for(int32_t i = threadIdx.x; i < numIndices; i += blockDim.x)
+	{
+		int32_t index = sortedKeys[i] & UINT16_MAX;
+		current(i, 0) = xPtr[index];
+		current(i, 1) = yPtr[index];
+		current(i, 2) = zPtr[index];
+		current(i, 3) = wPtr[index];
+	}
+	__syncthreads();
+
+	typedef typename CurrentT::ConstPointerType ConstPointerType;
+	ConstPointerType rowPtr = ConstPointerType(mPosPtr[1]);
+	ConstPointerType colPtr = ConstPointerType(mPosPtr[2]);
+
+	float rowScale = mPosScale[1], rowBias = mPosBias[1];
+	float colScale = mPosScale[2], colBias = mPosBias[2];
+
+	for(int32_t i = threadIdx.x; i < numIndices; i += blockDim.x)
+	{
+		// original particle index of the i-th sorted particle
+		const int32_t index = sortedKeys[i] & UINT16_MAX;
+		assert(index < gClothData.mNumParticles);
+
+		float restX, restY, restZ;
+		if(useRestPositions)
+		{
+			const float* restIt = gFrameData.mRestPositions + index * 4;
+			restX = restIt[0];
+			restY = restIt[1];
+			restZ = restIt[2];
+		}
+
+		float posX = current(i, 0);
+		float posY = current(i, 1);
+		float posZ = current(i, 2);
+		float posW = current(i, 3);
+
+		// accumulated impulse; deltaW seeds the weighted-average denominator
+		float deltaX = 0.0f;
+		float deltaY = 0.0f;
+		float deltaZ = 0.0f;
+		float deltaW = FLT_EPSILON;
+
+		// get cell index for this particle
+		// (no +129 sentinel here: the loop below starts one column/row early)
+		int32_t rowIndex = int32_t(max(0.0f, min(rowPtr[i] * rowScale + rowBias, 127.5f)));
+		int32_t colIndex = int32_t(max(0.0f, min(colPtr[i] * colScale + colBias, 127.5f)));
+		assert(rowIndex >= 0 && rowIndex < 128 && colIndex >= 0 && colIndex < 128);
+
+		int32_t key = colIndex << 7 | rowIndex;
+		assert(key <= 0x4080);
+
+		// check cells in 3 columns
+		for(int32_t keyEnd = key + 256; key <= keyEnd; key += 128)
+		{
+			// cellStart keys of unoccupied cells have a value of -1
+			uint32_t startIndex; // min<unsigned>(cellStart[key+0..2])
+			uint32_t endIndex;   // max<signed>(0, cellStart[key+1..3])
+
+			// loads cellStart[key+0..3] and reduces them to the [start, end)
+			// range covering the 3 neighboring row cells of this column
+			asm volatile("{\n\t"
+			             " .reg .u32 start1, start2;\n\t"
+			             " ld.global.s16 %1, [%2+6];\n\t"
+			             " ld.global.s16 %0, [%2+0];\n\t"
+			             " ld.global.s16 start1, [%2+2];\n\t"
+			             " ld.global.s16 start2, [%2+4];\n\t"
+			             " max.s32 %1, %1, 0;\n\t"
+			             " min.u32 %0, %0, start1;\n\t"
+			             " max.s32 %1, %1, start1;\n\t"
+			             " min.u32 %0, %0, start2;\n\t"
+			             " max.s32 %1, %1, start2;\n\t"
+			             "}\n\t"
+			             : "=r"(startIndex), "=r"(endIndex)
+			             : POINTER_CONSTRAINT(cellStart + key));
+
+			// comparison must be unsigned to skip cells with negative startIndex
+			for(uint32_t j = startIndex; j < endIndex; ++j)
+			{
+				if(j != i) // avoid same particle
+				{
+					float dx = posX - current(j, 0);
+					float dy = posY - current(j, 1);
+					float dz = posZ - current(j, 2);
+
+					float distSqr = dx * dx + dy * dy + dz * dz;
+					if(distSqr > cdistSq)
+						continue;
+
+					if(useRestPositions)
+					{
+						const int32_t jndex = sortedKeys[j] & UINT16_MAX;
+						assert(jndex < gClothData.mNumParticles);
+
+						// calculate distance in rest configuration
+						const float* restJt = gFrameData.mRestPositions + jndex * 4;
+						float rx = restX - restJt[0];
+						float ry = restY - restJt[1];
+						float rz = restZ - restJt[2];
+
+						// skip pairs that are neighbors in the rest state
+						if(rx * rx + ry * ry + rz * rz <= cdistSq)
+							continue;
+					}
+
+					// premultiply ratio for weighted average
+					float ratio = fmaxf(0.0f, cdist * rsqrtf(FLT_EPSILON + distSqr) - 1.0f);
+					float scale = __fdividef(ratio * ratio, FLT_EPSILON + posW + current(j, 3));
+
+					deltaX += scale * dx;
+					deltaY += scale * dy;
+					deltaZ += scale * dz;
+					deltaW += ratio;
+				}
+			}
+		}
+
+		// posW is the particle's inverse-mass-like weight here
+		const float stiffness = gFrameData.mSelfCollisionStiffness * posW;
+		float scale = __fdividef(stiffness, deltaW);
+
+		// apply collision impulse
+		xPtr[index] += deltaX * scale;
+		yPtr[index] += deltaY * scale;
+		zPtr[index] += deltaZ * scale;
+
+		assert(!isnan(xPtr[index] + yPtr[index] + zPtr[index]));
+	}
+	__syncthreads();
+
+	// copy temporary particle array back to shared mem
+	// (need to copy whole array)
+	for(int32_t i = threadIdx.x; i < numParticles; i += blockDim.x)
+	{
+		current(i, 0) = xPtr[i];
+		current(i, 1) = yPtr[i];
+		current(i, 2) = zPtr[i];
+		current(i, 3) = wPtr[i];
+	}
+
+	// unmark occupied cells to empty again (faster than clearing all the cells)
+	for(int32_t i = threadIdx.x; i < numIndices; i += blockDim.x)
+	{
+		int32_t key = sortedKeys[i] >> 16;
+		cellStart[key] = 0xffff;
+		cellStart[key + 1] = 0xffff;
+	}
+	__syncthreads();
+}
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/windows/CuSolver.cpp b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuSolver.cpp
new file mode 100644
index 00000000..68238664
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuSolver.cpp
@@ -0,0 +1,556 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#include "foundation/PxProfiler.h"
+#include "CuSolver.h"
+#include "CuCloth.h"
+#include "ClothImpl.h"
+#include "CuFabric.h"
+#include "CuFactory.h"
+#include "CuSolverKernel.h"
+#include "CuContextLock.h"
+#include "CuCheckSuccess.h"
+#include "IterationState.h"
+#include "CudaKernelWrangler.h"
+#include "PsUtilities.h"
+#include "PsSort.h"
+#include "PsFoundation.h"
+
+#if PX_NVTX
+#include "nvToolsExt.h"
+#endif
+
+//#define ENABLE_CUDA_PRINTF PX_DEBUG // warning: not thread safe
+#define ENABLE_CUDA_PRINTF 0
+
+#if ENABLE_CUDA_PRINTF
+extern "C" cudaError_t cudaPrintfInit(CUmodule hmod, size_t bufferLen = 1048576);
+extern "C" void cudaPrintfEnd();
+extern "C" cudaError_t cudaPrintfDisplay(CUmodule hmod, void* outputFP = NULL, bool showThreadID = false);
+#endif
+
+using namespace physx;
+
+namespace
+{
+// name of the solver kernel entry point, passed to the KernelWrangler (which
+// expects an array of kernel names plus a count)
+const char* gKernelName = cloth::getKernelFunctionName();
+}
+
+namespace
+{
+// Minimal allocator for raw device (GPU) memory, used for small one-off
+// allocations such as the per-solver cloth-index counter. Not a full
+// STL-style allocator: no construct/destroy; caller must pair allocate()
+// with deallocate() on the same memory manager.
+template <typename T>
+struct CuDeviceAllocator
+{
+    CuDeviceAllocator(physx::PxCudaContextManager* ctx) : mManager(ctx->getMemoryManager())
+    {
+    }
+
+    // allocates n * sizeof(T) bytes in the GPU memory space
+    T* allocate(size_t n)
+    {
+        return reinterpret_cast<T*>(mManager->alloc(physx::PxCudaBufferMemorySpace::T_GPU, n * sizeof(T)));
+    }
+
+    void deallocate(T* ptr)
+    {
+        mManager->free(physx::PxCudaBufferMemorySpace::T_GPU, reinterpret_cast<physx::PxCudaBufferPtr>(ptr));
+    }
+
+    physx::PxCudaMemoryManager* mManager;
+};
+}
+
+// Creates a CUDA cloth solver bound to the given factory. Loads the solver
+// kernel through the kernel wrangler, queries its statically allocated
+// shared memory, resolves the device-side kernel data symbol, and
+// zero-initializes the shared cloth-index counter. If the kernel fails to
+// load, mCudaError is set and the solver stays inert (see hasError()).
+cloth::CuSolver::CuSolver(CuFactory& factory)
+: CuContextLock(factory) // enters the CUDA context for the constructor body
+, mFactory(factory)
+, mClothData(mFactory.mContextManager)
+, mClothDataHostCopy(CuHostAllocator(mFactory.mContextManager, cudaHostAllocWriteCombined))
+, mClothDataDirty(false)
+, mFrameData(getMappedAllocator<CuFrameData>(mFactory.mContextManager))
+, mIterationData(getMappedAllocator<CuIterationData>(mFactory.mContextManager))
+, mIterationDataBegin(0)
+, mFrameDt(0.0f)
+, mSharedMemorySize(0)
+, mSharedMemoryLimit(0)
+, mStartSimulationTask(&CuSolver::beginFrame, "cloth.CuSolver.startSimulation")
+, mKernelSimulationTask(&CuSolver::executeKernel, "cloth.CuSolver.kernelSimulation")
+, mEndSimulationTask(&CuSolver::endFrame, "cloth.CuSolver.endSimulation")
+, mStream(0)
+, mKernelModule(0)
+, mKernelFunction(0)
+, mKernelSharedMemorySize(0)
+, mClothIndex(CuDeviceAllocator<uint32_t>(mFactory.mContextManager).allocate(1))
+, mInterCollisionDistance(0.0f)
+, mInterCollisionStiffness(1.0f)
+, mInterCollisionIterations(1)
+, mInterCollisionScratchMem(NULL)
+, mInterCollisionScratchMemSize(0)
+, mKernelWrangler(getDispatcher(), physx::shdfnd::getFoundation().getErrorCallback(), &gKernelName, 1)
+, mSimulateNvtxRangeId(0)
+, mCudaError(mKernelWrangler.hadError())
+{
+    if(mCudaError)
+    {
+        // kernel failed to load: leave the CUDA context and bail out;
+        // hasError() reports the failure to the caller
+        CuContextLock::release();
+        return;
+    }
+
+    // back-pointers so the task wrappers can invoke our member functions
+    mStartSimulationTask.mSolver = this;
+    mKernelSimulationTask.mSolver = this;
+    mEndSimulationTask.mSolver = this;
+
+    if(mFactory.mContextManager->getUsingConcurrentStreams())
+        checkSuccess(cuStreamCreate(&mStream, 0));
+
+    if(1) // flip to 0 to take the PTX-loading branch below (dev iteration)
+    {
+        mKernelModule = mKernelWrangler.getCuModule(0);
+        mKernelFunction = mKernelWrangler.getCuFunction(0);
+    }
+    else
+    {
+        // load from ptx instead of embedded SASS, for iterating without recompile
+        checkSuccess(cuModuleLoad(&mKernelModule, "CuSolverKernel.ptx"));
+        checkSuccess(cuModuleGetFunction(&mKernelFunction, mKernelModule, getKernelFunctionName()));
+        shdfnd::getFoundation().error(PX_INFO, "Cloth kernel code loaded from CuSolverKernel.ptx");
+    }
+
+    // get amount of statically allocated shared memory
+    checkSuccess(cuFuncGetAttribute(&mKernelSharedMemorySize, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, mKernelFunction));
+
+    // extract CuKernelData device pointer
+    size_t size = 0;
+    CUdeviceptr ptr = 0;
+    checkSuccess(cuModuleGetGlobal(&ptr, &size, mKernelModule, getKernelDataName()));
+    mKernelData = CuDevicePointer<CuKernelData>(reinterpret_cast<CuKernelData*>(ptr));
+
+    // initialize cloth index
+    checkSuccess(cuMemsetD32(mClothIndex.dev(), 0, 1));
+
+    CuContextLock::release();
+}
+
+// Releases the solver's CUDA resources. All cloths must have been removed
+// from the solver beforehand.
+cloth::CuSolver::~CuSolver()
+{
+    PX_ASSERT(mCloths.empty());
+
+    // NOTE(review): the context acquired here is presumably released by the
+    // CuContextLock base destructor — confirm in CuContextLock.
+    CuContextLock::acquire();
+
+    // clear the device-side kernel data so no stale pointers remain
+    CuKernelData kernelData = {};
+    *mKernelData = kernelData;
+
+    CuDeviceAllocator<uint32_t>(mFactory.mContextManager).deallocate(mClothIndex.get());
+
+    if(mStream)
+        checkSuccess(cuStreamDestroy(mStream));
+
+    if(mInterCollisionScratchMem)
+        PX_FREE(mInterCollisionScratchMem);
+}
+
+// Publishes the current device-side pointers (cloth index counter, cloth
+// array, per-frame data) to the kernel's global CuKernelData symbol.
+// The CUDA context must already be acquired by the caller.
+void cloth::CuSolver::updateKernelData()
+{
+    CuKernelData data;
+    data.mClothIndex = mClothIndex.get();
+    data.mClothData = mClothData.begin().get();
+    data.mFrameData = getDevicePointer(mFrameData);
+
+    // single write through the device pointer wrapper
+    *mKernelData = data;
+}
+
+// Returns the GPU dispatcher owned by this solver's CUDA context manager.
+physx::PxGpuDispatcher& cloth::CuSolver::getDispatcher() const
+{
+    physx::PxGpuDispatcher* dispatcher = mFactory.mContextManager->getGpuDispatcher();
+    return *dispatcher;
+}
+
+namespace
+{
+// Orders cloth instances by estimated simulation cost
+// (particle count times solver frequency), most expensive first.
+struct ClothSimCostGreater
+{
+    bool operator()(const cloth::CuCloth* left, const cloth::CuCloth* right) const
+    {
+        float leftCost = left->mNumParticles * left->mSolverFrequency;
+        float rightCost = right->mNumParticles * right->mSolverFrequency;
+        return leftCost > rightCost;
+    }
+};
+}
+
+// Registers a cloth with this solver and resizes the per-cloth data arrays.
+// The cloth must not already be registered.
+void cloth::CuSolver::addCloth(Cloth* cloth)
+{
+    CuCloth& cuCloth = static_cast<CuClothImpl&>(*cloth).mCloth;
+
+    PX_ASSERT(mCloths.find(&cuCloth) == mCloths.end());
+
+    mCloths.pushBack(&cuCloth);
+    cuCloth.notifyChanged(); // marks mClothData stale so it gets re-uploaded
+
+    // keep instances ordered by estimated cost, most expensive first
+    shdfnd::sort(mCloths.begin(), mCloths.size(), ClothSimCostGreater());
+
+    // the resizes below touch device/pinned memory: lock the CUDA context
+    CuContextLock contextLock(mFactory);
+
+    uint32_t numCloths = mCloths.size();
+    mClothDataHostCopy.resize(numCloths);
+    mClothData.resize(numCloths);
+    mFrameData.resize(numCloths);
+    updateKernelData();
+}
+
+// Unregisters a cloth from this solver; no-op if it was never added.
+void cloth::CuSolver::removeCloth(Cloth* cloth)
+{
+    CuCloth& cuCloth = static_cast<CuClothImpl&>(*cloth).mCloth;
+
+    ClothVector::Iterator it = mCloths.find(&cuCloth);
+    if(it == mCloths.end())
+        return; // not registered with this solver
+
+    uint32_t index = uint32_t(it - mCloths.begin());
+
+    mCloths.remove(index);
+    mClothDataHostCopy.remove(index);
+    mClothData.resize(mCloths.size());
+    mClothDataDirty = true; // host copy changed, re-upload before next launch
+}
+
+// Schedules one simulation frame as a chain of three tasks:
+//   mStartSimulationTask -> (dispatcher pre-launch) -> mKernelSimulationTask
+//   -> (dispatcher post-launch) -> mEndSimulationTask -> continuation
+// and returns the first task of the chain for the caller to run. If there
+// is nothing to simulate, or CUDA is in an error state, the continuation
+// is returned directly.
+physx::PxBaseTask& cloth::CuSolver::simulate(float dt, physx::PxBaseTask& continuation)
+{
+    mFrameDt = dt;
+
+    if(mCloths.empty() || mCudaError)
+    {
+        // extra reference because the caller will remove one when running it
+        continuation.addReference();
+        return continuation;
+    }
+
+    physx::PxGpuDispatcher& disp = getDispatcher();
+    // wire the dependency chain back to front; the removeReference() calls
+    // balance the references taken by setContinuation()/addDependent()
+    mEndSimulationTask.setContinuation(&continuation);
+    disp.addPostLaunchDependent(mEndSimulationTask);
+    mKernelSimulationTask.setContinuation(&disp.getPostLaunchTask());
+    disp.getPostLaunchTask().removeReference();
+    disp.addPreLaunchDependent(mKernelSimulationTask);
+    mStartSimulationTask.setContinuation(&disp.getPreLaunchTask());
+    disp.getPreLaunchTask().removeReference();
+
+    mEndSimulationTask.removeReference();
+    mKernelSimulationTask.removeReference();
+
+    return mStartSimulationTask;
+}
+
+// Pre-kernel frame setup, run before the CUDA launch. Refreshes per-cloth
+// device data, (re)tunes the kernel block configuration when the cloth set
+// changed, and rebuilds the per-frame and per-iteration arrays consumed by
+// the solver kernel.
+void cloth::CuSolver::beginFrame()
+{
+    CuContextLock contextLock(mFactory);
+
+    PX_PROFILE_START_CROSSTHREAD("cloth.CuSolver.simulate", 0);
+
+    // remember the current buffer start to detect reallocation further below
+    CuIterationData* iterationDataBegin = mIterationData.empty() ? 0 : &mIterationData.front();
+
+    mFrameData.resize(0);
+    mIterationData.resize(0);
+
+    // update cloth data
+    ClothVector::Iterator cIt, cEnd = mCloths.end();
+    CuPinnedVector<CuClothData>::Type::Iterator dIt = mClothDataHostCopy.begin();
+    for(cIt = mCloths.begin(); cIt != cEnd; ++cIt, ++dIt)
+        mClothDataDirty |= (*cIt)->updateClothData(*dIt);
+
+    if(mClothDataDirty)
+    {
+        /* find optimal number of cloths per SM */
+
+        // at least 192 threads per block (e.g. CuCollision::buildAcceleration)
+        uint32_t numSMs = (uint32_t)mFactory.mContextManager->getMultiprocessorCount();
+        uint32_t maxClothsPerSM = PxMin(mFactory.mMaxThreadsPerBlock / 192, (mCloths.size() + numSMs - 1) / numSMs);
+
+        // tuning parameters: relative performance per numSharedPositions
+        float weights[3] = { 0.4f, 0.8f, 1.0f };
+
+        // try all possible number of cloths per SM and estimate performance
+        float maxWeightSum = 0.0f;
+        uint32_t numClothsPerSM = 0;
+        for(uint32_t i = 1; i <= maxClothsPerSM; ++i)
+        {
+            // shared memory available to one cloth if i cloths share an SM
+            uint32_t sharedMemoryLimit = (mFactory.mContextManager->getSharedMemPerBlock() / i) - mKernelSharedMemorySize;
+
+            float weightSum = 0.0f;
+            for(cIt = mCloths.begin(); cIt != cEnd; ++cIt)
+            {
+                uint32_t sharedMemorySize = (*cIt)->mSharedMemorySize;
+                uint32_t positionsSize = (*cIt)->mNumParticles * sizeof(PxVec4);
+
+                // i is infeasible if any cloth no longer fits
+                if(sharedMemorySize > sharedMemoryLimit)
+                    break;
+
+                uint32_t numSharedPositions = PxMin(2u, (sharedMemoryLimit - sharedMemorySize) / positionsSize);
+
+                weightSum += weights[numSharedPositions] * positionsSize;
+            }
+            // tuning parameter: inverse performance for running i cloths per SM
+            weightSum *= 2.0f + i;
+
+            // only accept i when the inner loop completed (all cloths fit)
+            if(cIt == cEnd && weightSum > maxWeightSum)
+            {
+                maxWeightSum = weightSum;
+                numClothsPerSM = i;
+            }
+        }
+        PX_ASSERT(numClothsPerSM);
+
+        // update block size (rounded down to a multiple of the 32-thread warp)
+        uint32_t numThreadsPerBlock = mFactory.mMaxThreadsPerBlock / numClothsPerSM & ~31;
+
+        // Workaround for nvbug 1709919: theoretically, register usage should allow us to launch at least
+        // mFactory.mMaxThreadsPerBlock threads, because that value corresponds to __launch_bounds__(maxThreadsPerBlock).
+        CUdevice device = 0;
+        checkSuccess(cuCtxGetDevice(&device));
+        int registersPerBlock = 0, kernelRegisterCount = 0;
+        checkSuccess(cuDeviceGetAttribute(&registersPerBlock, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, device));
+        checkSuccess(cuFuncGetAttribute(&kernelRegisterCount, CU_FUNC_ATTRIBUTE_NUM_REGS, mKernelFunction));
+        numThreadsPerBlock = PxMin(numThreadsPerBlock, uint32_t(registersPerBlock / kernelRegisterCount));
+        PX_ASSERT(numThreadsPerBlock >= 192);
+
+        if(mFactory.mNumThreadsPerBlock != numThreadsPerBlock)
+        {
+            checkSuccess(
+                cuFuncSetBlockShape(mKernelFunction, int(mFactory.mNumThreadsPerBlock = numThreadsPerBlock), 1, 1));
+        }
+
+        // remember num cloths per SM in terms of max shared memory per block
+        mSharedMemoryLimit =
+            (mFactory.mContextManager->getSharedMemPerBlock() / numClothsPerSM) - mKernelSharedMemorySize;
+    }
+
+    // build per-frame and per-iteration data, tracking the largest dynamic
+    // shared memory requirement for the upcoming launch
+    uint32_t maxSharedMemorySize = 0;
+    for(cIt = mCloths.begin(); cIt != cEnd; ++cIt)
+    {
+        CuCloth& cloth = **cIt;
+
+        uint32_t sharedMemorySize = cloth.mSharedMemorySize;
+        uint32_t positionsSize = cloth.mNumParticles * sizeof(PxVec4);
+
+        uint32_t numSharedPositions = PxMin(2u, (mSharedMemoryLimit - sharedMemorySize) / positionsSize);
+
+        maxSharedMemorySize = PxMax(maxSharedMemorySize, sharedMemorySize + numSharedPositions * positionsSize);
+
+        IterationStateFactory factory(cloth, mFrameDt);
+        IterationState<Simd4f> state = factory.create<Simd4f>(cloth);
+
+        mFrameData.pushBack(CuFrameData(cloth, numSharedPositions, state, mIterationDataBegin + mIterationData.size()));
+
+        // one CuIterationData entry per solver iteration of this frame
+        while(state.mRemainingIterations)
+        {
+            mIterationData.pushBack(CuIterationData(state));
+            state.update();
+        }
+    }
+    mSharedMemorySize = maxSharedMemorySize;
+
+    // add dummy element because we read past the end
+    mIterationData.pushBack(CuIterationData());
+
+    if(&mIterationData.front() != iterationDataBegin)
+    {
+        // mIterationData grew, update pointers
+        iterationDataBegin = getDevicePointer(mIterationData);
+
+        // patch the iteration-data pointers stored in the frame data by the
+        // relocation offset
+        ptrdiff_t diff = (char*)iterationDataBegin - (char*)mIterationDataBegin;
+        CuPinnedVector<CuFrameData>::Type::Iterator fIt = mFrameData.begin(), fEnd;
+        for(fEnd = mFrameData.end(); fIt != fEnd; ++fIt)
+            reinterpret_cast<const char*&>(fIt->mIterationData) += diff;
+
+        mIterationDataBegin = iterationDataBegin;
+    }
+}
+
+// CUDA launch stage. Uploads the per-cloth data array when dirty, then
+// launches the solver kernel with one block per cloth on this solver's
+// stream. On launch failure the solver enters a sticky error state.
+void cloth::CuSolver::executeKernel()
+{
+    CuContextLock contextLock(mFactory);
+
+#if ENABLE_CUDA_PRINTF
+    if(cudaError result = cudaPrintfInit(mKernelModule))
+    {
+        shdfnd::getFoundation().error(PxErrorCode::eINTERNAL_ERROR, __FILE__, __LINE__, "cudaPrintfInit() returned %u.",
+                                      result);
+    }
+#endif
+
+    if(mClothDataDirty)
+    {
+        // async upload of the pinned host copy into the device cloth array
+        PX_ASSERT(mClothDataHostCopy.size() == mClothData.size());
+        size_t numBytes = mClothData.size() * sizeof(CuClothData);
+        checkSuccess(cuMemcpyHtoDAsync(mClothData.begin().dev(), mClothDataHostCopy.begin(), numBytes, mStream));
+        mClothDataDirty = false;
+    }
+
+#if 0
+    static int frame = 0;
+    if(++frame == 100)
+        record(*this);
+#endif
+
+    // launch kernel: grid x = one block per cloth; dynamic shared memory
+    // sized for the most demanding cloth (computed in beginFrame)
+    CUresult result = cuLaunchKernel(mKernelFunction, mCloths.size(), 1, 1, mFactory.mNumThreadsPerBlock, 1, 1,
+                                     mSharedMemorySize, mStream, 0, 0);
+
+#if ENABLE_CUDA_PRINTF
+    cudaPrintfDisplay(mKernelModule);
+    cudaPrintfEnd();
+#endif
+
+#if PX_DEBUG
+    // in debug builds check kernel result
+    checkSuccess(result);
+    checkSuccess(cuStreamSynchronize(mStream));
+#endif
+
+    // mark the solver as being in an error state
+    // all cloth instances will be migrated to software
+    if(result != CUDA_SUCCESS)
+        mCudaError = true;
+}
+
+// Post-kernel frame wrap-up. Copies per-frame results (particle bounds,
+// sleep counters) back into each cloth, consumes the one-shot constraint
+// and target-collision buffers, then runs CPU inter-cloth collision.
+void cloth::CuSolver::endFrame()
+{
+    CuPinnedVector<CuFrameData>::Type::ConstIterator fIt = mFrameData.begin();
+    ClothVector::Iterator cIt, cEnd = mCloths.end();
+    for(cIt = mCloths.begin(); cIt != cEnd; ++cIt, ++fIt)
+    {
+        CuCloth& cloth = **cIt;
+
+        cloth.mHostParticlesDirty = false;
+        cloth.mDeviceParticlesDirty = false;
+
+        // drop the consumed motion/separation constraint sets
+        cloth.mMotionConstraints.pop();
+        cloth.mMotionConstraints.mHostCopy.resize(0);
+
+        cloth.mSeparationConstraints.pop();
+        cloth.mSeparationConstraints.mHostCopy.resize(0);
+
+        // target collision shapes become the new start shapes
+        if(!cloth.mTargetCollisionSpheres.empty())
+        {
+            shdfnd::swap(cloth.mStartCollisionSpheres, cloth.mTargetCollisionSpheres);
+            cloth.mTargetCollisionSpheres.resize(0);
+        }
+
+        if(!cloth.mTargetCollisionPlanes.empty())
+        {
+            shdfnd::swap(cloth.mStartCollisionPlanes, cloth.mTargetCollisionPlanes);
+            cloth.mTargetCollisionPlanes.resize(0);
+        }
+
+        if(!cloth.mTargetCollisionTriangles.empty())
+        {
+            shdfnd::swap(cloth.mStartCollisionTriangles, cloth.mTargetCollisionTriangles);
+            cloth.mTargetCollisionTriangles.resize(0);
+        }
+
+        // frame data stores bounds as (upper, -lower) per axis;
+        // convert to center/half-extent form
+        for(uint32_t i = 0; i < 3; ++i)
+        {
+            float upper = fIt->mParticleBounds[i * 2 + 0];
+            float negativeLower = fIt->mParticleBounds[i * 2 + 1];
+            cloth.mParticleBoundsCenter[i] = (upper - negativeLower) * 0.5f;
+            cloth.mParticleBoundsHalfExtent[i] = (upper + negativeLower) * 0.5f;
+        }
+
+        cloth.mSleepPassCounter = fIt->mSleepPassCounter;
+        cloth.mSleepTestCounter = fIt->mSleepTestCounter;
+    }
+
+    interCollision();
+
+    // fixed: the event name must match PX_PROFILE_START_CROSSTHREAD in
+    // beginFrame() ("cloth.CuSolver.simulate"); it previously used
+    // "cloth::CuSolver::simulate", leaving the cross-thread zone unmatched.
+    PX_PROFILE_STOP_CROSSTHREAD("cloth.CuSolver.simulate", 0);
+}
+
+// Runs CPU-side collision between different cloth instances on the host
+// particle copies. No-op unless both an inter-collision distance and an
+// iteration count have been configured.
+void cloth::CuSolver::interCollision()
+{
+    if(!mInterCollisionIterations || mInterCollisionDistance == 0.0f)
+        return;
+
+    typedef SwInterCollision<Simd4f> SwInterCollision;
+
+    // rebuild cloth instance array
+    mInterCollisionInstances.resize(0);
+    for(uint32_t i = 0, n = mCloths.size(); i < n; ++i)
+    {
+        CuCloth& cloth = *mCloths[i];
+
+        float elasticity = 1.0f / mFrameData[i].mNumIterations;
+        // requires up-to-date host particle copies
+        PX_ASSERT(!cloth.mHostParticlesDirty);
+        PxVec4* particles = cloth.mParticlesHostCopy.begin();
+        // default: consider all particles; otherwise restrict to the cloth's
+        // self-collision index subset
+        uint32_t* indices = NULL, numIndices = cloth.mNumParticles;
+        if(!cloth.mSelfCollisionIndices.empty())
+        {
+            indices = cloth.mSelfCollisionIndicesHost.begin();
+            numIndices = uint32_t(cloth.mSelfCollisionIndices.size());
+        }
+
+        mInterCollisionInstances.pushBack(SwInterCollisionData(
+            particles, particles + cloth.mNumParticles, numIndices, indices, cloth.mTargetMotion,
+            cloth.mParticleBoundsCenter, cloth.mParticleBoundsHalfExtent, elasticity, cloth.mUserData));
+
+        // host particles get modified below, so the device copy is now stale
+        cloth.mDeviceParticlesDirty = true;
+    }
+
+    uint32_t requiredTempMemorySize = uint32_t(
+        SwInterCollision::estimateTemporaryMemory(&mInterCollisionInstances[0], mInterCollisionInstances.size()));
+
+    // realloc temp memory if necessary
+    if(mInterCollisionScratchMemSize < requiredTempMemorySize)
+    {
+        if(mInterCollisionScratchMem)
+            PX_FREE(mInterCollisionScratchMem);
+
+        mInterCollisionScratchMem = PX_ALLOC(requiredTempMemorySize, "cloth::SwSolver::mInterCollisionScratchMem");
+        mInterCollisionScratchMemSize = requiredTempMemorySize;
+    }
+
+    SwKernelAllocator allocator(mInterCollisionScratchMem, mInterCollisionScratchMemSize);
+
+    // run inter-collision
+    SwInterCollision(mInterCollisionInstances.begin(), mInterCollisionInstances.size(), mInterCollisionDistance,
+                     mInterCollisionStiffness, mInterCollisionIterations, mInterCollisionFilter, allocator)();
+}
+
+// Stores the member function to run and the task name; mSolver is filled
+// in later by CuSolver's constructor.
+cloth::CuSolver::ClothSolverTask::ClothSolverTask(FunctionPtr functionPtr, const char* name)
+: mSolver(0), mFunctionPtr(functionPtr), mName(name)
+{
+}
+
+// Forwards task execution to the bound CuSolver member function
+// (beginFrame, executeKernel or endFrame).
+void cloth::CuSolver::ClothSolverTask::runInternal()
+{
+    (mSolver->*mFunctionPtr)();
+}
+
+// Returns the task name supplied at construction.
+const char* cloth::CuSolver::ClothSolverTask::getName() const
+{
+    return mName;
+}
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/windows/CuSolver.h b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuSolver.h
new file mode 100644
index 00000000..ff98d975
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuSolver.h
@@ -0,0 +1,180 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#include "Solver.h"
+#include "CuClothData.h"
+#include "CuPinnedAllocator.h"
+#include "CuContextLock.h"
+#include "CuDeviceVector.h"
+#include "CudaKernelWrangler.h"
+#include "CmTask.h"
+
+#include "SwInterCollision.h"
+
+namespace physx
+{
+
+namespace cloth
+{
+
+class CuCloth;
+class CuFabric;
+struct PhaseConfig;
+struct CuKernelData;
+
+// CUDA implementation of the cloth Solver interface. All registered cloths
+// are simulated by a single kernel launch per frame, scheduled through the
+// GPU dispatcher as a begin/kernel/end task chain (see CuSolver.cpp).
+class CuSolver : public UserAllocated, private CuContextLock, public Solver
+{
+#if PX_VC
+#pragma warning(push)
+#pragma warning(disable : 4371) // layout of class may have changed from a previous version of the compiler due to
+                                // better packing of member
+#endif
+    // Task wrapper forwarding runInternal() to a CuSolver member function.
+    struct ClothSolverTask : public Cm::Task
+    {
+        typedef void (CuSolver::*FunctionPtr)();
+
+        ClothSolverTask(FunctionPtr, const char*);
+        virtual void runInternal();
+        virtual const char* getName() const;
+
+        CuSolver* mSolver; // set by CuSolver's constructor
+        FunctionPtr mFunctionPtr;
+        const char* mName;
+    };
+#if PX_VC
+#pragma warning(pop)
+#endif
+
+    PX_NOCOPY(CuSolver)
+  public:
+    CuSolver(CuFactory&);
+    ~CuSolver();
+
+    virtual void addCloth(Cloth*);
+    virtual void removeCloth(Cloth*);
+
+    // schedules one frame; returns the first task of the simulation chain
+    virtual physx::PxBaseTask& simulate(float dt, physx::PxBaseTask&);
+
+    // true once a CUDA call has failed; the solver stops simulating then
+    virtual bool hasError() const
+    {
+        return mCudaError;
+    }
+
+    // inter-cloth collision settings (a distance of 0 or an iteration
+    // count of 0 disables inter-collision, see CuSolver::interCollision)
+    virtual void setInterCollisionDistance(float distance)
+    {
+        mInterCollisionDistance = distance;
+    }
+    virtual float getInterCollisionDistance() const
+    {
+        return mInterCollisionDistance;
+    }
+    virtual void setInterCollisionStiffness(float stiffness)
+    {
+        mInterCollisionStiffness = stiffness;
+    }
+    virtual float getInterCollisionStiffness() const
+    {
+        return mInterCollisionStiffness;
+    }
+    virtual void setInterCollisionNbIterations(uint32_t nbIterations)
+    {
+        mInterCollisionIterations = nbIterations;
+    }
+    virtual uint32_t getInterCollisionNbIterations() const
+    {
+        return mInterCollisionIterations;
+    }
+    virtual void setInterCollisionFilter(InterCollisionFilter filter)
+    {
+        mInterCollisionFilter = filter;
+    }
+
+  private:
+    void updateKernelData(); // context needs to be acquired
+
+    // simulate helper functions, invoked through the ClothSolverTask chain
+    void beginFrame();
+    void executeKernel();
+    void endFrame();
+
+    // CPU-side collision between different cloth instances
+    void interCollision();
+
+    physx::PxGpuDispatcher& getDispatcher() const;
+
+  private:
+    CuFactory& mFactory;
+
+    typedef Vector<CuCloth*>::Type ClothVector;
+    ClothVector mCloths; // sorted by estimated simulation cost, largest first
+
+    CuDeviceVector<CuClothData> mClothData; // device array, one entry per cloth
+    CuPinnedVector<CuClothData>::Type mClothDataHostCopy;
+    bool mClothDataDirty; // host copy changed, upload before next launch
+
+    // per-frame data, rebuilt each frame in beginFrame()
+    CuPinnedVector<CuFrameData>::Type mFrameData;
+
+    CuPinnedVector<CuIterationData>::Type mIterationData;
+    CuIterationData* mIterationDataBegin; // corresponding device ptr
+
+    float mFrameDt; // time step of the frame being scheduled
+
+    uint32_t mSharedMemorySize;  // dynamic shared memory for the next launch
+    uint32_t mSharedMemoryLimit; // per-cloth shared memory budget
+
+    ClothSolverTask mStartSimulationTask;
+    ClothSolverTask mKernelSimulationTask;
+    ClothSolverTask mEndSimulationTask;
+
+    CUstream mStream; // 0 unless concurrent streams are enabled
+    CUmodule mKernelModule;
+    CUfunction mKernelFunction;
+    int mKernelSharedMemorySize; // statically allocated shared memory of the kernel
+    CuDevicePointer<CuKernelData> mKernelData; // device-side global of the kernel
+    CuDevicePointer<uint32_t> mClothIndex;     // shared cloth-index counter
+
+    // inter-collision configuration and scratch memory
+    float mInterCollisionDistance;
+    float mInterCollisionStiffness;
+    uint32_t mInterCollisionIterations;
+    InterCollisionFilter mInterCollisionFilter;
+    void* mInterCollisionScratchMem;
+    uint32_t mInterCollisionScratchMemSize;
+    shdfnd::Array<SwInterCollisionData> mInterCollisionInstances;
+
+    physx::KernelWrangler mKernelWrangler; // loads the embedded solver kernel
+
+    uint64_t mSimulateNvtxRangeId;
+
+    bool mCudaError; // sticky CUDA failure flag, see hasError()
+
+    friend void record(const CuSolver&);
+};
+}
+}
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/windows/CuSolverKernel.h b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuSolverKernel.h
new file mode 100644
index 00000000..d6ca350f
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuSolverKernel.h
@@ -0,0 +1,57 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#include "Types.h"
+
+namespace physx
+{
+namespace cloth
+{
+struct CuClothData;
+struct CuFrameData;
+
+// data of all cloth instances, one block per instance
+// Host-side mirror of the kernel's global data symbol (named by
+// getKernelDataName()); written from the host by CuSolver::updateKernelData().
+struct CuKernelData
+{
+    // pointer to atomic variable (zero-initialized by CuSolver's constructor)
+    uint32_t* mClothIndex;
+
+    // array of cloths (length determined by grid dim)
+    const CuClothData* mClothData;
+
+    // frame data per cloth
+    CuFrameData* mFrameData;
+};
+
+// name of the kernel's global CuKernelData symbol
+const char* getKernelDataName();
+// name of the solver kernel entry point
+const char* getKernelFunctionName();
+}
+}