aboutsummaryrefslogtreecommitdiff
path: root/PhysX_3.4/Source/LowLevelCloth/src
diff options
context:
space:
mode:
authorgit perforce import user <a@b>2016-10-25 12:29:14 -0600
committerSheikh Dawood Abdul Ajees <Sheikh Dawood Abdul Ajees>2016-10-25 18:56:37 -0500
commit3dfe2108cfab31ba3ee5527e217d0d8e99a51162 (patch)
treefa6485c169e50d7415a651bf838f5bcd0fd3bfbd /PhysX_3.4/Source/LowLevelCloth/src
downloadphysx-3.4-3dfe2108cfab31ba3ee5527e217d0d8e99a51162.tar.xz
physx-3.4-3dfe2108cfab31ba3ee5527e217d0d8e99a51162.zip
Initial commit:
PhysX 3.4.0 Update @ 21294896 APEX 1.4.0 Update @ 21275617 [CL 21300167]
Diffstat (limited to 'PhysX_3.4/Source/LowLevelCloth/src')
-rw-r--r--PhysX_3.4/Source/LowLevelCloth/src/Allocator.cpp46
-rw-r--r--PhysX_3.4/Source/LowLevelCloth/src/Allocator.h74
-rw-r--r--PhysX_3.4/Source/LowLevelCloth/src/Array.h69
-rw-r--r--PhysX_3.4/Source/LowLevelCloth/src/BoundingBox.h103
-rw-r--r--PhysX_3.4/Source/LowLevelCloth/src/ClothBase.h133
-rw-r--r--PhysX_3.4/Source/LowLevelCloth/src/ClothImpl.h1302
-rw-r--r--PhysX_3.4/Source/LowLevelCloth/src/Factory.cpp71
-rw-r--r--PhysX_3.4/Source/LowLevelCloth/src/IndexPair.h46
-rw-r--r--PhysX_3.4/Source/LowLevelCloth/src/IterationState.h403
-rw-r--r--PhysX_3.4/Source/LowLevelCloth/src/MovingAverage.h145
-rw-r--r--PhysX_3.4/Source/LowLevelCloth/src/PhaseConfig.cpp75
-rw-r--r--PhysX_3.4/Source/LowLevelCloth/src/PointInterpolator.h168
-rw-r--r--PhysX_3.4/Source/LowLevelCloth/src/Simd.h43
-rw-r--r--PhysX_3.4/Source/LowLevelCloth/src/StackAllocator.h155
-rw-r--r--PhysX_3.4/Source/LowLevelCloth/src/SwCloth.cpp305
-rw-r--r--PhysX_3.4/Source/LowLevelCloth/src/SwCloth.h210
-rw-r--r--PhysX_3.4/Source/LowLevelCloth/src/SwClothData.cpp154
-rw-r--r--PhysX_3.4/Source/LowLevelCloth/src/SwClothData.h151
-rw-r--r--PhysX_3.4/Source/LowLevelCloth/src/SwCollision.cpp1935
-rw-r--r--PhysX_3.4/Source/LowLevelCloth/src/SwCollision.h138
-rw-r--r--PhysX_3.4/Source/LowLevelCloth/src/SwCollisionHelpers.h84
-rw-r--r--PhysX_3.4/Source/LowLevelCloth/src/SwFabric.cpp177
-rw-r--r--PhysX_3.4/Source/LowLevelCloth/src/SwFabric.h109
-rw-r--r--PhysX_3.4/Source/LowLevelCloth/src/SwFactory.cpp297
-rw-r--r--PhysX_3.4/Source/LowLevelCloth/src/SwFactory.h90
-rw-r--r--PhysX_3.4/Source/LowLevelCloth/src/SwInterCollision.cpp714
-rw-r--r--PhysX_3.4/Source/LowLevelCloth/src/SwInterCollision.h144
-rw-r--r--PhysX_3.4/Source/LowLevelCloth/src/SwSelfCollision.cpp426
-rw-r--r--PhysX_3.4/Source/LowLevelCloth/src/SwSelfCollision.h83
-rw-r--r--PhysX_3.4/Source/LowLevelCloth/src/SwSolver.cpp294
-rw-r--r--PhysX_3.4/Source/LowLevelCloth/src/SwSolver.h153
-rw-r--r--PhysX_3.4/Source/LowLevelCloth/src/SwSolverKernel.cpp781
-rw-r--r--PhysX_3.4/Source/LowLevelCloth/src/SwSolverKernel.h84
-rw-r--r--PhysX_3.4/Source/LowLevelCloth/src/TripletScheduler.cpp246
-rw-r--r--PhysX_3.4/Source/LowLevelCloth/src/TripletScheduler.h56
-rw-r--r--PhysX_3.4/Source/LowLevelCloth/src/Vec4T.h104
-rw-r--r--PhysX_3.4/Source/LowLevelCloth/src/avx/SwSolveConstraints.cpp932
-rw-r--r--PhysX_3.4/Source/LowLevelCloth/src/neon/NeonCollision.cpp34
-rw-r--r--PhysX_3.4/Source/LowLevelCloth/src/neon/NeonSelfCollision.cpp34
-rw-r--r--PhysX_3.4/Source/LowLevelCloth/src/neon/NeonSolverKernel.cpp49
-rw-r--r--PhysX_3.4/Source/LowLevelCloth/src/neon/SwCollisionHelpers.h87
-rw-r--r--PhysX_3.4/Source/LowLevelCloth/src/scalar/SwCollisionHelpers.h92
-rw-r--r--PhysX_3.4/Source/LowLevelCloth/src/sse2/SwCollisionHelpers.h92
-rw-r--r--PhysX_3.4/Source/LowLevelCloth/src/sse2/SwSolveConstraints.h392
-rw-r--r--PhysX_3.4/Source/LowLevelCloth/src/windows/ClothClone.h225
-rw-r--r--PhysX_3.4/Source/LowLevelCloth/src/windows/CuCheckSuccess.h45
-rw-r--r--PhysX_3.4/Source/LowLevelCloth/src/windows/CuCloth.cpp511
-rw-r--r--PhysX_3.4/Source/LowLevelCloth/src/windows/CuCloth.h216
-rw-r--r--PhysX_3.4/Source/LowLevelCloth/src/windows/CuClothClone.cpp83
-rw-r--r--PhysX_3.4/Source/LowLevelCloth/src/windows/CuClothData.cpp238
-rw-r--r--PhysX_3.4/Source/LowLevelCloth/src/windows/CuClothData.h191
-rw-r--r--PhysX_3.4/Source/LowLevelCloth/src/windows/CuCollision.h1505
-rw-r--r--PhysX_3.4/Source/LowLevelCloth/src/windows/CuContextLock.cpp54
-rw-r--r--PhysX_3.4/Source/LowLevelCloth/src/windows/CuContextLock.h57
-rw-r--r--PhysX_3.4/Source/LowLevelCloth/src/windows/CuDevicePointer.h216
-rw-r--r--PhysX_3.4/Source/LowLevelCloth/src/windows/CuDeviceVector.h258
-rw-r--r--PhysX_3.4/Source/LowLevelCloth/src/windows/CuFabric.cpp197
-rw-r--r--PhysX_3.4/Source/LowLevelCloth/src/windows/CuFabric.h102
-rw-r--r--PhysX_3.4/Source/LowLevelCloth/src/windows/CuFactory.cpp398
-rw-r--r--PhysX_3.4/Source/LowLevelCloth/src/windows/CuFactory.h107
-rw-r--r--PhysX_3.4/Source/LowLevelCloth/src/windows/CuPhaseConfig.h51
-rw-r--r--PhysX_3.4/Source/LowLevelCloth/src/windows/CuPinnedAllocator.h132
-rw-r--r--PhysX_3.4/Source/LowLevelCloth/src/windows/CuSelfCollision.h472
-rw-r--r--PhysX_3.4/Source/LowLevelCloth/src/windows/CuSolver.cpp556
-rw-r--r--PhysX_3.4/Source/LowLevelCloth/src/windows/CuSolver.h180
-rw-r--r--PhysX_3.4/Source/LowLevelCloth/src/windows/CuSolverKernel.h57
66 files changed, 17131 insertions, 0 deletions
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/Allocator.cpp b/PhysX_3.4/Source/LowLevelCloth/src/Allocator.cpp
new file mode 100644
index 00000000..7a322ce9
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/Allocator.cpp
@@ -0,0 +1,46 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#include "Allocator.h"
+#include "PsAlignedMalloc.h"
+
+namespace physx
+{
+
+// Allocates n bytes through the PhysX foundation allocator.
+// Returns a null pointer for n == 0 (the allocator is never called in that case).
+void* cloth::allocate(size_t n)
+{
+ return n ? physx::shdfnd::getAllocator().allocate(n, "", __FILE__, __LINE__) : 0;
+}
+
+// Releases memory obtained from cloth::allocate(); a null pointer is a no-op.
+void cloth::deallocate(void* ptr)
+{
+ if(ptr)
+  physx::shdfnd::getAllocator().deallocate(ptr);
+}
+}
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/Allocator.h b/PhysX_3.4/Source/LowLevelCloth/src/Allocator.h
new file mode 100644
index 00000000..d99c1708
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/Allocator.h
@@ -0,0 +1,74 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#include "Types.h"
+#include "PsArray.h"
+#include "PsAlignedMalloc.h"
+
+namespace physx
+{
+namespace cloth
+{
+
+void* allocate(size_t);
+void deallocate(void*);
+
+/* templated typedefs for convenience */
+
+// Convenience alias for a foundation Array that does not track allocations.
+template <typename T>
+struct Vector
+{
+ typedef physx::shdfnd::Array<T, physx::shdfnd::NonTrackingAllocator> Type;
+};
+
+// Convenience alias for a foundation Array whose storage is aligned to
+// 'alignment' bytes (used for SIMD-friendly containers).
+template <typename T, size_t alignment>
+struct AlignedVector
+{
+ typedef physx::shdfnd::Array<T, physx::shdfnd::AlignedAllocator<alignment> > Type;
+};
+
+// Base class that routes operator new/delete of derived objects through
+// cloth::allocate()/deallocate(), so all heap traffic goes via the
+// foundation allocator. Virtual destructor makes deletion through a
+// base pointer safe.
+struct UserAllocated
+{
+ virtual ~UserAllocated()
+ {
+ }
+ static void* operator new(size_t n)
+ {
+  return allocate(n);
+ }
+ static void operator delete(void* ptr)
+ {
+  deallocate(ptr);
+ }
+};
+
+} // namespace cloth
+}
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/Array.h b/PhysX_3.4/Source/LowLevelCloth/src/Array.h
new file mode 100644
index 00000000..75ba2f50
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/Array.h
@@ -0,0 +1,69 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#include "foundation/PxVec4.h"
+#include "foundation/PxQuat.h"
+#include "foundation/PxVec3.h"
+
+namespace physx
+{
+
+namespace cloth
+{
+
+// Views a math type as a fixed-size PxReal array (mutable and const overloads).
+// NOTE(review): relies on PxVec3/PxVec4/PxQuat being laid out as exactly
+// 3 resp. 4 contiguous PxReals with no padding — holds for the PhysX math
+// types, but confirm against the foundation headers before reuse elsewhere.
+inline PxReal (&array(PxVec3& v))[3]
+{
+ return reinterpret_cast<PxReal(&)[3]>(v);
+}
+inline const PxReal (&array(const PxVec3& v))[3]
+{
+ return reinterpret_cast<const PxReal(&)[3]>(v);
+}
+inline PxReal (&array(PxVec4& v))[4]
+{
+ return reinterpret_cast<PxReal(&)[4]>(v);
+}
+inline const PxReal (&array(const PxVec4& v))[4]
+{
+ return reinterpret_cast<const PxReal(&)[4]>(v);
+}
+inline PxReal (&array(PxQuat& q))[4]
+{
+ return reinterpret_cast<PxReal(&)[4]>(q);
+}
+inline const PxReal (&array(const PxQuat& q))[4]
+{
+ return reinterpret_cast<const PxReal(&)[4]>(q);
+}
+
+} // namespace cloth
+
+} // namespace physx
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/BoundingBox.h b/PhysX_3.4/Source/LowLevelCloth/src/BoundingBox.h
new file mode 100644
index 00000000..bd33affa
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/BoundingBox.h
@@ -0,0 +1,103 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#include "Simd.h"
+
+namespace physx
+{
+
+namespace cloth
+{
+
+// Axis-aligned bounding box stored as two SIMD vectors (min and max corner).
+template <typename Simd4f>
+struct BoundingBox
+{
+ Simd4f mLower;
+ Simd4f mUpper;
+};
+
+// Loads a bounding box from 6 packed floats: lower xyz at ptr[0..2],
+// upper xyz at ptr[3..5].
+// NOTE(review): both SIMD loads read 4 lanes, so the two loads overlap at
+// ptr[3] and the 4th lane of each vector carries a neighboring value —
+// presumably callers ignore the w lane; confirm at call sites.
+template <typename Simd4f>
+inline BoundingBox<Simd4f> loadBounds(const float* ptr)
+{
+ BoundingBox<Simd4f> result;
+ result.mLower = load(ptr);
+ result.mUpper = load(ptr + 3);
+ return result;
+}
+
+// Returns an inverted (empty) box: lower = +FLT_MAX, upper = -FLT_MAX,
+// so that expanding it with any point yields that point's bounds.
+template <typename Simd4f>
+inline BoundingBox<Simd4f> emptyBounds()
+{
+ BoundingBox<Simd4f> result;
+
+ result.mLower = gSimd4fFloatMax;
+ result.mUpper = -result.mLower;
+
+ return result;
+}
+
+// Grows 'bounds' to enclose every point in the half-open range [pIt, pEnd).
+template <typename Simd4f>
+inline BoundingBox<Simd4f> expandBounds(const BoundingBox<Simd4f>& bounds, const Simd4f* pIt, const Simd4f* pEnd)
+{
+ BoundingBox<Simd4f> result = bounds;
+ for(; pIt != pEnd; ++pIt)
+ {
+  result.mLower = min(result.mLower, *pIt);
+  result.mUpper = max(result.mUpper, *pIt);
+ }
+ return result;
+}
+
+// Returns the smallest box enclosing both a and b (component-wise union).
+template <typename Simd4f>
+inline BoundingBox<Simd4f> expandBounds(const BoundingBox<Simd4f>& a, const BoundingBox<Simd4f>& b)
+{
+ BoundingBox<Simd4f> result;
+ result.mLower = min(a.mLower, b.mLower);
+ result.mUpper = max(a.mUpper, b.mUpper);
+ return result;
+}
+
+// Component-wise intersection of two boxes; may produce an inverted box
+// (lower > upper) when a and b do not overlap — test with isEmptyBounds().
+template <typename Simd4f>
+inline BoundingBox<Simd4f> intersectBounds(const BoundingBox<Simd4f>& a, const BoundingBox<Simd4f>& b)
+{
+ BoundingBox<Simd4f> result;
+ result.mLower = max(a.mLower, b.mLower);
+ result.mUpper = min(a.mUpper, b.mUpper);
+ return result;
+}
+
+// True if the box encloses no volume, i.e. any lower component exceeds
+// the corresponding upper component.
+template <typename Simd4f>
+inline bool isEmptyBounds(const BoundingBox<Simd4f>& a)
+{
+ return anyGreater(a.mLower, a.mUpper) != 0;
+}
+}
+}
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/ClothBase.h b/PhysX_3.4/Source/LowLevelCloth/src/ClothBase.h
new file mode 100644
index 00000000..fef5090e
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/ClothBase.h
@@ -0,0 +1,133 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#include "PsMathUtils.h"
+
+namespace physx
+{
+namespace cloth
+{
+
+/* helper functions shared between SwCloth and CuCloth */
+
+// Sets every tunable field of a freshly constructed cloth to its default and
+// computes the initial particle bounding box from the particle range [pIt, pEnd).
+// Log-encoded stiffness fields use -FLT_MAX_EXP as the log2(0) sentinel
+// (see safeLog2/safeExp2 in ClothImpl.h).
+// NOTE(review): mUserData is not initialized here although copy() transfers it —
+// presumably set by the concrete cloth constructor; confirm.
+template <typename Cloth>
+void initialize(Cloth& cloth, const PxVec4* pIt, const PxVec4* pEnd)
+{
+ // initialize particles bounding box
+ PxVec4 lower(FLT_MAX), upper = -lower;
+ for(; pIt != pEnd; ++pIt)
+ {
+  lower = lower.minimum(*pIt);
+  upper = upper.maximum(*pIt);
+ }
+ // store box as center + half extent (w components discarded by the PxVec3 view)
+ PxVec4 center = (upper + lower) * 0.5f;
+ PxVec4 extent = (upper - lower) * 0.5f;
+ cloth.mParticleBoundsCenter = reinterpret_cast<const PxVec3&>(center);
+ cloth.mParticleBoundsHalfExtent = reinterpret_cast<const PxVec3&>(extent);
+
+ cloth.mGravity = PxVec3(0.0f);
+ cloth.mLogDamping = PxVec3(0.0f);
+ cloth.mLinearLogDrag = PxVec3(0.0f);
+ cloth.mAngularLogDrag = PxVec3(0.0f);
+ cloth.mLinearInertia = PxVec3(1.0f);
+ cloth.mAngularInertia = PxVec3(1.0f);
+ cloth.mCentrifugalInertia = PxVec3(1.0f);
+ cloth.mSolverFrequency = 60.0f;
+ cloth.mStiffnessFrequency = 10.0f;
+ cloth.mTargetMotion = PxTransform(PxIdentity);
+ cloth.mCurrentMotion = PxTransform(PxIdentity);
+ cloth.mLinearVelocity = PxVec3(0.0f);
+ cloth.mAngularVelocity = PxVec3(0.0f);
+ cloth.mPrevIterDt = 0.0f;
+ cloth.mIterDtAvg = MovingAverage(30);
+ cloth.mTetherConstraintLogStiffness = PxReal(-FLT_MAX_EXP);
+ cloth.mTetherConstraintScale = 1.0f;
+ cloth.mMotionConstraintScale = 1.0f;
+ cloth.mMotionConstraintBias = 0.0f;
+ cloth.mMotionConstraintLogStiffness = PxReal(-FLT_MAX_EXP);
+ cloth.mWind = PxVec3(0.0f);
+ cloth.mDragLogCoefficient = 0.0f;
+ cloth.mLiftLogCoefficient = 0.0f;
+ cloth.mEnableContinuousCollision = false;
+ cloth.mCollisionMassScale = 0.0f;
+ cloth.mFriction = 0.0f;
+ cloth.mSelfCollisionDistance = 0.0f;
+ cloth.mSelfCollisionLogStiffness = PxReal(-FLT_MAX_EXP);
+ // uint32_t(-1) == "never": sleep testing disabled by default
+ cloth.mSleepTestInterval = uint32_t(-1);
+ cloth.mSleepAfterCount = uint32_t(-1);
+ cloth.mSleepThreshold = 0.0f;
+ cloth.mSleepPassCounter = 0;
+ cloth.mSleepTestCounter = 0;
+}
+
+// Field-by-field transfer of all shared cloth state from srcCloth to dstCloth;
+// used when cloning a cloth across backends (e.g. SwCloth -> CuCloth).
+// Keep this list in sync with the defaults set in initialize() above.
+template <typename DstCloth, typename SrcCloth>
+void copy(DstCloth& dstCloth, const SrcCloth& srcCloth)
+{
+ dstCloth.mParticleBoundsCenter = srcCloth.mParticleBoundsCenter;
+ dstCloth.mParticleBoundsHalfExtent = srcCloth.mParticleBoundsHalfExtent;
+ dstCloth.mGravity = srcCloth.mGravity;
+ dstCloth.mLogDamping = srcCloth.mLogDamping;
+ dstCloth.mLinearLogDrag = srcCloth.mLinearLogDrag;
+ dstCloth.mAngularLogDrag = srcCloth.mAngularLogDrag;
+ dstCloth.mLinearInertia = srcCloth.mLinearInertia;
+ dstCloth.mAngularInertia = srcCloth.mAngularInertia;
+ dstCloth.mCentrifugalInertia = srcCloth.mCentrifugalInertia;
+ dstCloth.mSolverFrequency = srcCloth.mSolverFrequency;
+ dstCloth.mStiffnessFrequency = srcCloth.mStiffnessFrequency;
+ dstCloth.mTargetMotion = srcCloth.mTargetMotion;
+ dstCloth.mCurrentMotion = srcCloth.mCurrentMotion;
+ dstCloth.mLinearVelocity = srcCloth.mLinearVelocity;
+ dstCloth.mAngularVelocity = srcCloth.mAngularVelocity;
+ dstCloth.mPrevIterDt = srcCloth.mPrevIterDt;
+ dstCloth.mIterDtAvg = srcCloth.mIterDtAvg;
+ dstCloth.mTetherConstraintLogStiffness = srcCloth.mTetherConstraintLogStiffness;
+ dstCloth.mTetherConstraintScale = srcCloth.mTetherConstraintScale;
+ dstCloth.mMotionConstraintScale = srcCloth.mMotionConstraintScale;
+ dstCloth.mMotionConstraintBias = srcCloth.mMotionConstraintBias;
+ dstCloth.mMotionConstraintLogStiffness = srcCloth.mMotionConstraintLogStiffness;
+ dstCloth.mWind = srcCloth.mWind;
+ dstCloth.mDragLogCoefficient = srcCloth.mDragLogCoefficient;
+ dstCloth.mLiftLogCoefficient = srcCloth.mLiftLogCoefficient;
+ dstCloth.mEnableContinuousCollision = srcCloth.mEnableContinuousCollision;
+ dstCloth.mCollisionMassScale = srcCloth.mCollisionMassScale;
+ dstCloth.mFriction = srcCloth.mFriction;
+ dstCloth.mSelfCollisionDistance = srcCloth.mSelfCollisionDistance;
+ dstCloth.mSelfCollisionLogStiffness = srcCloth.mSelfCollisionLogStiffness;
+ dstCloth.mSleepTestInterval = srcCloth.mSleepTestInterval;
+ dstCloth.mSleepAfterCount = srcCloth.mSleepAfterCount;
+ dstCloth.mSleepThreshold = srcCloth.mSleepThreshold;
+ dstCloth.mSleepPassCounter = srcCloth.mSleepPassCounter;
+ dstCloth.mSleepTestCounter = srcCloth.mSleepTestCounter;
+ dstCloth.mUserData = srcCloth.mUserData;
+}
+
+} // namespace cloth
+} // namespace physx
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/ClothImpl.h b/PhysX_3.4/Source/LowLevelCloth/src/ClothImpl.h
new file mode 100644
index 00000000..2cc491c5
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/ClothImpl.h
@@ -0,0 +1,1302 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#include "Cloth.h"
+#include "Fabric.h"
+#include "Allocator.h"
+#include "PsMathUtils.h"
+
+namespace physx
+{
+namespace cloth
+{
+
+// SwCloth or CuCloth aggregate implementing the Cloth interface
+// Member specializations are implemented in Sw/CuCloth.cpp
+template <typename T>
+class ClothImpl : public UserAllocated, public Cloth
+{
+ // copy constructor declared private and left undefined: cloning goes
+ // through clone(Factory&) instead
+ ClothImpl(const ClothImpl&);
+
+ public:
+ ClothImpl& operator=(const ClothImpl&);
+
+ typedef T ClothType;
+ typedef typename ClothType::FactoryType FactoryType;
+ typedef typename ClothType::FabricType FabricType;
+ typedef typename ClothType::ContextLockType ContextLockType;
+
+ ClothImpl(Factory&, Fabric&, Range<const PxVec4>);
+ ClothImpl(Factory&, const ClothImpl&);
+
+ virtual Cloth* clone(Factory& factory) const;
+
+ virtual Fabric& getFabric() const;
+ virtual Factory& getFactory() const;
+
+ // particle access (current/previous iteration state)
+ virtual uint32_t getNumParticles() const;
+ virtual void lockParticles() const;
+ virtual void unlockParticles() const;
+ virtual MappedRange<PxVec4> getCurrentParticles();
+ virtual MappedRange<const PxVec4> getCurrentParticles() const;
+ virtual MappedRange<PxVec4> getPreviousParticles();
+ virtual MappedRange<const PxVec4> getPreviousParticles() const;
+ virtual GpuParticles getGpuParticles();
+
+ // rigid-frame motion of the cloth's local frame
+ virtual void setTranslation(const PxVec3& trans);
+ virtual void setRotation(const PxQuat& rot);
+
+ virtual const PxVec3& getTranslation() const;
+ virtual const PxQuat& getRotation() const;
+
+ virtual void clearInertia();
+
+ virtual void teleport(const PxVec3& delta);
+
+ virtual float getPreviousIterationDt() const;
+ virtual void setGravity(const PxVec3& gravity);
+ virtual PxVec3 getGravity() const;
+ virtual void setDamping(const PxVec3& damping);
+ virtual PxVec3 getDamping() const;
+ virtual void setLinearDrag(const PxVec3& drag);
+ virtual PxVec3 getLinearDrag() const;
+ virtual void setAngularDrag(const PxVec3& drag);
+ virtual PxVec3 getAngularDrag() const;
+ virtual void setLinearInertia(const PxVec3& inertia);
+ virtual PxVec3 getLinearInertia() const;
+ virtual void setAngularInertia(const PxVec3& inertia);
+ virtual PxVec3 getAngularInertia() const;
+ virtual void setCentrifugalInertia(const PxVec3& inertia);
+ virtual PxVec3 getCentrifugalInertia() const;
+
+ virtual void setSolverFrequency(float frequency);
+ virtual float getSolverFrequency() const;
+
+ virtual void setStiffnessFrequency(float frequency);
+ virtual float getStiffnessFrequency() const;
+
+ // NOTE(review): "Acceleation" spelling presumably matches the base Cloth
+ // interface; renaming would break the override — confirm against Cloth.h
+ virtual void setAcceleationFilterWidth(uint32_t);
+ virtual uint32_t getAccelerationFilterWidth() const;
+
+ virtual void setPhaseConfig(Range<const PhaseConfig> configs);
+
+ // collision shapes
+ virtual void setSpheres(Range<const PxVec4>, uint32_t first, uint32_t last);
+ virtual uint32_t getNumSpheres() const;
+
+ virtual void setCapsules(Range<const uint32_t>, uint32_t first, uint32_t last);
+ virtual uint32_t getNumCapsules() const;
+
+ virtual void setPlanes(Range<const PxVec4>, uint32_t first, uint32_t last);
+ virtual uint32_t getNumPlanes() const;
+
+ virtual void setConvexes(Range<const uint32_t>, uint32_t first, uint32_t last);
+ virtual uint32_t getNumConvexes() const;
+
+ virtual void setTriangles(Range<const PxVec3>, uint32_t first, uint32_t last);
+ virtual void setTriangles(Range<const PxVec3>, Range<const PxVec3>, uint32_t first);
+ virtual uint32_t getNumTriangles() const;
+
+ virtual bool isContinuousCollisionEnabled() const;
+ virtual void enableContinuousCollision(bool);
+
+ virtual float getCollisionMassScale() const;
+ virtual void setCollisionMassScale(float);
+ virtual void setFriction(float friction);
+ virtual float getFriction() const;
+
+ virtual void setVirtualParticles(Range<const uint32_t[4]>, Range<const PxVec3>);
+ virtual uint32_t getNumVirtualParticles() const;
+ virtual uint32_t getNumVirtualParticleWeights() const;
+
+ // constraints
+ virtual void setTetherConstraintScale(float scale);
+ virtual float getTetherConstraintScale() const;
+ virtual void setTetherConstraintStiffness(float stiffness);
+ virtual float getTetherConstraintStiffness() const;
+
+ virtual Range<PxVec4> getMotionConstraints();
+ virtual void clearMotionConstraints();
+ virtual uint32_t getNumMotionConstraints() const;
+ virtual void setMotionConstraintScaleBias(float scale, float bias);
+ virtual float getMotionConstraintScale() const;
+ virtual float getMotionConstraintBias() const;
+ virtual void setMotionConstraintStiffness(float stiffness);
+ virtual float getMotionConstraintStiffness() const;
+
+ virtual Range<PxVec4> getSeparationConstraints();
+ virtual void clearSeparationConstraints();
+ virtual uint32_t getNumSeparationConstraints() const;
+
+ virtual void clearInterpolation();
+
+ virtual Range<PxVec4> getParticleAccelerations();
+ virtual void clearParticleAccelerations();
+ virtual uint32_t getNumParticleAccelerations() const;
+
+ // aerodynamics
+ virtual void setWindVelocity(PxVec3);
+ virtual PxVec3 getWindVelocity() const;
+ virtual void setDragCoefficient(float);
+ virtual float getDragCoefficient() const;
+ virtual void setLiftCoefficient(float);
+ virtual float getLiftCoefficient() const;
+
+ // self collision
+ virtual void setSelfCollisionDistance(float);
+ virtual float getSelfCollisionDistance() const;
+ virtual void setSelfCollisionStiffness(float);
+ virtual float getSelfCollisionStiffness() const;
+
+ virtual void setSelfCollisionIndices(Range<const uint32_t>);
+ virtual uint32_t getNumSelfCollisionIndices() const;
+
+ virtual void setRestPositions(Range<const PxVec4>);
+ virtual uint32_t getNumRestPositions() const;
+
+ virtual const PxVec3& getBoundingBoxCenter() const;
+ virtual const PxVec3& getBoundingBoxScale() const;
+
+ // sleeping
+ virtual void setSleepThreshold(float);
+ virtual float getSleepThreshold() const;
+ virtual void setSleepTestInterval(uint32_t);
+ virtual uint32_t getSleepTestInterval() const;
+ virtual void setSleepAfterCount(uint32_t);
+ virtual uint32_t getSleepAfterCount() const;
+ virtual uint32_t getSleepPassCount() const;
+ virtual bool isAsleep() const;
+ virtual void putToSleep();
+ virtual void wakeUp();
+
+ virtual void setUserData(void*);
+ virtual void* getUserData() const;
+
+ // helper function
+ template <typename U>
+ MappedRange<U> getMappedParticles(U* data) const;
+
+ // the wrapped platform-specific cloth object (SwCloth/CuCloth/DxCloth)
+ ClothType mCloth;
+};
+
+// Forward declarations of the per-backend cloth types and their
+// corresponding ClothImpl instantiations.
+class SwCloth;
+typedef ClothImpl<SwCloth> SwClothImpl;
+
+class CuCloth;
+typedef ClothImpl<CuCloth> CuClothImpl;
+
+class DxCloth;
+typedef ClothImpl<DxCloth> DxClothImpl;
+
+// Constructs a cloth over an existing fabric and initial particle positions.
+template <typename T>
+ClothImpl<T>::ClothImpl(Factory& factory, Fabric& fabric, Range<const PxVec4> particles)
+: mCloth(static_cast<FactoryType&>(factory), static_cast<FabricType&>(fabric), particles)
+{
+ // fabric and cloth need to be created by the same factory
+ PX_ASSERT(&fabric.getFactory() == &factory);
+}
+
+// Cloning constructor: copies another ClothImpl's state into a cloth owned
+// by (possibly a different) factory.
+template <typename T>
+ClothImpl<T>::ClothImpl(Factory& factory, const ClothImpl& impl)
+: mCloth(static_cast<FactoryType&>(factory), impl.mCloth)
+{
+}
+
+// Returns the fabric this cloth was created from.
+template <typename T>
+inline Fabric& ClothImpl<T>::getFabric() const
+{
+ return mCloth.mFabric;
+}
+
+// Returns the factory that owns this cloth.
+template <typename T>
+inline Factory& ClothImpl<T>::getFactory() const
+{
+ return mCloth.mFactory;
+}
+
+// Sets the target position of the cloth's local frame; wakes the cloth only
+// when the value actually changes.
+template <typename T>
+inline void ClothImpl<T>::setTranslation(const PxVec3& trans)
+{
+ // 'trans' is already a PxVec3 — the original identity
+ // reinterpret_cast<const PxVec3&>(trans) was redundant and has been removed
+ if(trans == mCloth.mTargetMotion.p)
+  return;
+
+ mCloth.mTargetMotion.p = trans;
+ mCloth.wakeUp();
+}
+
// Set the target rigid-frame rotation; no-op (and no wake) when unchanged.
template <typename T>
inline void ClothImpl<T>::setRotation(const PxQuat& q)
{
	// exact comparison via zero squared difference of the two quaternions
	if((q - mCloth.mTargetMotion.q).magnitudeSquared() == 0.0f)
		return;

	mCloth.mTargetMotion.q = q;
	mCloth.wakeUp();
}

// Current target-frame translation.
template <typename T>
inline const PxVec3& ClothImpl<T>::getTranslation() const
{
	return mCloth.mTargetMotion.p;
}

// Current target-frame rotation.
template <typename T>
inline const PxQuat& ClothImpl<T>::getRotation() const
{
	return mCloth.mTargetMotion.q;
}

// Snap the current frame to the target frame and zero frame velocities so
// the next step sees no inertial forces from frame motion.
template <typename T>
inline void ClothImpl<T>::clearInertia()
{
	mCloth.mCurrentMotion = mCloth.mTargetMotion;
	mCloth.mLinearVelocity = PxVec3(0.0f);
	mCloth.mAngularVelocity = PxVec3(0.0f);

	mCloth.wakeUp();
}

// Fixed 4505:local function has been removed
// Shift both current and target frames by 'delta' at once, so particles move
// rigidly with the frame and no inertia is generated. Deliberately does not
// wake the cloth.
template <typename T>
inline void ClothImpl<T>::teleport(const PxVec3& delta)
{
	mCloth.mCurrentMotion.p += delta;
	mCloth.mTargetMotion.p += delta;
}

// Duration of one solver iteration in the previous simulation step.
template <typename T>
inline float ClothImpl<T>::getPreviousIterationDt() const
{
	return mCloth.mPrevIterDt;
}

// Set gravity; no-op (and no wake) when unchanged.
template <typename T>
inline void ClothImpl<T>::setGravity(const PxVec3& gravity)
{
	PxVec3 value = gravity;
	if(value == mCloth.mGravity)
		return;

	mCloth.mGravity = value;
	mCloth.wakeUp();
}

template <typename T>
inline PxVec3 ClothImpl<T>::getGravity() const
{
	return mCloth.mGravity;
}
+
// log2 that maps 0 to the finite sentinel -FLT_MAX_EXP instead of -infinity,
// so the result can be stored and later inverted by safeExp2.
// NOTE(review): assumes x >= 0; a negative x would make log2 produce NaN.
// Callers pass (1 - value) with value in [0, 1], which satisfies this.
inline float safeLog2(float x)
{
	return x ? shdfnd::log2(x) : -FLT_MAX_EXP;
}

// Component-wise safeLog2.
inline PxVec3 safeLog2(const PxVec3& v)
{
	return PxVec3(safeLog2(v.x), safeLog2(v.y), safeLog2(v.z));
}

// Inverse of safeLog2: maps -FLT_MAX_EXP (and anything below) back to
// exactly 0, otherwise plain exp2.
inline float safeExp2(float x)
{
	if(x <= -FLT_MAX_EXP)
		return 0.0f;
	else
		return shdfnd::exp2(x);
}

// Component-wise safeExp2.
inline PxVec3 safeExp2(const PxVec3& v)
{
	return PxVec3(safeExp2(v.x), safeExp2(v.y), safeExp2(v.z));
}
+
// Damping, drag, and similar [0,1) coefficients are stored as
// log2(1 - value) (see safeLog2 above) so the per-iteration decay can be
// reconstructed with a single exp2; the getters invert this mapping.
template <typename T>
inline void ClothImpl<T>::setDamping(const PxVec3& damping)
{
	PxVec3 value = safeLog2(PxVec3(1.f) - damping);
	if(value == mCloth.mLogDamping)
		return;

	mCloth.mLogDamping = value;
	mCloth.wakeUp();
}

template <typename T>
inline PxVec3 ClothImpl<T>::getDamping() const
{
	return PxVec3(1.f) - safeExp2(mCloth.mLogDamping);
}

// Linear drag, stored in log space like damping.
template <typename T>
inline void ClothImpl<T>::setLinearDrag(const PxVec3& drag)
{
	PxVec3 value = safeLog2(PxVec3(1.f) - drag);
	if(value == mCloth.mLinearLogDrag)
		return;

	mCloth.mLinearLogDrag = value;
	mCloth.wakeUp();
}

template <typename T>
inline PxVec3 ClothImpl<T>::getLinearDrag() const
{
	return PxVec3(1.f) - safeExp2(mCloth.mLinearLogDrag);
}

// Angular drag, stored in log space like damping.
template <typename T>
inline void ClothImpl<T>::setAngularDrag(const PxVec3& drag)
{
	PxVec3 value = safeLog2(PxVec3(1.f) - drag);
	if(value == mCloth.mAngularLogDrag)
		return;

	mCloth.mAngularLogDrag = value;
	mCloth.wakeUp();
}

template <typename T>
inline PxVec3 ClothImpl<T>::getAngularDrag() const
{
	return PxVec3(1.f) - safeExp2(mCloth.mAngularLogDrag);
}

// Inertia scales are stored directly (not in log space).
template <typename T>
inline void ClothImpl<T>::setLinearInertia(const PxVec3& inertia)
{
	PxVec3 value = inertia;
	if(value == mCloth.mLinearInertia)
		return;

	mCloth.mLinearInertia = value;
	mCloth.wakeUp();
}

template <typename T>
inline PxVec3 ClothImpl<T>::getLinearInertia() const
{
	return mCloth.mLinearInertia;
}

template <typename T>
inline void ClothImpl<T>::setAngularInertia(const PxVec3& inertia)
{
	PxVec3 value = inertia;
	if(value == mCloth.mAngularInertia)
		return;

	mCloth.mAngularInertia = value;
	mCloth.wakeUp();
}

template <typename T>
inline PxVec3 ClothImpl<T>::getAngularInertia() const
{
	return mCloth.mAngularInertia;
}

template <typename T>
inline void ClothImpl<T>::setCentrifugalInertia(const PxVec3& inertia)
{
	PxVec3 value = inertia;
	if(value == mCloth.mCentrifugalInertia)
		return;

	mCloth.mCentrifugalInertia = value;
	mCloth.wakeUp();
}

template <typename T>
inline PxVec3 ClothImpl<T>::getCentrifugalInertia() const
{
	return mCloth.mCentrifugalInertia;
}

// Changing the solver frequency also resets the iteration-dt moving average,
// since previously accumulated samples were taken at the old frequency.
template <typename T>
inline void ClothImpl<T>::setSolverFrequency(float frequency)
{
	if(frequency == mCloth.mSolverFrequency)
		return;

	mCloth.mSolverFrequency = frequency;
	mCloth.mIterDtAvg.reset();
	mCloth.wakeUp();
}

template <typename T>
inline float ClothImpl<T>::getSolverFrequency() const
{
	return mCloth.mSolverFrequency;
}

template <typename T>
inline void ClothImpl<T>::setStiffnessFrequency(float frequency)
{
	if(frequency == mCloth.mStiffnessFrequency)
		return;

	mCloth.mStiffnessFrequency = frequency;
	mCloth.wakeUp();
}

template <typename T>
inline float ClothImpl<T>::getStiffnessFrequency() const
{
	return mCloth.mStiffnessFrequency;
}

// Resize the iteration-dt moving-average window.
// NOTE(review): "Acceleation" is a typo, but it presumably matches the
// virtual declaration in the public Cloth interface (the getter below is
// spelled correctly) — renaming would break callers, so it is preserved.
template <typename T>
inline void ClothImpl<T>::setAcceleationFilterWidth(uint32_t n)
{
	mCloth.mIterDtAvg.resize(n);
}

template <typename T>
inline uint32_t ClothImpl<T>::getAccelerationFilterWidth() const
{
	return mCloth.mIterDtAvg.size();
}
+
// Move the subarray [first, last) of a random-access sequence so that it
// starts at index 'result'. Overlap-safe: when the destination lies above
// the source, elements are copied back-to-front (like std::copy_backward),
// otherwise front-to-back (like std::copy). Vacated slots keep their old
// values; nothing is constructed or destroyed.
template <typename Iter>
void move(Iter it, uint32_t first, uint32_t last, uint32_t result)
{
	if(result <= first)
	{
		// destination at or below source: forward copy is safe
		for(; first < last; ++first, ++result)
			it[result] = it[first];
	}
	else
	{
		// destination above source: copy backwards to avoid clobbering
		uint32_t dst = result + (last - first);
		while(last > first)
			it[--dst] = it[--last];
	}
}
+
// Shift a capsule sphere index by 'delta' if it lies at or above 'first'
// (indices below 'first' are unaffected by the edit). Returns true when the
// shifted index lands below 'first', i.e. the referenced sphere was removed
// and the capsule entry should be discarded.
inline bool updateIndex(uint32_t& index, uint32_t first, int32_t delta)
{
	if(index < first)
		return false; // index not touched by the edit

	index += uint32_t(delta);
	return int32_t(index) < int32_t(first);
}
+
// Replace collision spheres [first, last) with the given range. Spheres are
// stored twice (start/target) so they can be interpolated across a frame:
// new values go to the target array, and capsule index pairs are fixed up
// (or their capsules dropped) when spheres are inserted or removed.
template <typename T>
inline void ClothImpl<T>::setSpheres(Range<const PxVec4> spheres, uint32_t first, uint32_t last)
{
	uint32_t oldSize = uint32_t(mCloth.mStartCollisionSpheres.size());
	uint32_t newSize = uint32_t(spheres.size()) + oldSize - last + first;

	// limit of 32 spheres — presumably tied to 32-bit mask usage in the
	// collision code; confirm against SwCollision
	PX_ASSERT(newSize <= 32);
	PX_ASSERT(first <= oldSize);
	PX_ASSERT(last <= oldSize);

#if PX_DEBUG
	// sphere radii (w component) must be non-negative
	for(const PxVec4* it = spheres.begin(); it < spheres.end(); ++it)
		PX_ASSERT(it->w >= 0.0f);
#endif

	if(!oldSize && !newSize)
		return;

	if(!oldSize)
	{
		// no previous spheres, so no interpolation: assign start directly
		// (ContextLockType presumably guards device-side reallocation for
		// GPU cloths — confirm)
		ContextLockType contextLock(mCloth.mFactory);
		mCloth.mStartCollisionSpheres.assign(spheres.begin(), spheres.end());
		mCloth.notifyChanged();
	}
	else
	{
		// reallocate under the context lock only when a buffer must grow
		if(PxMax(oldSize, newSize) >
		   PxMin(mCloth.mStartCollisionSpheres.capacity(), mCloth.mTargetCollisionSpheres.capacity()))
		{
			ContextLockType contextLock(mCloth.mFactory);
			mCloth.mStartCollisionSpheres.reserve(newSize);
			mCloth.mTargetCollisionSpheres.reserve(PxMax(oldSize, newSize));
		}

		typename T::MappedVec4fVectorType start = mCloth.mStartCollisionSpheres;
		typename T::MappedVec4fVectorType target = mCloth.mTargetCollisionSpheres;

		// fill target from start
		for(uint32_t i = target.size(); i < oldSize; ++i)
			target.pushBack(start[i]);

		// resize to larger of oldSize and newSize
		start.resize(PxMax(oldSize, newSize), PxVec4(0.0f));
		target.resize(PxMax(oldSize, newSize), PxVec4(0.0f));

		if(int32_t delta = int32_t(newSize - oldSize))
		{
			// move past-range elements to new place
			move(start.begin(), last, oldSize, last + delta);
			move(target.begin(), last, oldSize, last + delta);

			// fill new elements from spheres
			// (unsigned wrap makes this loop a no-op when delta < 0)
			for(uint32_t i = last; i < last + delta; ++i)
				start[i] = spheres[i - first];

			// adjust capsule indices; capsules that referenced a removed
			// sphere are deleted via replaceWithLast
			typename T::MappedIndexVectorType indices = mCloth.mCapsuleIndices;
			Vector<IndexPair>::Type::Iterator cIt, cEnd = indices.end();
			for(cIt = indices.begin(); cIt != cEnd;)
			{
				bool removed = false;
				removed |= updateIndex(cIt->first, last + PxMin(0, delta), int32_t(delta));
				removed |= updateIndex(cIt->second, last + PxMin(0, delta), int32_t(delta));
				if(!removed)
					++cIt;
				else
				{
					indices.replaceWithLast(cIt);
					cEnd = indices.end();
				}
			}

			start.resize(newSize);
			target.resize(newSize);

			mCloth.notifyChanged();
		}

		// fill target elements with spheres
		for(uint32_t i = 0; i < spheres.size(); ++i)
			target[first + i] = spheres[i];
	}

	mCloth.wakeUp();
}

// Current number of collision spheres.
template <typename T>
inline uint32_t ClothImpl<T>::getNumSpheres() const
{
	return uint32_t(mCloth.mStartCollisionSpheres.size());
}
+
// Fixed 4505:local function has been removed
// Replace capsules [first, last) with index pairs read from 'capsules',
// a flat array holding two sphere indices per capsule.
template <typename T>
inline void ClothImpl<T>::setCapsules(Range<const uint32_t> capsules, uint32_t first, uint32_t last)
{
	uint32_t oldSize = mCloth.mCapsuleIndices.size();
	uint32_t newSize = uint32_t(capsules.size() / 2) + oldSize - last + first;

	PX_ASSERT(newSize <= 32);
	PX_ASSERT(first <= oldSize);
	PX_ASSERT(last <= oldSize);

	// view the flat index array as IndexPair records (2 indices per capsule)
	const IndexPair* srcIndices = reinterpret_cast<const IndexPair*>(capsules.begin());

	if(mCloth.mCapsuleIndices.capacity() < newSize)
	{
		ContextLockType contextLock(mCloth.mFactory);
		mCloth.mCapsuleIndices.reserve(newSize);
	}

	// resize to larger of oldSize and newSize
	mCloth.mCapsuleIndices.resize(PxMax(oldSize, newSize));

	typename T::MappedIndexVectorType dstIndices = mCloth.mCapsuleIndices;

	// delta is unsigned: when shrinking it wraps, but move() works modulo
	// 2^32 and the fill loop below then never executes, so the result is
	// still correct
	if(uint32_t delta = newSize - oldSize)
	{
		// move past-range elements to new place
		move(dstIndices.begin(), last, oldSize, last + delta);

		// fill new elements from capsules
		for(uint32_t i = last; i < last + delta; ++i)
			dstIndices[i] = srcIndices[i - first];

		dstIndices.resize(newSize);
		mCloth.notifyChanged();
	}

	// fill existing elements from capsules
	for(uint32_t i = first; i < last; ++i)
		dstIndices[i] = srcIndices[i - first];

	mCloth.wakeUp();
}

// Current number of capsules.
template <typename T>
inline uint32_t ClothImpl<T>::getNumCapsules() const
{
	return uint32_t(mCloth.mCapsuleIndices.size());
}
+
// Replace collision planes [first, last). Like spheres, planes are stored
// as start/target pairs for interpolation. Convex shapes reference planes
// through 32-bit masks, so inserting/removing planes shifts the high bits
// of every convex mask; convexes whose mask becomes empty are dropped.
template <typename T>
inline void ClothImpl<T>::setPlanes(Range<const PxVec4> planes, uint32_t first, uint32_t last)
{
	uint32_t oldSize = uint32_t(mCloth.mStartCollisionPlanes.size());
	uint32_t newSize = uint32_t(planes.size()) + oldSize - last + first;

	PX_ASSERT(newSize <= 32); // planes must fit the 32-bit convex masks
	PX_ASSERT(first <= oldSize);
	PX_ASSERT(last <= oldSize);

	if(!oldSize && !newSize)
		return;

	if(!oldSize)
	{
		// no previous planes: assign start directly, no interpolation
		ContextLockType contextLock(mCloth.mFactory);
		mCloth.mStartCollisionPlanes.assign(planes.begin(), planes.end());
		mCloth.notifyChanged();
	}
	else
	{
		// reallocate under the context lock only when a buffer must grow
		if(PxMax(oldSize, newSize) >
		   PxMin(mCloth.mStartCollisionPlanes.capacity(), mCloth.mTargetCollisionPlanes.capacity()))
		{
			ContextLockType contextLock(mCloth.mFactory);
			mCloth.mStartCollisionPlanes.reserve(newSize);
			mCloth.mTargetCollisionPlanes.reserve(PxMax(oldSize, newSize));
		}

		// fill target from start
		for(uint32_t i = mCloth.mTargetCollisionPlanes.size(); i < oldSize; ++i)
			mCloth.mTargetCollisionPlanes.pushBack(mCloth.mStartCollisionPlanes[i]);

		// resize to larger of oldSize and newSize
		mCloth.mStartCollisionPlanes.resize(PxMax(oldSize, newSize), PxZero);
		mCloth.mTargetCollisionPlanes.resize(PxMax(oldSize, newSize), PxZero);

		if(int32_t delta = int32_t(newSize - oldSize))
		{
			// move past-range elements to new place
			move(mCloth.mStartCollisionPlanes.begin(), last, oldSize, last + delta);
			move(mCloth.mTargetCollisionPlanes.begin(), last, oldSize, last + delta);

			// fill new elements from planes (no-op when delta < 0)
			for(uint32_t i = last; i < last + delta; ++i)
				mCloth.mStartCollisionPlanes[i] = planes[i - first];

			// adjust convex indices: keep bits below the edit point, shift
			// the bits above it by delta; drop convexes left with no planes
			uint32_t mask = (uint32_t(1) << (last + PxMin(delta, 0))) - 1;
			Vector<uint32_t>::Type::Iterator cIt, cEnd = mCloth.mConvexMasks.end();
			for(cIt = mCloth.mConvexMasks.begin(); cIt != cEnd;)
			{
				uint32_t convex = (*cIt & mask);
				if(delta < 0)
					convex |= *cIt >> -delta & ~mask;
				else
					convex |= (*cIt & ~mask) << delta;
				if(convex)
					*cIt++ = convex;
				else
				{
					mCloth.mConvexMasks.replaceWithLast(cIt);
					cEnd = mCloth.mConvexMasks.end();
				}
			}

			mCloth.mStartCollisionPlanes.resize(newSize);
			mCloth.mTargetCollisionPlanes.resize(newSize);

			mCloth.notifyChanged();
		}

		// fill target elements with planes
		for(uint32_t i = 0; i < planes.size(); ++i)
			mCloth.mTargetCollisionPlanes[first + i] = planes[i];
	}

	mCloth.wakeUp();
}

// Current number of collision planes.
template <typename T>
inline uint32_t ClothImpl<T>::getNumPlanes() const
{
	return uint32_t(mCloth.mStartCollisionPlanes.size());
}
+
// Replace convex masks [first, last). Each element is a 32-bit mask whose
// set bits select the planes forming one convex collision shape.
template <typename T>
inline void ClothImpl<T>::setConvexes(Range<const uint32_t> convexes, uint32_t first, uint32_t last)
{
	uint32_t oldSize = mCloth.mConvexMasks.size();
	uint32_t newSize = uint32_t(convexes.size()) + oldSize - last + first;

	PX_ASSERT(newSize <= 32);
	PX_ASSERT(first <= oldSize);
	PX_ASSERT(last <= oldSize);

	if(mCloth.mConvexMasks.capacity() < newSize)
	{
		ContextLockType contextLock(mCloth.mFactory);
		mCloth.mConvexMasks.reserve(newSize);
	}

	// resize to larger of oldSize and newSize
	mCloth.mConvexMasks.resize(PxMax(oldSize, newSize));

	// unsigned delta wraps when shrinking; move() and the skipped fill loop
	// still produce the correct layout (same pattern as setCapsules)
	if(uint32_t delta = newSize - oldSize)
	{
		// move past-range elements to new place
		move(mCloth.mConvexMasks.begin(), last, oldSize, last + delta);

		// fill new elements from capsules
		for(uint32_t i = last; i < last + delta; ++i)
			mCloth.mConvexMasks[i] = convexes[i - first];

		mCloth.mConvexMasks.resize(newSize);
		mCloth.notifyChanged();
	}

	mCloth.wakeUp();
}

// Current number of convex shapes.
template <typename T>
inline uint32_t ClothImpl<T>::getNumConvexes() const
{
	return uint32_t(mCloth.mConvexMasks.size());
}
+
// Replace collision triangles [first, last) (counts given in triangles;
// storage is flat vertices, three PxVec3 per triangle). Start/target pairs
// allow interpolation, mirroring setSpheres/setPlanes.
template <typename T>
inline void ClothImpl<T>::setTriangles(Range<const PxVec3> triangles, uint32_t first, uint32_t last)
{
	// convert from triangle to vertex count
	first *= 3;
	last *= 3;

	// clamp against the implementation's triangle budget
	triangles = mCloth.clampTriangleCount(triangles, last - first);
	PX_ASSERT(0 == triangles.size() % 3);

	uint32_t oldSize = uint32_t(mCloth.mStartCollisionTriangles.size());
	uint32_t newSize = uint32_t(triangles.size()) + oldSize - last + first;

	PX_ASSERT(first <= oldSize);
	PX_ASSERT(last <= oldSize);

	if(!oldSize && !newSize)
		return;

	if(!oldSize)
	{
		// no previous triangles: assign start directly
		ContextLockType contextLock(mCloth.mFactory);
		mCloth.mStartCollisionTriangles.assign(triangles.begin(), triangles.end());
		mCloth.notifyChanged();
	}
	else
	{
		// reallocate under the context lock only when a buffer must grow
		if(PxMax(oldSize, newSize) >
		   PxMin(mCloth.mStartCollisionTriangles.capacity(), mCloth.mTargetCollisionTriangles.capacity()))
		{
			ContextLockType contextLock(mCloth.mFactory);
			mCloth.mStartCollisionTriangles.reserve(newSize);
			mCloth.mTargetCollisionTriangles.reserve(PxMax(oldSize, newSize));
		}

		// fill target from start
		for(uint32_t i = mCloth.mTargetCollisionTriangles.size(); i < oldSize; ++i)
			mCloth.mTargetCollisionTriangles.pushBack(mCloth.mStartCollisionTriangles[i]);

		// resize to larger of oldSize and newSize
		mCloth.mStartCollisionTriangles.resize(PxMax(oldSize, newSize));
		mCloth.mTargetCollisionTriangles.resize(PxMax(oldSize, newSize));

		// unsigned delta: wraps when shrinking, handled as in setCapsules
		if(uint32_t delta = newSize - oldSize)
		{
			// move past-range elements to new place
			move(mCloth.mStartCollisionTriangles.begin(), last, oldSize, last + delta);
			move(mCloth.mTargetCollisionTriangles.begin(), last, oldSize, last + delta);

			// fill new elements from triangles
			for(uint32_t i = last; i < last + delta; ++i)
				mCloth.mStartCollisionTriangles[i] = triangles[i - first];

			mCloth.mStartCollisionTriangles.resize(newSize);
			mCloth.mTargetCollisionTriangles.resize(newSize);

			mCloth.notifyChanged();
		}

		// fill target elements with triangles
		for(uint32_t i = 0; i < triangles.size(); ++i)
			mCloth.mTargetCollisionTriangles[first + i] = triangles[i];
	}

	mCloth.wakeUp();
}

// Replace the tail of the triangle list with explicit start AND target
// vertex data (no interpolation from the previous state).
template <typename T>
inline void ClothImpl<T>::setTriangles(Range<const PxVec3> startTriangles, Range<const PxVec3> targetTriangles,
                                       uint32_t first)
{
	PX_ASSERT(startTriangles.size() == targetTriangles.size());

	// convert from triangle to vertex count
	first *= 3;

	uint32_t last = uint32_t(mCloth.mStartCollisionTriangles.size());

	startTriangles = mCloth.clampTriangleCount(startTriangles, last - first);
	targetTriangles = mCloth.clampTriangleCount(targetTriangles, last - first);

	uint32_t oldSize = uint32_t(mCloth.mStartCollisionTriangles.size());
	uint32_t newSize = uint32_t(startTriangles.size()) + oldSize - last + first;

	PX_ASSERT(first <= oldSize);
	PX_ASSERT(last == oldSize); // this path only supports replacing the tail

	if(!oldSize && !newSize)
		return;

	if(newSize > PxMin(mCloth.mStartCollisionTriangles.capacity(), mCloth.mTargetCollisionTriangles.capacity()))
	{
		ContextLockType contextLock(mCloth.mFactory);
		mCloth.mStartCollisionTriangles.reserve(newSize);
		mCloth.mTargetCollisionTriangles.reserve(newSize);
	}

	// keep vertices before 'first', then append the new start/target data
	uint32_t retainSize = oldSize - last + first;
	mCloth.mStartCollisionTriangles.resize(retainSize);
	mCloth.mTargetCollisionTriangles.resize(retainSize);

	for(uint32_t i = 0, n = startTriangles.size(); i < n; ++i)
	{
		mCloth.mStartCollisionTriangles.pushBack(startTriangles[i]);
		mCloth.mTargetCollisionTriangles.pushBack(targetTriangles[i]);
	}

	if(newSize - oldSize)
		mCloth.notifyChanged();

	mCloth.wakeUp();
}

// Number of collision triangles (storage holds three vertices each).
template <typename T>
inline uint32_t ClothImpl<T>::getNumTriangles() const
{
	return uint32_t(mCloth.mStartCollisionTriangles.size()) / 3;
}
+
template <typename T>
inline bool ClothImpl<T>::isContinuousCollisionEnabled() const
{
	return mCloth.mEnableContinuousCollision;
}

// Toggle continuous (swept) collision; notifies the solver of the change.
template <typename T>
inline void ClothImpl<T>::enableContinuousCollision(bool enable)
{
	if(enable == mCloth.mEnableContinuousCollision)
		return;

	mCloth.mEnableContinuousCollision = enable;
	mCloth.notifyChanged();
	mCloth.wakeUp();
}

template <typename T>
inline float ClothImpl<T>::getCollisionMassScale() const
{
	return mCloth.mCollisionMassScale;
}

// Mass scale applied to colliding particles; no-op when unchanged.
template <typename T>
inline void ClothImpl<T>::setCollisionMassScale(float scale)
{
	if(scale == mCloth.mCollisionMassScale)
		return;

	mCloth.mCollisionMassScale = scale;
	mCloth.notifyChanged();
	mCloth.wakeUp();
}

// Collision friction coefficient. Unlike most setters, writes and wakes
// unconditionally (no early-out, no notifyChanged).
template <typename T>
inline void ClothImpl<T>::setFriction(float friction)
{
	mCloth.mFriction = friction;
	mCloth.wakeUp();
}

template <typename T>
inline float ClothImpl<T>::getFriction() const
{
	return mCloth.mFriction;
}

// Number of virtual particle weight triples.
template <typename T>
inline uint32_t ClothImpl<T>::getNumVirtualParticleWeights() const
{
	return uint32_t(mCloth.mVirtualParticleWeights.size());
}
+
// Scale applied to tether constraint rest lengths.
template <typename T>
inline void ClothImpl<T>::setTetherConstraintScale(float scale)
{
	if(scale == mCloth.mTetherConstraintScale)
		return;

	mCloth.mTetherConstraintScale = scale;
	mCloth.notifyChanged();
	mCloth.wakeUp();
}

template <typename T>
inline float ClothImpl<T>::getTetherConstraintScale() const
{
	return mCloth.mTetherConstraintScale;
}

// Tether stiffness in [0,1], stored as log2(1 - stiffness); see safeLog2.
template <typename T>
inline void ClothImpl<T>::setTetherConstraintStiffness(float stiffness)
{
	float value = safeLog2(1 - stiffness);
	if(value == mCloth.mTetherConstraintLogStiffness)
		return;

	mCloth.mTetherConstraintLogStiffness = value;
	mCloth.notifyChanged();
	mCloth.wakeUp();
}

template <typename T>
inline float ClothImpl<T>::getTetherConstraintStiffness() const
{
	return 1 - safeExp2(mCloth.mTetherConstraintLogStiffness);
}

// Returns a writable range for new motion constraints (double-buffered via
// push); wakes the cloth since the caller is about to write constraints.
template <typename T>
inline Range<PxVec4> ClothImpl<T>::getMotionConstraints()
{
	mCloth.wakeUp();
	return mCloth.push(mCloth.mMotionConstraints);
}

template <typename T>
inline void ClothImpl<T>::clearMotionConstraints()
{
	mCloth.clear(mCloth.mMotionConstraints);
	mCloth.wakeUp();
}

template <typename T>
inline uint32_t ClothImpl<T>::getNumMotionConstraints() const
{
	return uint32_t(mCloth.mMotionConstraints.mStart.size());
}

// Scale/bias applied to motion constraint sphere radii; set atomically so
// only one notify/wake fires for a combined change.
template <typename T>
inline void ClothImpl<T>::setMotionConstraintScaleBias(float scale, float bias)
{
	if(scale == mCloth.mMotionConstraintScale && bias == mCloth.mMotionConstraintBias)
		return;

	mCloth.mMotionConstraintScale = scale;
	mCloth.mMotionConstraintBias = bias;
	mCloth.notifyChanged();
	mCloth.wakeUp();
}

template <typename T>
inline float ClothImpl<T>::getMotionConstraintScale() const
{
	return mCloth.mMotionConstraintScale;
}

template <typename T>
inline float ClothImpl<T>::getMotionConstraintBias() const
{
	return mCloth.mMotionConstraintBias;
}

// Motion constraint stiffness, stored in log space like tether stiffness.
template <typename T>
inline void ClothImpl<T>::setMotionConstraintStiffness(float stiffness)
{
	float value = safeLog2(1 - stiffness);
	if(value == mCloth.mMotionConstraintLogStiffness)
		return;

	mCloth.mMotionConstraintLogStiffness = value;
	mCloth.notifyChanged();
	mCloth.wakeUp();
}

template <typename T>
inline float ClothImpl<T>::getMotionConstraintStiffness() const
{
	return 1 - safeExp2(mCloth.mMotionConstraintLogStiffness);
}
+
// Writable range for new separation constraints (double-buffered via push).
template <typename T>
inline Range<PxVec4> ClothImpl<T>::getSeparationConstraints()
{
	mCloth.wakeUp();
	return mCloth.push(mCloth.mSeparationConstraints);
}

template <typename T>
inline void ClothImpl<T>::clearSeparationConstraints()
{
	mCloth.clear(mCloth.mSeparationConstraints);
	mCloth.wakeUp();
}

// Drop pending interpolation targets: promote target spheres to start and
// pop buffered motion/separation constraints.
// NOTE(review): only sphere targets are cleared here — plane and triangle
// targets are left untouched; confirm whether that is intentional.
template <typename T>
inline void ClothImpl<T>::clearInterpolation()
{
	if(!mCloth.mTargetCollisionSpheres.empty())
	{
		physx::shdfnd::swap(mCloth.mStartCollisionSpheres, mCloth.mTargetCollisionSpheres);
		mCloth.mTargetCollisionSpheres.resize(0);
	}
	mCloth.mMotionConstraints.pop();
	mCloth.mSeparationConstraints.pop();
	mCloth.wakeUp();
}

template <typename T>
inline uint32_t ClothImpl<T>::getNumSeparationConstraints() const
{
	return uint32_t(mCloth.mSeparationConstraints.mStart.size());
}

template <typename T>
inline uint32_t ClothImpl<T>::getNumParticleAccelerations() const
{
	return uint32_t(mCloth.mParticleAccelerations.size());
}

// Wind velocity used by the aerodynamics model; no-op when unchanged.
template <typename T>
inline void ClothImpl<T>::setWindVelocity(PxVec3 wind)
{
	if(wind == mCloth.mWind)
		return;

	mCloth.mWind = wind;
	mCloth.notifyChanged();
	mCloth.wakeUp();
}

template <typename T>
inline PxVec3 ClothImpl<T>::getWindVelocity() const
{
	return mCloth.mWind;
}

// Aerodynamic drag coefficient in [0,1], stored as log2(1 - value).
template <typename T>
inline void ClothImpl<T>::setDragCoefficient(float coefficient)
{
	float value = safeLog2(1 - coefficient);
	if(value == mCloth.mDragLogCoefficient)
		return;

	mCloth.mDragLogCoefficient = value;
	mCloth.notifyChanged();
	mCloth.wakeUp();
}

template <typename T>
inline float ClothImpl<T>::getDragCoefficient() const
{
	return 1 - safeExp2(mCloth.mDragLogCoefficient);
}

// Aerodynamic lift coefficient in [0,1], stored as log2(1 - value).
template <typename T>
inline void ClothImpl<T>::setLiftCoefficient(float coefficient)
{
	float value = safeLog2(1 - coefficient);
	if(value == mCloth.mLiftLogCoefficient)
		return;

	mCloth.mLiftLogCoefficient = value;
	mCloth.notifyChanged();
	mCloth.wakeUp();
}

template <typename T>
inline float ClothImpl<T>::getLiftCoefficient() const
{
	return 1 - safeExp2(mCloth.mLiftLogCoefficient);
}
+
template <typename T>
inline uint32_t ClothImpl<T>::getNumSelfCollisionIndices() const
{
	return uint32_t(mCloth.mSelfCollisionIndices.size());
}

// Fixed 4505:local function has been removed
// Rest positions used for self-collision rejection; either empty or one
// entry per particle (asserted).
template <typename T>
inline void ClothImpl<T>::setRestPositions(Range<const PxVec4> restPositions)
{
	PX_ASSERT(restPositions.empty() || restPositions.size() == getNumParticles());
	ContextLockType contextLock(mCloth.mFactory);
	mCloth.mRestPositions.assign(restPositions.begin(), restPositions.end());
	mCloth.wakeUp();
}

template <typename T>
inline uint32_t ClothImpl<T>::getNumRestPositions() const
{
	return uint32_t(mCloth.mRestPositions.size());
}

// Minimum distance kept between self-colliding particles.
template <typename T>
inline void ClothImpl<T>::setSelfCollisionDistance(float distance)
{
	if(distance == mCloth.mSelfCollisionDistance)
		return;

	mCloth.mSelfCollisionDistance = distance;
	mCloth.notifyChanged();
	mCloth.wakeUp();
}

template <typename T>
inline float ClothImpl<T>::getSelfCollisionDistance() const
{
	return mCloth.mSelfCollisionDistance;
}

// Self-collision stiffness in [0,1], stored as log2(1 - stiffness).
template <typename T>
inline void ClothImpl<T>::setSelfCollisionStiffness(float stiffness)
{
	float value = safeLog2(1 - stiffness);
	if(value == mCloth.mSelfCollisionLogStiffness)
		return;

	mCloth.mSelfCollisionLogStiffness = value;
	mCloth.notifyChanged();
	mCloth.wakeUp();
}

template <typename T>
inline float ClothImpl<T>::getSelfCollisionStiffness() const
{
	return 1 - safeExp2(mCloth.mSelfCollisionLogStiffness);
}
+
// Center of the particle bounding box (local/frame space — confirm against
// the solver's bounds computation).
template <typename T>
inline const PxVec3& ClothImpl<T>::getBoundingBoxCenter() const
{
	return mCloth.mParticleBoundsCenter;
}

// Half-extents of the particle bounding box.
template <typename T>
inline const PxVec3& ClothImpl<T>::getBoundingBoxScale() const
{
	return mCloth.mParticleBoundsHalfExtent;
}

// Particle-motion threshold below which a sleep test passes.
template <typename T>
inline void ClothImpl<T>::setSleepThreshold(float threshold)
{
	if(threshold == mCloth.mSleepThreshold)
		return;

	mCloth.mSleepThreshold = threshold;
	mCloth.notifyChanged();
	mCloth.wakeUp();
}

template <typename T>
inline float ClothImpl<T>::getSleepThreshold() const
{
	return mCloth.mSleepThreshold;
}

// Number of iterations between sleep tests.
template <typename T>
inline void ClothImpl<T>::setSleepTestInterval(uint32_t interval)
{
	if(interval == mCloth.mSleepTestInterval)
		return;

	mCloth.mSleepTestInterval = interval;
	mCloth.notifyChanged();
	mCloth.wakeUp();
}

template <typename T>
inline uint32_t ClothImpl<T>::getSleepTestInterval() const
{
	return mCloth.mSleepTestInterval;
}

// Number of consecutive passed sleep tests required before falling asleep.
template <typename T>
inline void ClothImpl<T>::setSleepAfterCount(uint32_t afterCount)
{
	if(afterCount == mCloth.mSleepAfterCount)
		return;

	mCloth.mSleepAfterCount = afterCount;
	mCloth.notifyChanged();
	mCloth.wakeUp();
}

template <typename T>
inline uint32_t ClothImpl<T>::getSleepAfterCount() const
{
	return mCloth.mSleepAfterCount;
}

template <typename T>
inline uint32_t ClothImpl<T>::getSleepPassCount() const
{
	return mCloth.mSleepPassCounter;
}

template <typename T>
inline bool ClothImpl<T>::isAsleep() const
{
	return mCloth.isSleeping();
}

// Force sleep by saturating the pass counter to the after-count threshold.
template <typename T>
inline void ClothImpl<T>::putToSleep()
{
	mCloth.mSleepPassCounter = mCloth.mSleepAfterCount;
}

template <typename T>
inline void ClothImpl<T>::wakeUp()
{
	mCloth.wakeUp();
}

// Opaque user pointer; not interpreted by the cloth.
template <typename T>
inline void ClothImpl<T>::setUserData(void* data)
{
	mCloth.mUserData = data;
}

template <typename T>
inline void* ClothImpl<T>::getUserData() const
{
	return mCloth.mUserData;
}

// Wrap a raw particle pointer in a MappedRange that locks/unlocks the
// particle buffer through the Cloth interface for the range's lifetime.
template <typename T>
template <typename U>
inline MappedRange<U> ClothImpl<T>::getMappedParticles(U* data) const
{
	return MappedRange<U>(data, data + getNumParticles(), *this, &Cloth::lockParticles, &Cloth::unlockParticles);
}
+
+} // namespace cloth
+
+} // namespace physx
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/Factory.cpp b/PhysX_3.4/Source/LowLevelCloth/src/Factory.cpp
new file mode 100644
index 00000000..834093fa
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/Factory.cpp
@@ -0,0 +1,71 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#include "SwFactory.h"
+#include "PxPhysXConfig.h"
+
+// Factory.cpp gets included in both PhysXGPU and LowLevelCloth projects
+// CuFactory can only be created in PhysXGPU project
+#if defined(PX_PHYSX_GPU_EXPORTS) || PX_XBOXONE
+#define ENABLE_CUFACTORY PX_SUPPORT_GPU_PHYSX
+#else
+#define ENABLE_CUFACTORY 0
+#endif
+
+#if ENABLE_CUFACTORY
+#include "CuFactory.h"
+#endif
+
+namespace physx
+{
+namespace cloth
+{
// Hand out a process-wide unique fabric id, starting at 0 and incrementing
// on every call. No synchronization: concurrent callers could observe
// duplicate ids.
uint32_t getNextFabricId()
{
	static uint32_t sFabricIdCounter = 0;
	uint32_t id = sFabricIdCounter;
	++sFabricIdCounter;
	return id;
}
+}
+}
+
+using namespace physx;
+
// Create a cloth factory for the requested platform.
// CPU (SwFactory) is always available; CUDA (CuFactory) only when this
// translation unit is compiled into the GPU module (ENABLE_CUFACTORY, see
// the preprocessor logic at the top of this file). Returns null for any
// unsupported platform. Caller owns the returned factory.
cloth::Factory* cloth::Factory::createFactory(Platform platform, void* contextManager)
{
	// contextManager is only consumed by the CUDA path
	PX_UNUSED(contextManager);

	if(platform == Factory::CPU)
		return new SwFactory;

#if ENABLE_CUFACTORY
	if(platform == Factory::CUDA)
		return new CuFactory((physx::PxCudaContextManager*)contextManager);
#endif
	return 0;
}
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/IndexPair.h b/PhysX_3.4/Source/LowLevelCloth/src/IndexPair.h
new file mode 100644
index 00000000..78f153b1
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/IndexPair.h
@@ -0,0 +1,46 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#include "Types.h"
+
+namespace physx
+{
+namespace cloth
+{
+
// Pair of collision-sphere indices; used e.g. for the two endpoints of a
// capsule (see ClothImpl::setCapsules / mCapsuleIndices).
struct IndexPair
{
	uint32_t first;  // index of the first sphere
	uint32_t second; // index of the second sphere
};
+
+} // namespace cloth
+} // namespace physx
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/IterationState.h b/PhysX_3.4/Source/LowLevelCloth/src/IterationState.h
new file mode 100644
index 00000000..c9ad9293
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/IterationState.h
@@ -0,0 +1,403 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#include "foundation/PxTransform.h"
+#include "foundation/PxMat44.h"
+#include "Types.h"
+#include "Array.h"
+#include "Simd.h"
+#include "PsMathUtils.h"
+
+namespace physx
+{
+
+/* function object to perform solver iterations on one cloth */
+
+// todo: performance optimization: cache this object and test if velocity/iterDt has changed
+// c'tor takes about 5% of the iteration time of a 20x20 cloth
+
+namespace cloth
+{
+
+/* helper functions */
+
// returns the square of 'value' for any type with operator*
template <typename T>
T sqr(const T& value)
{
	return value * value;
}
+
+inline PxVec3 log(const PxQuat& q)
+{
+ float theta = q.getImaginaryPart().magnitude();
+ float scale = theta > PX_EPS_REAL ? PxAsin(theta) / theta : 1.0f;
+ scale = intrinsics::fsel(q.w, scale, -scale);
+ return PxVec3(q.x * scale, q.y * scale, q.z * scale);
+}
+
+inline PxQuat exp(const PxVec3& v)
+{
+ float theta = v.magnitude();
+ float scale = theta > PX_EPS_REAL ? PxSin(theta) / theta : 1.0f;
+ return PxQuat(v.x * scale, v.y * scale, v.z * scale, PxCos(theta));
+}
+
+template <typename Simd4f, uint32_t N>
+inline void assign(Simd4f (&columns)[N], const PxMat44& matrix)
+{
+ for(uint32_t i = 0; i < N; ++i)
+ columns[i] = load(array(matrix[i]));
+}
+
+template <typename Simd4f>
+inline Simd4f transform(const Simd4f (&columns)[3], const Simd4f& vec)
+{
+ return splat<0>(vec) * columns[0] + splat<1>(vec) * columns[1] + splat<2>(vec) * columns[2];
+}
+
+template <typename Simd4f>
+inline Simd4f transform(const Simd4f (&columns)[3], const Simd4f& translate, const Simd4f& vec)
+{
+ return translate + splat<0>(vec) * columns[0] + splat<1>(vec) * columns[1] + splat<2>(vec) * columns[2];
+}
+
+template <typename>
+struct IterationState; // forward declaration
+
/* Captures per-frame timing and frame motion at the start of a frame
   (constructor), then builds the per-iteration integration constants
   for the solver (create()). */
struct IterationStateFactory
{
	// snapshots cloth motion state and advances its cached velocities/dt
	template <typename MyCloth>
	IterationStateFactory(MyCloth& cloth, float frameDt);

	// builds the IterationState holding all per-iteration constants
	template <typename Simd4f, typename MyCloth>
	IterationState<Simd4f> create(MyCloth const& cloth) const;

	// squared length of the xyz part of v
	template <typename Simd4f>
	static Simd4f lengthSqr(Simd4f const& v)
	{
		return dot3(v, v);
	}

	template <typename Simd4f>
	static PxVec3 castToPxVec3(const Simd4f& v)
	{
		// NOTE(review): type-puns the first three floats of a Simd4f as a
		// PxVec3 via reinterpret_cast — assumes compatible layout/alignment;
		// technically a strict-aliasing violation, confirm the build relies
		// on compiler tolerance here.
		return *reinterpret_cast<const PxVec3*>(reinterpret_cast<const char*>(&v));
	}

	int mNumIterations;      // solver iterations for this frame (>= 1)
	float mInvNumIterations; // 1.0f / mNumIterations
	// per-iteration dt, ratio to previous frame's iteration dt,
	// and smoothed iteration dt (moving average)
	float mIterDt, mIterDtRatio, mIterDtAverage;
	PxQuat mCurrentRotation;     // frame rotation at the start of the frame
	PxVec3 mPrevLinearVelocity;  // frame linear velocity of previous frame
	PxVec3 mPrevAngularVelocity; // frame angular velocity of previous frame
};
+
+/* solver iterations helper functor */
template <typename Simd4f>
struct IterationState
{
	// call after each iteration: rotates the biases/wind into the new local
	// frame, removes the first-iteration damp-scale correction, and
	// decrements the iteration counter
	void update();

	// fraction of the frame completed at the end/start of this iteration
	inline float getCurrentAlpha() const;
	inline float getPreviousAlpha() const;

	public:
	Simd4f mRotationMatrix[3]; // should rename to 'mRotation'

	Simd4f mCurBias; // in local space
	Simd4f mPrevBias; // in local space
	Simd4f mWind; // delta position per iteration

	// integration matrices applied to previous/current particle positions
	Simd4f mPrevMatrix[3];
	Simd4f mCurMatrix[3];
	// difference of damp scale between first and later iterations;
	// consumed (zeroed) by the first update() call
	Simd4f mDampScaleUpdate;

	// iteration counter
	uint32_t mRemainingIterations;

	// reciprocal total number of iterations
	float mInvNumIterations;

	// time step size per iteration
	float mIterDt;

	bool mIsTurning; // if false, mPositionScale = mPrevMatrix[0]
};
+
+} // namespace cloth
+
+template <typename Simd4f>
+inline float cloth::IterationState<Simd4f>::getCurrentAlpha() const
+{
+ return getPreviousAlpha() + mInvNumIterations;
+}
+
+template <typename Simd4f>
+inline float cloth::IterationState<Simd4f>::getPreviousAlpha() const
+{
+ return 1.0f - mRemainingIterations * mInvNumIterations;
+}
+
// Derives per-frame iteration timing from the solver frequency, snapshots
// the frame motion, then advances the cloth's cached motion state
// (velocities, previous dt, dt moving average, current motion).
// Note: the snapshot reads must happen before the cloth updates below.
template <typename MyCloth>
cloth::IterationStateFactory::IterationStateFactory(MyCloth& cloth, float frameDt)
{
	// round to nearest iteration count, but always run at least one
	mNumIterations = PxMax(1, int(frameDt * cloth.mSolverFrequency + 0.5f));
	mInvNumIterations = 1.0f / mNumIterations;
	mIterDt = frameDt * mInvNumIterations;

	// ratio to the previous frame's iteration dt (1 on the first frame)
	mIterDtRatio = cloth.mPrevIterDt ? mIterDt / cloth.mPrevIterDt : 1.0f;
	mIterDtAverage = cloth.mIterDtAvg.empty() ? mIterDt : cloth.mIterDtAvg.average();

	// snapshot motion state before it is advanced below
	mCurrentRotation = cloth.mCurrentMotion.q;
	mPrevLinearVelocity = cloth.mLinearVelocity;
	mPrevAngularVelocity = cloth.mAngularVelocity;

	// update cloth
	float invFrameDt = 1.0f / frameDt;
	cloth.mLinearVelocity = invFrameDt * (cloth.mTargetMotion.p - cloth.mCurrentMotion.p);
	// angular velocity from the relative rotation via the quaternion log map
	PxQuat dq = cloth.mTargetMotion.q * cloth.mCurrentMotion.q.getConjugate();
	cloth.mAngularVelocity = log(dq) * invFrameDt;

	cloth.mPrevIterDt = mIterDt;
	cloth.mIterDtAvg.push(static_cast<uint32_t>(mNumIterations), mIterDt);
	cloth.mCurrentMotion = cloth.mTargetMotion;
}
+
+/*
+momentum conservation:
+m2*x2 - m1*x1 = m1*x1 - m0*x0 + g*dt2, m = r+t
+r2*x2+t2 = 2(r1*x1+t1) - (r0*x0+t0) + g*dt2
+r2*x2 = r1*x1 + r1*x1 - r0*x0 - (t2-2t1+t0) + g*dt2
+substitute r1*x1 - r0*x0 = r1*(x1-x0) + (r1-r0)*x0
+and r1*x1 = r2*x1 - (r2-r1)*x1
+
+x2 = x1 + r2'*g*dt2
+ + r2'r1*(x1-x0) //< damp
+ + (r2'r1-r2'r0)*x0 - (1-r2'r1)*x1 - r2'*(t2-2t1+t0) //< inertia
+ + (1-r2'r1)x1 + t2-t1 //< drag (not momentum conserving)
+
+x2 = x0 + a0*x0 + a1*x1 + b with
+a0 = (inertia-damp)*r2'r1 - inertia*r2'r0 - eye
+a1 = (1-inertia-drag)*eye + (damp+inertia+drag)*r2'r1
+b = r2'*(g*dt2 - (inertia+drag)*(t2-t1) + inertia*(t1-t0))
+
+Velocities are used to deal with multiple iterations and varying dt. Only b needs
+to be updated from one iteration to the next. Specifically, it is multiplied
+by (r2'r1)^1/numIterations. a0 and a1 are unaffected by that multiplication.
+
+The centrifugal and coriolis forces of non-inertial (turning) reference frame are
+not generally captured in these formulas. The 'inertia' term above contains radial
+acceleration plus centrifugal and coriolis force for a single iteration.
+For multiple iterations, or when the centrifugal forces are scaled differently
+than angular inertia, we need to add explicit centrifugal and coriolis forces.
+We only use them to correct the above formula because their discretization is
+not accurate.
+
+Possible improvements: multiply coriolis and centrifugal matrix by curInvRotation
+from the left. Do the alpha trick of linearInertia also for angularInertia, write
+prevParticle after multiplying it with matrix.
+
+If you change anything in this function, make sure that ClothCustomFloating and
+ClothInertia haven't regressed for any choice of solver frequency.
+*/
+
// Builds the per-iteration integration constants (biases, wind, and the
// position-update matrices) from the frame motion captured in the
// constructor. See the derivation in the comment block above.
template <typename Simd4f, typename MyCloth>
cloth::IterationState<Simd4f> cloth::IterationStateFactory::create(MyCloth const& cloth) const
{
	IterationState<Simd4f> result;

	result.mRemainingIterations = static_cast<uint32_t>(mNumIterations);
	result.mInvNumIterations = mInvNumIterations;
	result.mIterDt = mIterDt;

	Simd4f curLinearVelocity = load(array(cloth.mLinearVelocity));
	Simd4f prevLinearVelocity = load(array(mPrevLinearVelocity));

	Simd4f iterDt = simd4f(mIterDt);
	Simd4f dampExponent = simd4f(cloth.mStiffnessFrequency) * iterDt;

	// frame translation per iteration
	Simd4f translation = iterDt * curLinearVelocity;

	// gravity delta per iteration (uses the smoothed dt for stability)
	Simd4f gravity = load(array(cloth.mGravity)) * static_cast<Simd4f>(simd4f(sqr(mIterDtAverage)));

	// scale of local particle velocity per iteration
	Simd4f dampScale = exp2(load(array(cloth.mLogDamping)) * dampExponent);
	// adjust for the change in time step during the first iteration
	Simd4f firstDampScale = dampScale * simd4f(mIterDtRatio);

	// portion of negative frame velocity to transfer to particle
	Simd4f linearDrag = (gSimd4fOne - exp2(load(array(cloth.mLinearLogDrag)) * dampExponent)) * translation;

	// portion of frame acceleration to transfer to particle
	Simd4f linearInertia = load(array(cloth.mLinearInertia)) * iterDt * (prevLinearVelocity - curLinearVelocity);

	// for inertia, we want to violate newton physics to
	// match velocity and position as given by the user, which means:
	// vt = v0 + a*t and xt = x0 + v0*t + (!) a*t^2
	// this is achieved by applying a different portion to cur and prev
	// position, compared to the normal +0.5 and -0.5 for '... 1/2 a*t^2'.
	// specifically, the portion is alpha=(n+1)/2n and 1-alpha.

	float linearAlpha = (mNumIterations + 1) * 0.5f * mInvNumIterations;
	Simd4f curLinearInertia = linearInertia * simd4f(linearAlpha);

	// rotate to local space (use mRotationMatrix temporarily to hold matrix)
	PxMat44 invRotation(mCurrentRotation.getConjugate());
	assign(result.mRotationMatrix, invRotation);

	// mask to zero out the unused w component of SIMD vectors
	Simd4f maskXYZ = simd4f(simd4i(~0, ~0, ~0, 0));

	// Previously, we split the bias between previous and current position to
	// get correct discretized position and velocity. However, this made a
	// hanging cloth experience a downward velocity, which is problematic
	// when scaled by the iterDt ratio and results in jitter under variable
	// timesteps. Instead, we now apply the entire bias to current position
	// and accept a less noticeable error for a free falling cloth.

	Simd4f bias = gravity - linearDrag;
	result.mCurBias = transform(result.mRotationMatrix, curLinearInertia + bias) & maskXYZ;
	result.mPrevBias = transform(result.mRotationMatrix, linearInertia - curLinearInertia) & maskXYZ;

	// wind delta per iteration, relative to the frame translation
	Simd4f wind = load(array(cloth.mWind)) * iterDt;
	result.mWind = transform(result.mRotationMatrix, translation - wind) & maskXYZ;

	result.mIsTurning = mPrevAngularVelocity.magnitudeSquared() + cloth.mAngularVelocity.magnitudeSquared() > 0.0f;

	if(result.mIsTurning)
	{
		Simd4f curAngularVelocity = load(array(invRotation.rotate(cloth.mAngularVelocity)));
		Simd4f prevAngularVelocity = load(array(invRotation.rotate(mPrevAngularVelocity)));

		// rotation for one iteration in local space
		Simd4f curInvAngle = -iterDt * curAngularVelocity;
		Simd4f prevInvAngle = -iterDt * prevAngularVelocity;

		PxQuat curInvRotation = exp(castToPxVec3(curInvAngle));
		PxQuat prevInvRotation = exp(castToPxVec3(prevInvAngle));

		PxMat44 curMatrix(curInvRotation);
		PxMat44 prevMatrix(prevInvRotation * curInvRotation);

		assign(result.mRotationMatrix, curMatrix);

		Simd4f angularDrag = gSimd4fOne - exp2(load(array(cloth.mAngularLogDrag)) * dampExponent);
		Simd4f centrifugalInertia = load(array(cloth.mCentrifugalInertia));
		Simd4f angularInertia = load(array(cloth.mAngularInertia));
		Simd4f angularAcceleration = curAngularVelocity - prevAngularVelocity;

		Simd4f epsilon = simd4f(PxSqrt(FLT_MIN)); // requirement: sqr(epsilon) > 0
		Simd4f velocityLengthSqr = lengthSqr(curAngularVelocity) + epsilon;
		Simd4f dragLengthSqr = lengthSqr(Simd4f(curAngularVelocity * angularDrag)) + epsilon;
		Simd4f centrifugalLengthSqr = lengthSqr(Simd4f(curAngularVelocity * centrifugalInertia)) + epsilon;
		Simd4f accelerationLengthSqr = lengthSqr(angularAcceleration) + epsilon;
		Simd4f inertiaLengthSqr = lengthSqr(Simd4f(angularAcceleration * angularInertia)) + epsilon;

		// scalar drag/inertia factors projected from the per-axis parameters
		float dragScale = array(rsqrt(velocityLengthSqr * dragLengthSqr) * dragLengthSqr)[0];
		float inertiaScale =
		    mInvNumIterations * array(rsqrt(accelerationLengthSqr * inertiaLengthSqr) * inertiaLengthSqr)[0];

		// magic factor found by comparing to global space simulation:
		// some centrifugal force is in inertia part, remainder is 2*(n-1)/n
		// after scaling the inertia part, we get for centrifugal:
		float centrifugalAlpha = (2 * mNumIterations - 1) * mInvNumIterations;
		float centrifugalScale =
		    centrifugalAlpha * array(rsqrt(velocityLengthSqr * centrifugalLengthSqr) * centrifugalLengthSqr)[0] -
		    inertiaScale;

		// slightly better in ClothCustomFloating than curInvAngle alone
		Simd4f centrifugalVelocity = (prevInvAngle + curInvAngle) * simd4f(0.5f);
		const Simd4f data = lengthSqr(centrifugalVelocity);
		float centrifugalSqrLength = array(data)[0] * centrifugalScale;

		Simd4f coriolisVelocity = centrifugalVelocity * simd4f(centrifugalScale);
		PxMat33 coriolisMatrix = shdfnd::star(castToPxVec3(coriolisVelocity));

		const float* dampScalePtr = array(firstDampScale);
		const float* centrifugalPtr = array(centrifugalVelocity);

		// fold damping, drag, inertia, centrifugal, and coriolis terms into
		// the prev/cur position matrices (column j, row i)
		for(unsigned int j = 0; j < 3; ++j)
		{
			float centrifugalJ = -centrifugalPtr[j] * centrifugalScale;
			for(unsigned int i = 0; i < 3; ++i)
			{
				float damping = dampScalePtr[j];
				float coriolis = coriolisMatrix(i, j);
				float centrifugal = centrifugalPtr[i] * centrifugalJ;

				prevMatrix(i, j) = centrifugal - coriolis + curMatrix(i, j) * (inertiaScale - damping) -
				                   prevMatrix(i, j) * inertiaScale;
				curMatrix(i, j) = centrifugal + coriolis + curMatrix(i, j) * (inertiaScale + damping + dragScale);
			}
			curMatrix(j, j) += centrifugalSqrLength - inertiaScale - dragScale;
			prevMatrix(j, j) += centrifugalSqrLength;
		}

		assign(result.mPrevMatrix, prevMatrix);
		assign(result.mCurMatrix, curMatrix);
	}
	else
	{
		// not turning: mark with a negative rotation column and use plain
		// damping for the position scale
		Simd4f minusOne = -static_cast<Simd4f>(gSimd4fOne);
		result.mRotationMatrix[0] = minusOne;
		result.mPrevMatrix[0] = select(maskXYZ, firstDampScale, minusOne);
	}

	// difference of damp scale between first and other iterations
	result.mDampScaleUpdate = (dampScale - firstDampScale) & maskXYZ;

	return result;
}
+
+template <typename Simd4f>
+void cloth::IterationState<Simd4f>::update()
+{
+ if(mIsTurning)
+ {
+ // only need to turn bias, matrix is unaffected (todo: verify)
+ mCurBias = transform(mRotationMatrix, mCurBias);
+ mPrevBias = transform(mRotationMatrix, mPrevBias);
+ mWind = transform(mRotationMatrix, mWind);
+ }
+
+ // remove time step ratio in damp scale after first iteration
+ for(uint32_t i = 0; i < 3; ++i)
+ {
+ mPrevMatrix[i] = mPrevMatrix[i] - mRotationMatrix[i] * mDampScaleUpdate;
+ mCurMatrix[i] = mCurMatrix[i] + mRotationMatrix[i] * mDampScaleUpdate;
+ }
+ mDampScaleUpdate = gSimd4fZero; // only once
+
+ --mRemainingIterations;
+}
+
+} // namespace physx
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/MovingAverage.h b/PhysX_3.4/Source/LowLevelCloth/src/MovingAverage.h
new file mode 100644
index 00000000..45d33322
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/MovingAverage.h
@@ -0,0 +1,145 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#include "Allocator.h"
+
+namespace physx
+{
+namespace cloth
+{
+
/* Run-length encoded weighted moving average over the last mSize samples.
   Used to smooth the per-iteration time step across frames. */
struct MovingAverage
{
	// a run of consecutive samples sharing the same value
	struct Element
	{
		uint32_t mCount; // number of samples in this run
		float mValue;    // the sample value
	};

  public:
	// 'n' is the window size in samples
	MovingAverage(uint32_t n = 1) : mCount(0), mSize(n)
	{
	}

	// true if no samples have been pushed since construction/reset()
	bool empty() const
	{
		return mData.empty();
	}

	// window size in samples (not the number of samples currently stored)
	uint32_t size() const
	{
		return mSize;
	}

	// change the window size, dropping the oldest samples if shrinking
	void resize(uint32_t n)
	{
		PX_ASSERT(n);
		mSize = n;
		trim();
	}

	// discard all samples
	void reset()
	{
		mData.resize(0);
		mCount = 0;
	}

	// append 'n' samples of the same 'value' (n clamped to the window size)
	void push(uint32_t n, float value)
	{
		n = PxMin(n, mSize);

		// extend the last run if the value repeats, else start a new run
		if(mData.empty() || mData.back().mValue != value)
		{
			Element element = { n, value };
			mData.pushBack(element);
		}
		else
		{
			mData.back().mCount += n;
		}

		mCount += n;
		trim();
	}

	// weighted average of the stored samples; precondition: !empty()
	float average() const
	{
		PX_ASSERT(!mData.empty());

		// uniformly weighted sum over all runs
		float sum = 0.0f;
		Vector<Element>::Type::ConstIterator it = mData.begin(), end = mData.end();
		for(; it != end; ++it)
			sum += it->mCount * it->mValue;

		// linear weight ramps at both ends for smoother average
		// (the oldest and newest mCount/8 samples are down-weighted)
		uint32_t n = mCount / 8;
		float ramp = 0.0f, temp = 0.0f;
		uint32_t countLo = (it = mData.begin())->mCount;
		uint32_t countHi = (--end)->mCount;
		for(uint32_t i = 0; i < n; ++i)
		{
			// step to the next run once 'i' passes the accumulated run counts
			if(i == countLo)
				countLo += (++it)->mCount;
			if(i == countHi)
				countHi += (--end)->mCount;

			temp += it->mValue + end->mValue;
			ramp += temp;
		}

		// normalize by the total effective weight
		uint32_t num = (mCount - n) * (n + 1);
		return (sum * (n + 1) - ramp) / num;
	}

  private:
	// remove oldest (front) values until mCount<=mSize
	void trim()
	{
		// walk runs from the front; the run straddling the cut keeps only
		// its trailing portion (mCount is the samples to retain)
		Vector<Element>::Type::Iterator it = mData.begin();
		for(uint32_t k = mSize; k < mCount; it += k <= mCount)
		{
			k += it->mCount;
			it->mCount = k - mCount;
		}

		if(it != mData.begin())
			mData.assign(it, mData.end());

		mCount = PxMin(mCount, mSize);
	}

	Vector<Element>::Type mData; // run-length encoded sample history

	uint32_t mCount; // total number of samples currently stored
	uint32_t mSize;  // window size in samples
};
+}
+}
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/PhaseConfig.cpp b/PhysX_3.4/Source/LowLevelCloth/src/PhaseConfig.cpp
new file mode 100644
index 00000000..354f445e
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/PhaseConfig.cpp
@@ -0,0 +1,75 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#include "PhaseConfig.h"
+#include "PsMathUtils.h"
+
+namespace physx
+{
+namespace cloth
+{
+PhaseConfig transform(const PhaseConfig&);
+}
+}
+
+using namespace physx;
+
+namespace
+{
+float safeLog2(float x)
+{
+ float saturated = PxMax(0.0f, PxMin(x, 1.0f));
+ return saturated ? shdfnd::log2(saturated) : -FLT_MAX_EXP;
+}
+}
+
// Default user-facing configuration for phase 'index': full stiffness,
// neutral multiplier, and limits of 1.0 (which transform() maps to zero
// compression/stretch correction).
cloth::PhaseConfig::PhaseConfig(uint16_t index)
: mPhaseIndex(index)
, mPadding(0xffff) // filler value; not read anywhere visible here
, mStiffness(1.0f)
, mStiffnessMultiplier(1.0f)
, mCompressionLimit(1.0f)
, mStretchLimit(1.0f)
{
}
+
+// convert from user input to solver format
+cloth::PhaseConfig cloth::transform(const PhaseConfig& config)
+{
+ PhaseConfig result(config.mPhaseIndex);
+
+ result.mStiffness = safeLog2(1.0f - config.mStiffness);
+ result.mStiffnessMultiplier = safeLog2(config.mStiffnessMultiplier);
+
+ // negative for compression, positive for stretch
+ result.mCompressionLimit = 1 - 1 / config.mCompressionLimit;
+ result.mStretchLimit = 1 - 1 / config.mStretchLimit;
+
+ return result;
+}
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/PointInterpolator.h b/PhysX_3.4/Source/LowLevelCloth/src/PointInterpolator.h
new file mode 100644
index 00000000..b86c7442
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/PointInterpolator.h
@@ -0,0 +1,168 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#include "Types.h"
+
+namespace physx
+{
+
+namespace cloth
+{
+
+// acts as a poor mans random access iterator
+template <typename Simd4f, typename BaseIterator>
+class LerpIterator
+{
+
+ LerpIterator& operator=(const LerpIterator&); // not implemented
+
+ public:
+ LerpIterator(BaseIterator start, BaseIterator target, float alpha)
+ : mAlpha(simd4f(alpha)), mStart(start), mTarget(target)
+ {
+ }
+
+ // return the interpolated point at a given index
+ inline Simd4f operator[](size_t index) const
+ {
+ return mStart[index] + (mTarget[index] - mStart[index]) * mAlpha;
+ }
+
+ inline Simd4f operator*() const
+ {
+ return (*this)[0];
+ }
+
+ // prefix increment only
+ inline LerpIterator& operator++()
+ {
+ ++mStart;
+ ++mTarget;
+ return *this;
+ }
+
+ private:
+ // interpolation parameter
+ const Simd4f mAlpha;
+
+ BaseIterator mStart;
+ BaseIterator mTarget;
+};
+
+template <typename Simd4f, size_t Stride>
+class UnalignedIterator
+{
+
+ UnalignedIterator& operator=(const UnalignedIterator&); // not implemented
+
+ public:
+ UnalignedIterator(const float* pointer) : mPointer(pointer)
+ {
+ }
+
+ inline Simd4f operator[](size_t index) const
+ {
+ return load(mPointer + index * Stride);
+ }
+
+ inline Simd4f operator*() const
+ {
+ return (*this)[0];
+ }
+
+ // prefix increment only
+ inline UnalignedIterator& operator++()
+ {
+ mPointer += Stride;
+ return *this;
+ }
+
+ private:
+ const float* mPointer;
+};
+
+// acts as an iterator but returns a constant
+template <typename Simd4f>
+class ConstantIterator
+{
+ public:
+ ConstantIterator(const Simd4f& value) : mValue(value)
+ {
+ }
+
+ inline Simd4f operator*() const
+ {
+ return mValue;
+ }
+
+ inline ConstantIterator& operator++()
+ {
+ return *this;
+ }
+
+ private:
+ ConstantIterator& operator=(const ConstantIterator&);
+ const Simd4f mValue;
+};
+
+// wraps an iterator with constant scale and bias
+template <typename Simd4f, typename BaseIterator>
+class ScaleBiasIterator
+{
+ public:
+ ScaleBiasIterator(BaseIterator base, const Simd4f& scale, const Simd4f& bias)
+ : mScale(scale), mBias(bias), mBaseIterator(base)
+ {
+ }
+
+ inline Simd4f operator*() const
+ {
+ return (*mBaseIterator) * mScale + mBias;
+ }
+
+ inline ScaleBiasIterator& operator++()
+ {
+ ++mBaseIterator;
+ return *this;
+ }
+
+ private:
+ ScaleBiasIterator& operator=(const ScaleBiasIterator&);
+
+ const Simd4f mScale;
+ const Simd4f mBias;
+
+ BaseIterator mBaseIterator;
+};
+
+} // namespace cloth
+
+} // namespace physx
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/Simd.h b/PhysX_3.4/Source/LowLevelCloth/src/Simd.h
new file mode 100644
index 00000000..299ea2a9
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/Simd.h
@@ -0,0 +1,43 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+// cloth solver is 50% slower (!) on MSVC 11 and earlier when Simd4f lives in a namespace
+#define NV_SIMD_USE_NAMESPACE 0
+
+#include "NvSimd4f.h"
+#include "NvSimd4i.h"
+
+namespace physx
+{
+#if NV_SIMD_USE_NAMESPACE
+using namespace nvidia::simd;
+#endif
+}
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/StackAllocator.h b/PhysX_3.4/Source/LowLevelCloth/src/StackAllocator.h
new file mode 100644
index 00000000..eb8d8679
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/StackAllocator.h
@@ -0,0 +1,155 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#include "foundation/PxAssert.h"
+
+#if PX_LINUX_FAMILY
+#include <stdint.h> // intptr_t
+#endif
+
+// LIFO (stack) allocator over a fixed, caller-owned buffer. Every allocation
+// is aligned to 'align' bytes and preceded by a Header that links it to the
+// previous allocation; frees are lazy and memory is only reclaimed by
+// unwinding contiguous freed blocks from the top of the stack.
+template <size_t align>
+class StackAllocator
+{
+    typedef unsigned char byte;
+
+    // todo: switch to offsets so size is consistent on x64
+    // mSize is just for book keeping so could be 4 bytes
+    struct Header
+    {
+        Header* mPrev;     // previously allocated block (next one down the stack)
+        size_t mSize : 31; // user-requested byte count (book keeping only; truncates requests >= 2^31)
+        size_t mFree : 1;  // set by deallocate(), consumed when unwinding
+    };
+
+    // non-copyable: the allocator does not own mBuffer
+    StackAllocator(const StackAllocator&);
+    StackAllocator& operator=(const StackAllocator&);
+
+  public:
+    StackAllocator(void* buffer, size_t bufferSize)
+    : mBuffer(reinterpret_cast<byte*>(buffer)), mFreeStart(mBuffer), mTop(0), mBufferSize(bufferSize)
+    {
+    }
+
+    ~StackAllocator()
+    {
+        // all allocations must have been deallocated by now
+        PX_ASSERT(userBytes() == 0);
+    }
+
+    // Returns an 'align'-aligned block of numBytes (NULL for a zero-byte request).
+    // NOTE(review): buffer exhaustion is only caught by PX_ASSERT; in release
+    // builds an oversized request would write past the buffer, so callers are
+    // expected to have sized the buffer up front.
+    void* allocate(size_t numBytes)
+    {
+        // this is non-standard
+        if(!numBytes)
+            return 0;
+
+        // leave room for the header, then round up to the alignment boundary
+        uintptr_t unalignedStart = uintptr_t(mFreeStart) + sizeof(Header);
+
+        byte* allocStart = reinterpret_cast<byte*>((unalignedStart + (align - 1)) & ~(align - 1));
+        byte* allocEnd = allocStart + numBytes;
+
+        // ensure there is space for the alloc
+        PX_ASSERT(allocEnd <= mBuffer + mBufferSize);
+
+        // header sits immediately before the aligned user pointer
+        Header* h = getHeader(allocStart);
+        h->mPrev = mTop;
+        h->mSize = numBytes;
+        h->mFree = false;
+
+        mTop = h;
+        mFreeStart = allocEnd;
+
+        return allocStart;
+    }
+
+    // Marks p free. Memory is only actually reclaimed once every allocation
+    // above it on the stack has been freed as well (LIFO unwinding).
+    void deallocate(void* p)
+    {
+        if(!p)
+            return;
+
+        Header* h = getHeader(p);
+        h->mFree = true;
+
+        // unwind the stack to the next live alloc
+        while(mTop && mTop->mFree)
+        {
+            mFreeStart = reinterpret_cast<byte*>(mTop);
+            mTop = mTop->mPrev;
+        }
+    }
+
+  private:
+    // return the header for an allocation
+    inline Header* getHeader(void* p) const
+    {
+        PX_ASSERT((reinterpret_cast<uintptr_t>(p) & (align - 1)) == 0);
+        PX_ASSERT(reinterpret_cast<byte*>(p) >= mBuffer + sizeof(Header));
+        PX_ASSERT(reinterpret_cast<byte*>(p) < mBuffer + mBufferSize);
+
+        return reinterpret_cast<Header*>(p) - 1;
+    }
+
+  public:
+    // total user-allocated bytes not including any overhead
+    size_t userBytes() const
+    {
+        size_t total = 0;
+        Header* iter = mTop;
+        while(iter)
+        {
+            total += iter->mSize;
+            iter = iter->mPrev;
+        }
+
+        return total;
+    }
+
+    // total user-allocated bytes + overhead (headers and alignment padding)
+    size_t totalUsedBytes() const
+    {
+        return mFreeStart - mBuffer;
+    }
+
+    size_t remainingBytes() const
+    {
+        return mBufferSize - totalUsedBytes();
+    }
+
+    size_t wastedBytes() const
+    {
+        return totalUsedBytes() - userBytes();
+    }
+
+  private:
+    byte* const mBuffer; // caller-owned backing store; never freed here
+    const size_t mBufferSize;
+
+    byte* mFreeStart; // start of free space
+    Header* mTop;     // top allocation header
+};
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/SwCloth.cpp b/PhysX_3.4/Source/LowLevelCloth/src/SwCloth.cpp
new file mode 100644
index 00000000..1f3d4c90
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/SwCloth.cpp
@@ -0,0 +1,305 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#include "SwCloth.h"
+#include "SwFabric.h"
+#include "SwFactory.h"
+#include "TripletScheduler.h"
+#include "ClothBase.h"
+#include "PsUtilities.h"
+
+namespace physx
+{
+namespace cloth
+{
+PhaseConfig transform(const PhaseConfig&); // from PhaseConfig.cpp
+}
+}
+
+using namespace physx;
+using namespace shdfnd;
+
+// Construct a CPU cloth from an initial particle array (xyz = position,
+// w = inverse mass by PhysX cloth convention -- TODO confirm against Cloth docs).
+cloth::SwCloth::SwCloth(SwFactory& factory, SwFabric& fabric, Range<const PxVec4> particles)
+: mFactory(factory), mFabric(fabric), mNumVirtualParticles(0), mUserData(0)
+{
+    PX_ASSERT(!particles.empty());
+
+    // shared cloth state setup (presumably defined in ClothBase.h, included above)
+    initialize(*this, particles.begin(), particles.end());
+
+#if PX_WINDOWS
+    const uint32_t kSimdWidth = 8; // avx
+#else
+    const uint32_t kSimdWidth = 4; // sse
+#endif
+
+    // reserve kSimdWidth-1 extra slots so the SIMD solver can read past the end
+    mCurParticles.reserve(particles.size() + kSimdWidth - 1);
+    mCurParticles.assign(reinterpret_cast<const PxVec4*>(particles.begin()),
+                         reinterpret_cast<const PxVec4*>(particles.end()));
+
+    // 7 dummy particles used in SIMD solver
+    mCurParticles.resize(particles.size() + kSimdWidth - 1, PxVec4(0.0f));
+    mPrevParticles = mCurParticles;
+
+    // shrink back to the logical size; capacity keeps the zeroed dummies valid
+    mCurParticles.resize(particles.size());
+    mPrevParticles.resize(particles.size());
+
+    mFabric.incRefCount();
+}
+
+namespace
+{
+// copy vector and make same capacity
+// (the slack between size() and capacity() holds the zero-initialized SIMD
+// dummy particles, so the tail must be valid data in the copy as well)
+void copyVector(cloth::Vec4fAlignedVector& dst, const cloth::Vec4fAlignedVector& src)
+{
+    dst.reserve(src.capacity());
+    dst.assign(src.begin(), src.end());
+
+    // ensure valid dummy data
+    dst.resize(src.capacity(), PxVec4(0.0f));
+    dst.resize(src.size());
+}
+}
+
+// copy constructor, supports rebinding to a different factory
+// (plain containers are copied in the init list; scalar state is copied by
+// copy() below; capacity-carrying vectors go through copyVector to preserve
+// the SIMD dummy-particle slack)
+cloth::SwCloth::SwCloth(SwFactory& factory, const SwCloth& cloth)
+: mFactory(factory)
+, mFabric(cloth.mFabric)
+, mPhaseConfigs(cloth.mPhaseConfigs)
+, mCapsuleIndices(cloth.mCapsuleIndices)
+, mStartCollisionSpheres(cloth.mStartCollisionSpheres)
+, mTargetCollisionSpheres(cloth.mTargetCollisionSpheres)
+, mStartCollisionPlanes(cloth.mStartCollisionPlanes)
+, mTargetCollisionPlanes(cloth.mTargetCollisionPlanes)
+, mStartCollisionTriangles(cloth.mStartCollisionTriangles)
+, mTargetCollisionTriangles(cloth.mTargetCollisionTriangles)
+, mVirtualParticleIndices(cloth.mVirtualParticleIndices)
+, mVirtualParticleWeights(cloth.mVirtualParticleWeights)
+, mNumVirtualParticles(cloth.mNumVirtualParticles)
+, mSelfCollisionIndices(cloth.mSelfCollisionIndices)
+, mRestPositions(cloth.mRestPositions)
+{
+    // copy remaining scalar/transform state (presumably from ClothBase.h)
+    copy(*this, cloth);
+
+    // carry over capacity (using as dummy particles)
+    copyVector(mCurParticles, cloth.mCurParticles);
+    copyVector(mPrevParticles, cloth.mPrevParticles);
+    copyVector(mMotionConstraints.mStart, cloth.mMotionConstraints.mStart);
+    copyVector(mMotionConstraints.mTarget, cloth.mMotionConstraints.mTarget);
+    copyVector(mSeparationConstraints.mStart, cloth.mSeparationConstraints.mStart);
+    copyVector(mSeparationConstraints.mTarget, cloth.mSeparationConstraints.mTarget);
+    copyVector(mParticleAccelerations, cloth.mParticleAccelerations);
+
+    mFabric.incRefCount();
+}
+
+cloth::SwCloth::~SwCloth()
+{
+    // drop the shared fabric reference taken in the constructors
+    mFabric.decRefCount();
+}
+
+// Returns a writable range over the 'target' constraint buffer, allocating it
+// lazily on first use. On the very first push, 'start' is initialized by
+// swapping the buffers, in which case the returned range aliases mStart's
+// storage (the caller fills it either way).
+cloth::Range<PxVec4> cloth::SwCloth::push(SwConstraints& constraints)
+{
+    uint32_t n = mCurParticles.size();
+
+    if(!constraints.mTarget.capacity())
+        constraints.mTarget.resize((n + 3) & ~3, PxVec4(0.0f)); // reserve multiple of 4 for SIMD
+
+    constraints.mTarget.resizeUninitialized(n);
+    PxVec4* data = &constraints.mTarget.front();
+    Range<PxVec4> result(data, data + constraints.mTarget.size());
+
+    if(constraints.mStart.empty()) // initialize start first
+        constraints.mStart.swap(constraints.mTarget);
+
+    return result;
+}
+
+// Release both constraint buffers; swapping with temporaries actually frees
+// the memory (a plain clear/resize(0) would retain capacity).
+void cloth::SwCloth::clear(SwConstraints& constraints)
+{
+    Vec4fAlignedVector().swap(constraints.mStart);
+    Vec4fAlignedVector().swap(constraints.mTarget);
+}
+
+// The software solver imposes no limit on collision triangle count, so the
+// range is passed through unchanged (the count parameter is ignored).
+cloth::Range<const PxVec3> cloth::SwCloth::clampTriangleCount(Range<const PxVec3> range, uint32_t)
+{
+    return range;
+}
+
+#include "ClothImpl.h"
+
+namespace physx
+{
+namespace cloth
+{
+
+// --- ClothImpl<SwCloth> trivial specializations -----------------------------
+
+// clone through the (possibly different) target factory
+template <>
+Cloth* ClothImpl<SwCloth>::clone(Factory& factory) const
+{
+    return factory.clone(*this);
+}
+
+template <>
+uint32_t ClothImpl<SwCloth>::getNumParticles() const
+{
+    return mCloth.mCurParticles.size();
+}
+
+// CPU particles live in host memory; no mapping/locking is required
+template <>
+void ClothImpl<SwCloth>::lockParticles() const
+{
+}
+
+template <>
+void ClothImpl<SwCloth>::unlockParticles() const
+{
+}
+
+template <>
+MappedRange<PxVec4> ClothImpl<SwCloth>::getCurrentParticles()
+{
+    return getMappedParticles(&mCloth.mCurParticles.front());
+}
+
+template <>
+MappedRange<const PxVec4> ClothImpl<SwCloth>::getCurrentParticles() const
+{
+    return getMappedParticles(&mCloth.mCurParticles.front());
+}
+
+template <>
+MappedRange<PxVec4> ClothImpl<SwCloth>::getPreviousParticles()
+{
+    return getMappedParticles(&mCloth.mPrevParticles.front());
+}
+
+template <>
+MappedRange<const PxVec4> ClothImpl<SwCloth>::getPreviousParticles() const
+{
+    return getMappedParticles(&mCloth.mPrevParticles.front());
+}
+
+// software cloth has no GPU-resident particles: all-null result
+template <>
+GpuParticles ClothImpl<SwCloth>::getGpuParticles()
+{
+    GpuParticles result = { 0, 0, 0 };
+    return result;
+}
+
+template <>
+void ClothImpl<SwCloth>::setPhaseConfig(Range<const PhaseConfig> configs)
+{
+    mCloth.mPhaseConfigs.resize(0);
+
+    // transform phase config to use in solver; configs with zero stiffness
+    // would have no effect and are dropped entirely
+    for(; !configs.empty(); configs.popFront())
+        if(configs.front().mStiffness > 0.0f)
+            mCloth.mPhaseConfigs.pushBack(transform(configs.front()));
+
+    mCloth.wakeUp();
+}
+
+// Restrict self collision to a subset of particles (empty range means all
+// particles -- see SwClothData constructor fallback).
+template <>
+void ClothImpl<SwCloth>::setSelfCollisionIndices(Range<const uint32_t> indices)
+{
+    ContextLockType lock(mCloth.mFactory); // no-op for the CPU factory
+    mCloth.mSelfCollisionIndices.assign(indices.begin(), indices.end());
+    mCloth.notifyChanged();
+    mCloth.wakeUp();
+}
+
+// Number of real (non-padding) virtual particle triplets.
+template <>
+uint32_t ClothImpl<SwCloth>::getNumVirtualParticles() const
+{
+    return uint32_t(mCloth.mNumVirtualParticles);
+}
+
+// Returns a writable per-particle acceleration buffer, lazily allocated and
+// zero-initialized on first use.
+template <>
+Range<PxVec4> ClothImpl<SwCloth>::getParticleAccelerations()
+{
+    if(mCloth.mParticleAccelerations.empty())
+    {
+        uint32_t n = mCloth.mCurParticles.size();
+        mCloth.mParticleAccelerations.resize(n, PxVec4(0.0f));
+    }
+
+    mCloth.wakeUp();
+
+    PxVec4* data = &mCloth.mParticleAccelerations.front();
+    return Range<PxVec4>(data, data + mCloth.mParticleAccelerations.size());
+}
+
+template <>
+void ClothImpl<SwCloth>::clearParticleAccelerations()
+{
+    // swap with a temporary to release the memory, not just the size
+    Vec4fAlignedVector().swap(mCloth.mParticleAccelerations);
+    mCloth.wakeUp();
+}
+
+// Install virtual particles: each entry of 'indices' names three anchor
+// particles plus a weight-table index, each entry of 'weights' the barycentric
+// weights (w + precomputed 1/|w|^2 in the fourth component).
+template <>
+void ClothImpl<SwCloth>::setVirtualParticles(Range<const uint32_t[4]> indices, Range<const PxVec3> weights)
+{
+    mCloth.mNumVirtualParticles = 0;
+
+    // shuffle indices to form independent SIMD sets
+    uint16_t numParticles = uint16_t(mCloth.mCurParticles.size());
+    TripletScheduler scheduler(indices);
+    scheduler.simd(numParticles, 4);
+
+    // convert indices to byte offset
+    // dummy triplet indexes particles numParticles..numParticles+2 --
+    // presumably the zeroed SIMD padding slots reserved in the constructor,
+    // so padded entries are harmless; TODO confirm against solver
+    Vec4us dummy(numParticles, uint16_t(numParticles + 1), uint16_t(numParticles + 2), 0);
+    Vector<uint32_t>::Type::ConstIterator sIt = scheduler.mSetSizes.begin();
+    Vector<uint32_t>::Type::ConstIterator sEnd = scheduler.mSetSizes.end();
+    TripletScheduler::ConstTripletIter tIt = scheduler.mTriplets.begin(), tLast;
+    mCloth.mVirtualParticleIndices.resize(0);
+    mCloth.mVirtualParticleIndices.reserve(indices.size() + 3 * uint32_t(sEnd - sIt));
+    for(; sIt != sEnd; ++sIt)
+    {
+        uint32_t setSize = *sIt;
+        for(tLast = tIt + setSize; tIt != tLast; ++tIt, ++mCloth.mNumVirtualParticles)
+            mCloth.mVirtualParticleIndices.pushBack(Vec4us(*tIt));
+        // pad each set to a multiple of 4 triplets for the SIMD solver
+        mCloth.mVirtualParticleIndices.resize((mCloth.mVirtualParticleIndices.size() + 3) & ~3, dummy);
+    }
+    // shrink-to-fit via copy-and-swap
+    Vector<Vec4us>::Type(mCloth.mVirtualParticleIndices.begin(), mCloth.mVirtualParticleIndices.end())
+    .swap(mCloth.mVirtualParticleIndices);
+
+    // precompute 1/dot(w,w)
+    Vec4fAlignedVector().swap(mCloth.mVirtualParticleWeights);
+    mCloth.mVirtualParticleWeights.reserve(weights.size());
+    for(; !weights.empty(); weights.popFront())
+    {
+        PxVec3 w = reinterpret_cast<const PxVec3&>(weights.front());
+        PxReal scale = 1 / w.magnitudeSquared();
+        mCloth.mVirtualParticleWeights.pushBack(PxVec4(w.x, w.y, w.z, scale));
+    }
+
+    mCloth.notifyChanged();
+}
+
+} // namespace cloth
+} // namespace physx
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/SwCloth.h b/PhysX_3.4/Source/LowLevelCloth/src/SwCloth.h
new file mode 100644
index 00000000..05db19d2
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/SwCloth.h
@@ -0,0 +1,210 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#include "foundation/PxTransform.h"
+#include "Cloth.h"
+#include "Range.h"
+#include "MovingAverage.h"
+#include "PhaseConfig.h"
+#include "IndexPair.h"
+#include "Vec4T.h"
+#include "Array.h"
+
+namespace physx
+{
+
+namespace cloth
+{
+
+class SwFabric;
+class SwFactory;
+
+typedef AlignedVector<PxVec4, 16>::Type Vec4fAlignedVector;
+
+// Pair of constraint buffers: mStart holds the previous frame's values,
+// mTarget the newly pushed ones (presumably interpolated between during the
+// solve -- see the solver; TODO confirm).
+struct SwConstraints
+{
+    // promote target to start for the next frame; no-op when nothing new
+    // was pushed this frame
+    void pop()
+    {
+        if(!mTarget.empty())
+        {
+            mStart.swap(mTarget);
+            mTarget.resize(0);
+        }
+    }
+
+    Vec4fAlignedVector mStart;
+    Vec4fAlignedVector mTarget;
+};
+
+// Software (CPU) cloth instance: bulk simulation state for one cloth.
+// Members are deliberately public; SwClothData takes raw pointer views of
+// them for the solver (see SwClothData.cpp).
+class SwCloth
+{
+    SwCloth& operator=(const SwCloth&); // not implemented
+
+    // The CPU path needs no context locking; this empty RAII stand-in
+    // satisfies the ContextLockType requirement of ClothImpl.
+    struct SwContextLock
+    {
+        SwContextLock(const SwFactory&)
+        {
+        }
+    };
+
+  public:
+    typedef SwFactory FactoryType;
+    typedef SwFabric FabricType;
+    typedef SwContextLock ContextLockType;
+
+    typedef Vec4fAlignedVector& MappedVec4fVectorType;
+    typedef Vector<IndexPair>::Type& MappedIndexVectorType;
+
+    SwCloth(SwFactory&, SwFabric&, Range<const PxVec4>);
+    SwCloth(SwFactory&, const SwCloth&); // copy, supports rebinding factory
+    ~SwCloth(); // not virtual on purpose
+
+  public:
+    // asleep once enough consecutive sleep tests have passed
+    bool isSleeping() const
+    {
+        return mSleepPassCounter >= mSleepAfterCount;
+    }
+    void wakeUp()
+    {
+        mSleepPassCounter = 0;
+    }
+
+    // CPU cloth has no device-side copies to invalidate
+    void notifyChanged()
+    {
+    }
+
+    void setParticleBounds(const float*); // bounds = lower[3], upper[3]
+
+    Range<PxVec4> push(SwConstraints&);
+    static void clear(SwConstraints&);
+
+    static Range<const PxVec3> clampTriangleCount(Range<const PxVec3>, uint32_t);
+
+  public:
+    SwFactory& mFactory;
+    SwFabric& mFabric;
+
+    // current and previous-iteration particle positions
+    Vec4fAlignedVector mCurParticles;
+    Vec4fAlignedVector mPrevParticles;
+
+    // particle bounding box in center/half-extent form (see setParticleBounds)
+    PxVec3 mParticleBoundsCenter;
+    PxVec3 mParticleBoundsHalfExtent;
+
+    PxVec3 mGravity;
+    // 'Log' members store log-space coefficients, converted per iteration via
+    // exp (see SwClothData's use of Ps::exp) -- TODO confirm exact semantics
+    PxVec3 mLogDamping;
+    PxVec3 mLinearLogDrag;
+    PxVec3 mAngularLogDrag;
+    PxVec3 mLinearInertia;
+    PxVec3 mAngularInertia;
+    PxVec3 mCentrifugalInertia;
+    float mSolverFrequency;
+    float mStiffnessFrequency;
+
+    // rigid-frame motion of the cloth's local space
+    PxTransform mTargetMotion;
+    PxTransform mCurrentMotion;
+    PxVec3 mLinearVelocity;
+    PxVec3 mAngularVelocity;
+
+    float mPrevIterDt;
+    MovingAverage mIterDtAvg;
+
+    Vector<PhaseConfig>::Type mPhaseConfigs; // transformed!
+
+    // tether constraints stuff
+    float mTetherConstraintLogStiffness;
+    float mTetherConstraintScale;
+
+    // motion constraints stuff
+    SwConstraints mMotionConstraints;
+    float mMotionConstraintScale;
+    float mMotionConstraintBias;
+    float mMotionConstraintLogStiffness;
+
+    // separation constraints stuff
+    SwConstraints mSeparationConstraints;
+
+    // particle acceleration stuff
+    Vec4fAlignedVector mParticleAccelerations;
+
+    // wind
+    PxVec3 mWind;
+    float mDragLogCoefficient;
+    float mLiftLogCoefficient;
+
+    // collision stuff
+    Vector<IndexPair>::Type mCapsuleIndices;
+    Vec4fAlignedVector mStartCollisionSpheres;
+    Vec4fAlignedVector mTargetCollisionSpheres;
+    Vector<uint32_t>::Type mConvexMasks;
+    Vec4fAlignedVector mStartCollisionPlanes;
+    Vec4fAlignedVector mTargetCollisionPlanes;
+    Vector<PxVec3>::Type mStartCollisionTriangles;
+    Vector<PxVec3>::Type mTargetCollisionTriangles;
+    bool mEnableContinuousCollision;
+    float mCollisionMassScale;
+    float mFriction;
+
+    // virtual particles
+    Vector<Vec4us>::Type mVirtualParticleIndices;
+    Vec4fAlignedVector mVirtualParticleWeights;
+    uint32_t mNumVirtualParticles; // real triplets, excluding SIMD padding
+
+    // self collision
+    float mSelfCollisionDistance;
+    float mSelfCollisionLogStiffness;
+
+    Vector<uint32_t>::Type mSelfCollisionIndices; // empty = all particles
+
+    Vec4fAlignedVector mRestPositions;
+
+    // sleeping
+    uint32_t mSleepTestInterval; // how often to test for movement
+    uint32_t mSleepAfterCount;   // number of tests to pass before sleep
+    float mSleepThreshold;       // max movement delta to pass test
+    uint32_t mSleepPassCounter;  // how many tests passed
+    uint32_t mSleepTestCounter;  // how many iterations since tested
+
+    void* mUserData;
+
+} PX_ALIGN_SUFFIX(16);
+
+} // namespace cloth
+
+// bounds = lower[3], upper[3]
+// converts the min/max box into the center/half-extent form stored on the cloth
+inline void cloth::SwCloth::setParticleBounds(const float* bounds)
+{
+    for(uint32_t i = 0; i < 3; ++i)
+    {
+        mParticleBoundsCenter[i] = (bounds[3 + i] + bounds[i]) * 0.5f;
+        mParticleBoundsHalfExtent[i] = (bounds[3 + i] - bounds[i]) * 0.5f;
+    }
+}
+}
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/SwClothData.cpp b/PhysX_3.4/Source/LowLevelCloth/src/SwClothData.cpp
new file mode 100644
index 00000000..ce44f8d0
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/SwClothData.cpp
@@ -0,0 +1,154 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#include "SwClothData.h"
+#include "SwCloth.h"
+#include "SwFabric.h"
+#include "PsUtilities.h"
+#include "PsMathUtils.h"
+#include "CmPhysXCommon.h"
+
+using namespace physx;
+
+cloth::SwClothData::SwClothData(SwCloth& cloth, const SwFabric& fabric)
+{
+ mNumParticles = uint32_t(cloth.mCurParticles.size());
+ mCurParticles = array(cloth.mCurParticles.front());
+ mPrevParticles = array(cloth.mPrevParticles.front());
+
+ const float* center = array(cloth.mParticleBoundsCenter);
+ const float* extent = array(cloth.mParticleBoundsHalfExtent);
+ for(uint32_t i = 0; i < 3; ++i)
+ {
+ mCurBounds[i] = center[i] - extent[i];
+ mCurBounds[i + 3] = center[i] + extent[i];
+ }
+
+ // avoid reading uninitialized data into mCurBounds, even though it's never used.
+ mPrevBounds[0] = 0.0f;
+
+ mConfigBegin = cloth.mPhaseConfigs.empty() ? 0 : &cloth.mPhaseConfigs.front();
+ mConfigEnd = mConfigBegin + cloth.mPhaseConfigs.size();
+
+ mPhases = &fabric.mPhases.front();
+ mNumPhases = uint32_t(fabric.mPhases.size());
+
+ mSets = &fabric.mSets.front();
+ mNumSets = uint32_t(fabric.mSets.size());
+
+ mRestvalues = &fabric.mRestvalues.front();
+ mNumRestvalues = uint32_t(fabric.mRestvalues.size());
+
+ mIndices = &fabric.mIndices.front();
+ mNumIndices = uint32_t(fabric.mIndices.size());
+
+ float stiffnessExponent = cloth.mStiffnessFrequency * cloth.mPrevIterDt * 0.69314718055994531f; // logf(2.0f);
+
+ mTethers = fabric.mTethers.begin();
+ mNumTethers = uint32_t(fabric.mTethers.size());
+ mTetherConstraintStiffness = 1.0f - Ps::exp(stiffnessExponent * cloth.mTetherConstraintLogStiffness);
+ mTetherConstraintScale = cloth.mTetherConstraintScale * fabric.mTetherLengthScale;
+
+ mTriangles = fabric.mTriangles.begin();
+ mNumTriangles = uint32_t(fabric.mTriangles.size()) / 3;
+ mDragCoefficient = 1.0f - Ps::exp(stiffnessExponent * cloth.mDragLogCoefficient);
+ mLiftCoefficient = 1.0f - Ps::exp(stiffnessExponent * cloth.mLiftLogCoefficient);
+
+ mStartMotionConstraints = cloth.mMotionConstraints.mStart.size() ? array(cloth.mMotionConstraints.mStart.front()) : 0;
+ mTargetMotionConstraints =
+ !cloth.mMotionConstraints.mTarget.empty() ? array(cloth.mMotionConstraints.mTarget.front()) : 0;
+ mMotionConstraintStiffness = 1.0f - Ps::exp(stiffnessExponent * cloth.mMotionConstraintLogStiffness);
+
+ mStartSeparationConstraints =
+ cloth.mSeparationConstraints.mStart.size() ? array(cloth.mSeparationConstraints.mStart.front()) : 0;
+ mTargetSeparationConstraints =
+ !cloth.mSeparationConstraints.mTarget.empty() ? array(cloth.mSeparationConstraints.mTarget.front()) : 0;
+
+ mParticleAccelerations = cloth.mParticleAccelerations.size() ? array(cloth.mParticleAccelerations.front()) : 0;
+
+ mStartCollisionSpheres = cloth.mStartCollisionSpheres.empty() ? 0 : array(cloth.mStartCollisionSpheres.front());
+ mTargetCollisionSpheres =
+ cloth.mTargetCollisionSpheres.empty() ? mStartCollisionSpheres : array(cloth.mTargetCollisionSpheres.front());
+ mNumSpheres = uint32_t(cloth.mStartCollisionSpheres.size());
+
+ mCapsuleIndices = cloth.mCapsuleIndices.empty() ? 0 : &cloth.mCapsuleIndices.front();
+ mNumCapsules = uint32_t(cloth.mCapsuleIndices.size());
+
+ mStartCollisionPlanes = cloth.mStartCollisionPlanes.empty() ? 0 : array(cloth.mStartCollisionPlanes.front());
+ mTargetCollisionPlanes =
+ cloth.mTargetCollisionPlanes.empty() ? mStartCollisionPlanes : array(cloth.mTargetCollisionPlanes.front());
+ mNumPlanes = uint32_t(cloth.mStartCollisionPlanes.size());
+
+ mConvexMasks = cloth.mConvexMasks.empty() ? 0 : &cloth.mConvexMasks.front();
+ mNumConvexes = uint32_t(cloth.mConvexMasks.size());
+
+ mStartCollisionTriangles = cloth.mStartCollisionTriangles.empty() ? 0 : array(cloth.mStartCollisionTriangles.front());
+ mTargetCollisionTriangles = cloth.mTargetCollisionTriangles.empty() ? mStartCollisionTriangles
+ : array(cloth.mTargetCollisionTriangles.front());
+ mNumCollisionTriangles = uint32_t(cloth.mStartCollisionTriangles.size()) / 3;
+
+ mVirtualParticlesBegin = cloth.mVirtualParticleIndices.empty() ? 0 : array(cloth.mVirtualParticleIndices.front());
+ mVirtualParticlesEnd = mVirtualParticlesBegin + 4 * cloth.mVirtualParticleIndices.size();
+ mVirtualParticleWeights = cloth.mVirtualParticleWeights.empty() ? 0 : array(cloth.mVirtualParticleWeights.front());
+ mNumVirtualParticleWeights = uint32_t(cloth.mVirtualParticleWeights.size());
+
+ mEnableContinuousCollision = cloth.mEnableContinuousCollision;
+ mCollisionMassScale = cloth.mCollisionMassScale;
+ mFrictionScale = cloth.mFriction;
+
+ mSelfCollisionDistance = cloth.mSelfCollisionDistance;
+ mSelfCollisionStiffness = 1.0f - Ps::exp(stiffnessExponent * cloth.mSelfCollisionLogStiffness);
+
+ mSelfCollisionIndices = cloth.mSelfCollisionIndices.empty() ? 0 : cloth.mSelfCollisionIndices.begin();
+ mNumSelfCollisionIndices = mSelfCollisionIndices ? cloth.mSelfCollisionIndices.size() : mNumParticles;
+
+ mRestPositions = cloth.mRestPositions.size() ? array(cloth.mRestPositions.front()) : 0;
+
+ mSleepPassCounter = cloth.mSleepPassCounter;
+ mSleepTestCounter = cloth.mSleepTestCounter;
+}
+
+// Write back the few values the solver mutates during simulation
+// (bounds and sleep counters); everything else was aliased, not copied.
+void cloth::SwClothData::reconcile(SwCloth& cloth) const
+{
+    cloth.setParticleBounds(mCurBounds);
+    cloth.mSleepTestCounter = mSleepTestCounter;
+    cloth.mSleepPassCounter = mSleepPassCounter;
+}
+
+// Sanity-check index data after construction (debug builds only: PX_ASSERT).
+void cloth::SwClothData::verify() const
+{
+    // checks needs to be run after the constructor because
+    // data isn't immediately available on SPU at that stage
+    // perhaps a good reason to construct SwClothData on PPU instead
+
+    // every capsule index must refer to an existing sphere.
+    // NOTE(review): '&(mCapsuleIndices + mNumCapsules)->first' forms the end
+    // iterator through a one-past-the-end pointer; it works because 'first'
+    // is presumably the first member of IndexPair, but is strictly suspect.
+    PX_ASSERT(!mNumCapsules ||
+              mNumSpheres > *shdfnd::maxElement(&mCapsuleIndices->first, &(mCapsuleIndices + mNumCapsules)->first));
+
+    // convex masks may only reference existing planes (bit i <-> plane i)
+    PX_ASSERT(!mNumConvexes || (1u << mNumPlanes) - 1 >= *shdfnd::maxElement(mConvexMasks, mConvexMasks + mNumConvexes));
+}
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/SwClothData.h b/PhysX_3.4/Source/LowLevelCloth/src/SwClothData.h
new file mode 100644
index 00000000..e3f503ca
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/SwClothData.h
@@ -0,0 +1,151 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#include "foundation/Px.h"
+#include "Types.h"
+
+namespace physx
+{
+namespace simd
+{
+}
+}
+
+namespace physx
+{
+namespace cloth
+{
+
+class SwCloth;
+class SwFabric;
+struct PhaseConfig;
+struct IndexPair;
+struct SwTether;
+
+// reference to cloth instance bulk data (POD)
+struct SwClothData
+{
+ SwClothData(SwCloth&, const SwFabric&);
+ void reconcile(SwCloth&) const;
+ void verify() const;
+
+ // particle data
+ uint32_t mNumParticles;
+ float* mCurParticles;
+ float* mPrevParticles;
+
+ float mCurBounds[6]; // lower[3], upper[3]
+ float mPrevBounds[6];
+ float mPadding; // write as simd
+
+ // distance constraints
+ const PhaseConfig* mConfigBegin;
+ const PhaseConfig* mConfigEnd;
+
+ const uint32_t* mPhases;
+ uint32_t mNumPhases;
+
+ const uint32_t* mSets;
+ uint32_t mNumSets;
+
+ const float* mRestvalues;
+ uint32_t mNumRestvalues;
+
+ const uint16_t* mIndices;
+ uint32_t mNumIndices;
+
+ const SwTether* mTethers;
+ uint32_t mNumTethers;
+ float mTetherConstraintStiffness;
+ float mTetherConstraintScale;
+
+ // wind data
+ const uint16_t* mTriangles;
+ uint32_t mNumTriangles;
+ float mDragCoefficient;
+ float mLiftCoefficient;
+
+ // motion constraint data
+ const float* mStartMotionConstraints;
+ const float* mTargetMotionConstraints;
+ float mMotionConstraintStiffness;
+
+ // separation constraint data
+ const float* mStartSeparationConstraints;
+ const float* mTargetSeparationConstraints;
+
+ // particle acceleration data
+ const float* mParticleAccelerations;
+
+ // collision stuff
+ const float* mStartCollisionSpheres;
+ const float* mTargetCollisionSpheres;
+ uint32_t mNumSpheres;
+
+ const IndexPair* mCapsuleIndices;
+ uint32_t mNumCapsules;
+
+ const float* mStartCollisionPlanes;
+ const float* mTargetCollisionPlanes;
+ uint32_t mNumPlanes;
+
+ const uint32_t* mConvexMasks;
+ uint32_t mNumConvexes;
+
+ const float* mStartCollisionTriangles;
+ const float* mTargetCollisionTriangles;
+ uint32_t mNumCollisionTriangles;
+
+ const uint16_t* mVirtualParticlesBegin;
+ const uint16_t* mVirtualParticlesEnd;
+
+ const float* mVirtualParticleWeights;
+ uint32_t mNumVirtualParticleWeights;
+
+ bool mEnableContinuousCollision;
+ float mFrictionScale;
+ float mCollisionMassScale;
+
+ float mSelfCollisionDistance;
+ float mSelfCollisionStiffness;
+
+ uint32_t mNumSelfCollisionIndices;
+ const uint32_t* mSelfCollisionIndices;
+
+ float* mRestPositions;
+
+ // sleep data
+ uint32_t mSleepPassCounter;
+ uint32_t mSleepTestCounter;
+
+} PX_ALIGN_SUFFIX(16);
+}
+}
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/SwCollision.cpp b/PhysX_3.4/Source/LowLevelCloth/src/SwCollision.cpp
new file mode 100644
index 00000000..e505289f
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/SwCollision.cpp
@@ -0,0 +1,1935 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#include "foundation/PxProfiler.h"
+#include "foundation/PxAssert.h"
+#include "SwCollision.h"
+#include "SwCloth.h"
+#include "SwClothData.h"
+#include "IterationState.h"
+#include "BoundingBox.h"
+#include "PointInterpolator.h"
+#include "SwCollisionHelpers.h"
+#include <cstring> // for memset
+
+using namespace physx;
+
+// the particle trajectory needs to penetrate more than 0.2 * radius to trigger continuous collision
+// (stored as sqr(1 - 0.2) - 1 so it can be compared against squared, radius-relative distances)
+template <typename Simd4f>
+const Simd4f cloth::SwCollision<Simd4f>::sSkeletonWidth = simd4f(cloth::sqr(1 - 0.2f) - 1);
+
+// platform-specific constants for the Gather<> lane-lookup helper
+// (NOTE(review): exact use is defined in SwCollisionHelpers.h — not visible here)
+#if NV_SIMD_SSE2
+const Simd4i cloth::Gather<Simd4i>::sIntSignBit = simd4i(0x80000000);
+const Simd4i cloth::Gather<Simd4i>::sSignedMask = sIntSignBit | simd4i(0x7);
+#elif NV_SIMD_NEON
+const Simd4i cloth::Gather<Simd4i>::sPack = simd4i(0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c);
+const Simd4i cloth::Gather<Simd4i>::sOffset = simd4i(0x03020100);
+const Simd4i cloth::Gather<Simd4i>::sShift = simd4i(2);
+const Simd4i cloth::Gather<Simd4i>::sMask = simd4i(7);
+#endif
+
+namespace
+{
+// per-lane select masks (all bits set in the named component) and shared constants
+const Simd4fTupleFactory sMaskX = simd4f(simd4i(~0, 0, 0, 0));
+const Simd4fTupleFactory sMaskZ = simd4f(simd4i(0, 0, ~0, 0));
+const Simd4fTupleFactory sMaskW = simd4f(simd4i(0, 0, 0, ~0));
+const Simd4fTupleFactory gSimd4fOneXYZ = simd4f(1.0f, 1.0f, 1.0f, 0.0f);
+// strict upper bound for grid coordinates: just below sGridSize (= 8)
+const Simd4fScalarFactory sGridLength = simd4f(8 - 1e-3f); // sGridSize
+// relative expansion applied to grid bounds to absorb numerical error
+const Simd4fScalarFactory sGridExpand = simd4f(1e-4f);
+const Simd4fTupleFactory sMinusFloatMaxXYZ = simd4f(-FLT_MAX, -FLT_MAX, -FLT_MAX, 0.0f);
+
+#if PX_PROFILE || PX_DEBUG
+// sum of the four lanes rounded to nearest integer (profiling counters only)
+template <typename Simd4f>
+uint32_t horizontalSum(const Simd4f& x)
+{
+ const float* p = array(x);
+ return uint32_t(0.5f + p[0] + p[1] + p[2] + p[3]);
+}
+#endif
+
+// 7 elements are written to ptr!
+// (two overlapping stores: lower bound at ptr[0..3], upper bound at ptr[3..6])
+template <typename Simd4f>
+void storeBounds(float* ptr, const cloth::BoundingBox<Simd4f>& bounds)
+{
+ store(ptr, bounds.mLower);
+ store(ptr + 3, bounds.mUpper);
+}
+}
+
+// collision sphere: xyz center + w radius; 16 bytes so it can be read
+// with a single aligned SIMD load (see loadAligned(array(sIt->center)))
+struct cloth::SphereData
+{
+ PxVec3 center;
+ float radius;
+};
+
+// tapered capsule (cone) derived from a sphere pair in generateCones();
+// field order matters: center/radius and axis/slope are each loaded as one
+// SIMD float4, and sqrCosine..bothMask as one SIMD int4 ("auxiliary")
+struct cloth::ConeData
+{
+ PxVec3 center;
+ float radius; // cone radius at center
+ PxVec3 axis;
+ float slope; // tan(alpha)
+
+ float sqrCosine; // cos^2(alpha)
+ float halfLength;
+
+ // bit masks identifying the end sphere(s) in the sphere grid masks
+ uint32_t firstMask;
+ uint32_t bothMask;
+};
+
+// precomputed per-triangle quantities for point-triangle collision tests;
+// xyz vectors are padded with a related scalar so each row is one float4
+struct cloth::TriangleData
+{
+ PxVec3 base;
+ float edge0DotEdge1;
+
+ PxVec3 edge0;
+ float edge0SqrLength;
+
+ PxVec3 edge1;
+ float edge1SqrLength;
+
+ PxVec3 normal;
+ float padding;
+
+ // the four floats below are filled by a single SIMD store of reciprocals
+ // in generateTriangles(); do not reorder them
+ float det;
+ float denom;
+
+ float edge0InvSqrLength;
+ float edge1InvSqrLength;
+};
+
+namespace physx
+{
+namespace cloth
+{
+// grow bbox so it encloses every sphere in [sIt, sEnd) (center +/- radius
+// per axis; the w lane carries the radius and is splatted to all lanes)
+template <typename Simd4f>
+BoundingBox<Simd4f> expandBounds(const BoundingBox<Simd4f>& bbox, const SphereData* sIt, const SphereData* sEnd)
+{
+ BoundingBox<Simd4f> result = bbox;
+ for(; sIt != sEnd; ++sIt)
+ {
+ Simd4f p = loadAligned(array(sIt->center));
+ Simd4f r = splat<3>(p);
+ result.mLower = min(result.mLower, p - r);
+ result.mUpper = max(result.mUpper, p + r);
+ }
+ return result;
+}
+}
+}
+
+namespace
+{
+// copy `count` spheres from src to dIt, clamping negative radii (w lane) to 0;
+// xyz lanes are unaffected because sMinusFloatMaxXYZ is -FLT_MAX there
+template <typename Simd4f, typename SrcIterator>
+void generateSpheres(Simd4f* dIt, const SrcIterator& src, uint32_t count)
+{
+ // have to copy out iterator to ensure alignment is maintained
+ for(SrcIterator sIt = src; 0 < count--; ++sIt, ++dIt)
+ *dIt = max(sMinusFloatMaxXYZ, *sIt); // clamp radius to 0
+}
+
+// build one ConeData per capsule index pair from the (already generated)
+// sphere array: center/axis are the midpoint/half-offset of the two spheres,
+// slope encodes the taper from the radius difference (axis.w = second.w - first.w)
+void generateCones(cloth::ConeData* dst, const cloth::SphereData* sourceSpheres, const cloth::IndexPair* capsuleIndices,
+ uint32_t numCones)
+{
+ cloth::ConeData* cIt = dst;
+ for(const cloth::IndexPair* iIt = capsuleIndices, *iEnd = iIt + numCones; iIt != iEnd; ++iIt, ++cIt)
+ {
+ // SphereData has the same 16-byte xyz+w layout as PxVec4
+ PxVec4 first = reinterpret_cast<const PxVec4&>(sourceSpheres[iIt->first]);
+ PxVec4 second = reinterpret_cast<const PxVec4&>(sourceSpheres[iIt->second]);
+
+ PxVec4 center = (second + first) * 0.5f;
+ PxVec4 axis = (second - first) * 0.5f;
+
+ float sqrAxisLength = axis.x * axis.x + axis.y * axis.y + axis.z * axis.z;
+ float sqrConeLength = sqrAxisLength - cloth::sqr(axis.w);
+
+ float invAxisLength = 1 / sqrtf(sqrAxisLength);
+ float invConeLength = 1 / sqrtf(sqrConeLength);
+
+ // degenerate cone (one sphere contains the other): zero out so the
+ // cone is skipped later (buildConeAcceleration tests radius == 0)
+ if(sqrConeLength <= 0.0f)
+ invAxisLength = invConeLength = 0.0f;
+
+ float axisLength = sqrAxisLength * invAxisLength;
+ float slope = axis.w * invConeLength;
+
+ cIt->center = PxVec3(center.x, center.y, center.z);
+ cIt->radius = (axis.w + first.w) * invConeLength * axisLength;
+ cIt->axis = PxVec3(axis.x, axis.y, axis.z) * invAxisLength;
+ cIt->slope = slope;
+
+ cIt->sqrCosine = 1.0f - cloth::sqr(axis.w * invAxisLength);
+ cIt->halfLength = axisLength;
+
+ // one bit per sphere index; assumes sphere indices < 32 so the
+ // shift stays in range (sphere count is capped elsewhere — see header)
+ uint32_t firstMask = 0x1u << iIt->first;
+ cIt->firstMask = firstMask;
+ cIt->bothMask = firstMask | 0x1u << iIt->second;
+ }
+}
+
+// plain copy of `count` plane equations from src to dIt
+template <typename Simd4f, typename SrcIterator>
+void generatePlanes(Simd4f* dIt, const SrcIterator& src, uint32_t count)
+{
+ // have to copy out iterator to ensure alignment is maintained
+ for(SrcIterator sIt = src; 0 < count--; ++sIt, ++dIt)
+ *dIt = *sIt;
+}
+
+// consume 3 vertices per triangle from src and precompute the TriangleData
+// quantities used by the point-triangle distance test
+template <typename Simd4f, typename SrcIterator>
+void generateTriangles(cloth::TriangleData* dIt, const SrcIterator& src, uint32_t count)
+{
+ // have to copy out iterator to ensure alignment is maintained
+ for(SrcIterator sIt = src; 0 < count--; ++dIt)
+ {
+ Simd4f p0 = *sIt;
+ ++sIt;
+ Simd4f p1 = *sIt;
+ ++sIt;
+ Simd4f p2 = *sIt;
+ ++sIt;
+
+ Simd4f edge0 = p1 - p0;
+ Simd4f edge1 = p2 - p0;
+ Simd4f normal = cross3(edge0, edge1);
+
+ Simd4f edge0SqrLength = dot3(edge0, edge0);
+ Simd4f edge1SqrLength = dot3(edge1, edge1);
+ Simd4f edge0DotEdge1 = dot3(edge0, edge1);
+ Simd4f normalInvLength = rsqrt(dot3(normal, normal));
+
+ Simd4f det = edge0SqrLength * edge1SqrLength - edge0DotEdge1 * edge0DotEdge1;
+ Simd4f denom = edge0SqrLength + edge1SqrLength - edge0DotEdge1 - edge0DotEdge1;
+
+ // there are definitely faster ways...
+ // pack [det, denom, edge0SqrLength, edge1SqrLength] into one vector so a
+ // single recip() store fills det/denom/edge0InvSqrLength/edge1InvSqrLength
+ Simd4f aux = select(sMaskX, det, denom);
+ aux = select(sMaskZ, edge0SqrLength, aux);
+ aux = select(sMaskW, edge1SqrLength, aux);
+
+ // each store also tucks a scalar into the w lane (see TriangleData layout)
+ storeAligned(&dIt->base.x, select(sMaskW, edge0DotEdge1, p0));
+ storeAligned(&dIt->edge0.x, select(sMaskW, edge0SqrLength, edge0));
+ storeAligned(&dIt->edge1.x, select(sMaskW, edge1SqrLength, edge1));
+ storeAligned(&dIt->normal.x, normal * normalInvLength);
+ storeAligned(&dIt->det, recip<1>(aux));
+ }
+}
+
+} // namespace
+
+// start with null buffers; allocate() fills them in from the kernel allocator
+template <typename Simd4f>
+cloth::SwCollision<Simd4f>::CollisionData::CollisionData()
+: mSpheres(0), mCones(0)
+{
+}
+
+// allocate current-frame collision shape buffers; when continuous collision
+// or friction is enabled we also need last-frame shapes, seeded here from the
+// start poses so the first iteration has valid previous data
+template <typename Simd4f>
+cloth::SwCollision<Simd4f>::SwCollision(SwClothData& clothData, SwKernelAllocator& alloc)
+: mClothData(clothData), mAllocator(alloc)
+{
+ allocate(mCurData);
+
+ if(mClothData.mEnableContinuousCollision || mClothData.mFrictionScale > 0.0f)
+ {
+ allocate(mPrevData);
+
+ generateSpheres(reinterpret_cast<Simd4f*>(mPrevData.mSpheres),
+ reinterpret_cast<const Simd4f*>(clothData.mStartCollisionSpheres), clothData.mNumSpheres);
+
+ generateCones(mPrevData.mCones, mPrevData.mSpheres, clothData.mCapsuleIndices, clothData.mNumCapsules);
+ }
+}
+
+// release both shape buffers (deallocating the never-allocated mPrevData
+// passes null pointers to the allocator, which must tolerate that)
+template <typename Simd4f>
+cloth::SwCollision<Simd4f>::~SwCollision()
+{
+ deallocate(mCurData);
+ deallocate(mPrevData);
+}
+
+// per-iteration collision entry point: convex/triangle collision, bounds
+// update, then sphere/capsule collision through the grid acceleration structure
+template <typename Simd4f>
+void cloth::SwCollision<Simd4f>::operator()(const IterationState<Simd4f>& state)
+{
+ mNumCollisions = 0;
+
+ collideConvexes(state); // discrete convex collision, no friction
+ collideTriangles(state); // discrete triangle collision, no friction
+
+ computeBounds();
+
+ if(!mClothData.mNumSpheres)
+ return;
+
+ bool lastIteration = state.mRemainingIterations == 1;
+
+ const Simd4f* targetSpheres = reinterpret_cast<const Simd4f*>(mClothData.mTargetCollisionSpheres);
+
+ // generate sphere and cone collision data
+ if(!lastIteration)
+ {
+ // interpolate spheres between start and target pose for this iteration
+ LerpIterator<Simd4f, const Simd4f*> pIter(reinterpret_cast<const Simd4f*>(mClothData.mStartCollisionSpheres),
+ targetSpheres, state.getCurrentAlpha());
+ generateSpheres(reinterpret_cast<Simd4f*>(mCurData.mSpheres), pIter, mClothData.mNumSpheres);
+ }
+ else
+ {
+ // otherwise use the target spheres directly
+ generateSpheres(reinterpret_cast<Simd4f*>(mCurData.mSpheres), targetSpheres, mClothData.mNumSpheres);
+ }
+
+ // generate cones even if test below fails because
+ // continuous collision might need it in next iteration
+ generateCones(mCurData.mCones, mCurData.mSpheres, mClothData.mCapsuleIndices, mClothData.mNumCapsules);
+
+ // buildAcceleration() returns false when shapes and particles don't overlap
+ if(buildAcceleration())
+ {
+ if(mClothData.mEnableContinuousCollision)
+ collideContinuousParticles();
+
+ mergeAcceleration(reinterpret_cast<uint32_t*>(mSphereGrid));
+ mergeAcceleration(reinterpret_cast<uint32_t*>(mConeGrid));
+
+ if(!mClothData.mEnableContinuousCollision)
+ collideParticles();
+
+ collideVirtualParticles();
+ }
+
+ // keep this iteration's shapes as "previous" for ccd/friction next time
+ if(mPrevData.mSpheres)
+ shdfnd::swap(mCurData, mPrevData);
+}
+
+// scratch-memory upper bound: triangle and plane buffers are not live at the
+// same time, so the estimate is the max of the two, not the sum
+template <typename Simd4f>
+size_t cloth::SwCollision<Simd4f>::estimateTemporaryMemory(const SwCloth& cloth)
+{
+ size_t numTriangles = cloth.mStartCollisionTriangles.size();
+ size_t numPlanes = cloth.mStartCollisionPlanes.size();
+
+ const size_t kTriangleDataSize = sizeof(TriangleData) * numTriangles;
+ const size_t kPlaneDataSize = sizeof(PxVec4) * numPlanes * 2;
+
+ return PxMax(kTriangleDataSize, kPlaneDataSize);
+}
+
+// persistent memory: factor 2 covers the cur + prev shape buffers that the
+// constructor may allocate (see allocate() calls for mCurData/mPrevData)
+template <typename Simd4f>
+size_t cloth::SwCollision<Simd4f>::estimatePersistentMemory(const SwCloth& cloth)
+{
+ size_t numCapsules = cloth.mCapsuleIndices.size();
+ size_t numSpheres = cloth.mStartCollisionSpheres.size();
+
+ size_t sphereDataSize = sizeof(SphereData) * numSpheres * 2;
+ size_t coneDataSize = sizeof(ConeData) * numCapsules * 2;
+
+ return sphereDataSize + coneDataSize;
+}
+
+// grab sphere and cone buffers sized for the current cloth from the kernel allocator
+template <typename Simd4f>
+void cloth::SwCollision<Simd4f>::allocate(CollisionData& data)
+{
+ data.mSpheres = static_cast<SphereData*>(mAllocator.allocate(sizeof(SphereData) * mClothData.mNumSpheres));
+
+ data.mCones = static_cast<ConeData*>(mAllocator.allocate(sizeof(ConeData) * mClothData.mNumCapsules));
+}
+
+// return both buffers; safe for a default-constructed (all-null) CollisionData
+template <typename Simd4f>
+void cloth::SwCollision<Simd4f>::deallocate(const CollisionData& data)
+{
+ mAllocator.deallocate(data.mSpheres);
+ mAllocator.deallocate(data.mCones);
+}
+
+// compute the current particle bounding box and rotate cur -> prev bounds;
+// also restores the previous w lane (inverse mass, presumably — confirm in
+// SwClothData) for particles whose w was left > 0
+template <typename Simd4f>
+void cloth::SwCollision<Simd4f>::computeBounds()
+{
+ PX_PROFILE_ZONE("cloth::SwSolverKernel::computeBounds", 0);
+
+ Simd4f* prevIt = reinterpret_cast<Simd4f*>(mClothData.mPrevParticles);
+ Simd4f* curIt = reinterpret_cast<Simd4f*>(mClothData.mCurParticles);
+ Simd4f* curEnd = curIt + mClothData.mNumParticles;
+ // (+FLT_MAX, +FLT_MAX, +FLT_MAX, 0) — comparison below is true in w only
+ Simd4f floatMaxXYZ = -static_cast<Simd4f>(sMinusFloatMaxXYZ);
+
+ Simd4f lower = simd4f(FLT_MAX), upper = -lower;
+ for(; curIt < curEnd; ++curIt, ++prevIt)
+ {
+ Simd4f current = *curIt;
+ lower = min(lower, current);
+ upper = max(upper, current);
+ // if(current.w > 0) current.w = previous.w
+ *curIt = select(current > floatMaxXYZ, *prevIt, current);
+ }
+
+ BoundingBox<Simd4f> curBounds;
+ curBounds.mLower = lower;
+ curBounds.mUpper = upper;
+
+ // don't change this order, storeBounds writes 7 floats
+ // (cur and prev bounds overlap in memory, so read prev before overwriting)
+ BoundingBox<Simd4f> prevBounds = loadBounds<Simd4f>(mClothData.mCurBounds);
+ storeBounds(mClothData.mCurBounds, curBounds);
+ storeBounds(mClothData.mPrevBounds, prevBounds);
+}
+
+namespace
+{
+// per-lane test: true where left has no bits set outside of right
+// (i.e. (left & ~right) == 0)
+template <typename Simd4i>
+Simd4i andNotIsZero(const Simd4i& left, const Simd4i& right)
+{
+ return (left & ~right) == gSimd4iZero;
+}
+}
+
+// build per-axis mask arrays of spheres on the right/left of grid cell
+// sphere i sets bit i in firstIt[j] for every cell j at/after its lower grid
+// coordinate, and in lastIt[j] for every cell at/before its upper coordinate;
+// mergeAcceleration() later ANDs the two halves into an overlap mask
+template <typename Simd4f>
+void cloth::SwCollision<Simd4f>::buildSphereAcceleration(const SphereData* sIt)
+{
+ static const int maxIndex = sGridSize - 1;
+
+ const SphereData* sEnd = sIt + mClothData.mNumSpheres;
+ for(uint32_t mask = 0x1; sIt != sEnd; ++sIt, mask <<= 1)
+ {
+ Simd4f sphere = loadAligned(array(sIt->center));
+ Simd4f radius = splat<3>(sphere);
+
+ // sphere extents in grid coordinates, clamped to [0, sGridSize)
+ Simd4i first = intFloor(max((sphere - radius) * mGridScale + mGridBias, gSimd4fZero));
+ Simd4i last = intFloor(min((sphere + radius) * mGridScale + mGridBias, sGridLength));
+
+ const int* firstIdx = array(first);
+ const int* lastIdx = array(last);
+
+ uint32_t* firstIt = reinterpret_cast<uint32_t*>(mSphereGrid);
+ uint32_t* lastIt = firstIt + 3 * sGridSize;
+
+ // one row of sGridSize cells per axis (x, y, z)
+ for(uint32_t i = 0; i < 3; ++i, firstIt += sGridSize, lastIt += sGridSize)
+ {
+ for(int j = firstIdx[i]; j <= maxIndex; ++j)
+ firstIt[j] |= mask;
+
+ for(int j = lastIdx[i]; j >= 0; --j)
+ lastIt[j] |= mask;
+ }
+ }
+}
+
+// generate cone masks from sphere masks
+// a cone occupies every grid cell touched by either of its end spheres;
+// cones with radius 0 (degenerate, see generateCones) are skipped
+template <typename Simd4f>
+void cloth::SwCollision<Simd4f>::buildConeAcceleration()
+{
+ const ConeData* coneIt = mCurData.mCones;
+ const ConeData* coneEnd = coneIt + mClothData.mNumCapsules;
+ for(uint32_t coneMask = 0x1; coneIt != coneEnd; ++coneIt, coneMask <<= 1)
+ {
+ if(coneIt->radius == 0.0f)
+ continue;
+
+ uint32_t spheresMask = coneIt->bothMask;
+
+ uint32_t* sphereIt = reinterpret_cast<uint32_t*>(mSphereGrid);
+ uint32_t* sphereEnd = sphereIt + 6 * sGridSize;
+ uint32_t* gridIt = reinterpret_cast<uint32_t*>(mConeGrid);
+ for(; sphereIt != sphereEnd; ++sphereIt, ++gridIt)
+ if(*sphereIt & spheresMask)
+ *gridIt |= coneMask;
+ }
+}
+
+// convert right/left mask arrays into single overlap array
+// (in-place AND of the first 3*sGridSize entries with the second half)
+template <typename Simd4f>
+void cloth::SwCollision<Simd4f>::mergeAcceleration(uint32_t* firstIt)
+{
+ uint32_t* firstEnd = firstIt + 3 * sGridSize;
+ uint32_t* lastIt = firstEnd;
+ for(; firstIt != firstEnd; ++firstIt, ++lastIt)
+ *firstIt &= *lastIt;
+}
+
+// build mask of spheres/cones touching a regular grid along each axis
+// returns false (skip collision) when the sphere and particle bounds don't overlap
+template <typename Simd4f>
+bool cloth::SwCollision<Simd4f>::buildAcceleration()
+{
+ // determine sphere bbox
+ BoundingBox<Simd4f> sphereBounds =
+ expandBounds(emptyBounds<Simd4f>(), mCurData.mSpheres, mCurData.mSpheres + mClothData.mNumSpheres);
+ BoundingBox<Simd4f> particleBounds = loadBounds<Simd4f>(mClothData.mCurBounds);
+ if(mClothData.mEnableContinuousCollision)
+ {
+ // ccd sweeps between frames, so include last frame's shapes and particles
+ sphereBounds = expandBounds(sphereBounds, mPrevData.mSpheres, mPrevData.mSpheres + mClothData.mNumSpheres);
+ particleBounds = expandBounds(particleBounds, loadBounds<Simd4f>(mClothData.mPrevBounds));
+ }
+
+ // empty intersection (any negative xyz edge) means nothing can collide
+ BoundingBox<Simd4f> bounds = intersectBounds(sphereBounds, particleBounds);
+ Simd4f edgeLength = (bounds.mUpper - bounds.mLower) & ~static_cast<Simd4f>(sMaskW);
+ if(!allGreaterEqual(edgeLength, gSimd4fZero))
+ return false;
+
+ // calculate an expanded bounds to account for numerical inaccuracy
+ const Simd4f expandedLower = bounds.mLower - abs(bounds.mLower) * sGridExpand;
+ const Simd4f expandedUpper = bounds.mUpper + abs(bounds.mUpper) * sGridExpand;
+ const Simd4f expandedEdgeLength = max(expandedUpper - expandedLower, gSimd4fEpsilon);
+
+ // make grid minimal thickness and strict upper bound of spheres
+ // grid coordinate = world * mGridScale + mGridBias, mapped into [0, sGridSize)
+ mGridScale = sGridLength * recip<1>(expandedEdgeLength);
+ mGridBias = -expandedLower * mGridScale;
+ array(mGridBias)[3] = 1.0f; // needed for collideVirtualParticles()
+
+ PX_ASSERT(allTrue(((bounds.mLower * mGridScale + mGridBias) >= simd4f(0.0f)) | sMaskW));
+ PX_ASSERT(allTrue(((bounds.mUpper * mGridScale + mGridBias) < simd4f(8.0f)) | sMaskW));
+
+ memset(mSphereGrid, 0, sizeof(uint32_t) * 6 * (sGridSize));
+ if(mClothData.mEnableContinuousCollision)
+ buildSphereAcceleration(mPrevData.mSpheres);
+ buildSphereAcceleration(mCurData.mSpheres);
+
+ memset(mConeGrid, 0, sizeof(uint32_t) * 6 * (sGridSize));
+ buildConeAcceleration();
+
+ return true;
+}
+
+// portable force-inline attribute (MSVC vs. GCC/Clang spelling)
+#ifdef _MSC_VER
+#define FORCE_INLINE __forceinline
+#else
+#define FORCE_INLINE inline __attribute__((always_inline))
+#endif
+
+// ShapeMask holds per-lane candidate bit masks for cones and spheres;
+// explicit assignment to keep the SIMD members register-friendly
+template <typename Simd4f>
+FORCE_INLINE typename cloth::SwCollision<Simd4f>::ShapeMask& cloth::SwCollision<Simd4f>::ShapeMask::
+operator=(const ShapeMask& right)
+{
+ mCones = right.mCones;
+ mSpheres = right.mSpheres;
+ return *this;
+}
+
+// intersect candidate sets (used to combine per-axis lookups)
+template <typename Simd4f>
+FORCE_INLINE typename cloth::SwCollision<Simd4f>::ShapeMask& cloth::SwCollision<Simd4f>::ShapeMask::
+operator&=(const ShapeMask& right)
+{
+ mCones = mCones & right.mCones;
+ mSpheres = mSpheres & right.mSpheres;
+ return *this;
+}
+
+// gather one grid-row entry per lane (cell index = floor(position))
+// from the given sphere/cone mask rows
+template <typename Simd4f>
+FORCE_INLINE typename cloth::SwCollision<Simd4f>::ShapeMask
+cloth::SwCollision<Simd4f>::getShapeMask(const Simd4f& position, const Simd4i* __restrict sphereGrid,
+ const Simd4i* __restrict coneGrid)
+{
+ Gather<Simd4i> gather(intFloor(position));
+
+ ShapeMask result;
+ result.mCones = gather(coneGrid);
+ result.mSpheres = gather(sphereGrid);
+ return result;
+}
+
+// lookup acceleration structure and return mask of potential intersectors
+// (transform positions to grid space, then AND the x/y/z axis lookups;
+// each axis row is 2 Simd4i = sGridSize uint32 entries apart)
+template <typename Simd4f>
+FORCE_INLINE typename cloth::SwCollision<Simd4f>::ShapeMask
+cloth::SwCollision<Simd4f>::getShapeMask(const Simd4f* __restrict positions) const
+{
+ Simd4f posX = positions[0] * splat<0>(mGridScale) + splat<0>(mGridBias);
+ Simd4f posY = positions[1] * splat<1>(mGridScale) + splat<1>(mGridBias);
+ Simd4f posZ = positions[2] * splat<2>(mGridScale) + splat<2>(mGridBias);
+
+ ShapeMask result = getShapeMask(posX, mSphereGrid, mConeGrid);
+ result &= getShapeMask(posY, mSphereGrid + 2, mConeGrid + 2);
+ result &= getShapeMask(posZ, mSphereGrid + 4, mConeGrid + 4);
+
+ return result;
+}
+
+// lookup acceleration structure and return mask of potential intersectors
+// swept variant for ccd: tests the per-axis extents of the prev->cur segment
+// against the un-merged "first" (lower 6 rows) and "last" (upper 6 rows)
+// halves of the grid, so it must run before mergeAcceleration()
+template <typename Simd4f>
+FORCE_INLINE typename cloth::SwCollision<Simd4f>::ShapeMask
+cloth::SwCollision<Simd4f>::getShapeMask(const Simd4f* __restrict prevPos, const Simd4f* __restrict curPos) const
+{
+ Simd4f scaleX = splat<0>(mGridScale);
+ Simd4f scaleY = splat<1>(mGridScale);
+ Simd4f scaleZ = splat<2>(mGridScale);
+
+ Simd4f biasX = splat<0>(mGridBias);
+ Simd4f biasY = splat<1>(mGridBias);
+ Simd4f biasZ = splat<2>(mGridBias);
+
+ Simd4f prevX = prevPos[0] * scaleX + biasX;
+ Simd4f prevY = prevPos[1] * scaleY + biasY;
+ Simd4f prevZ = prevPos[2] * scaleZ + biasZ;
+
+ Simd4f curX = curPos[0] * scaleX + biasX;
+ Simd4f curY = curPos[1] * scaleY + biasY;
+ Simd4f curZ = curPos[2] * scaleZ + biasZ;
+
+ // segment upper extent, clamped into the grid
+ Simd4f maxX = min(max(prevX, curX), sGridLength);
+ Simd4f maxY = min(max(prevY, curY), sGridLength);
+ Simd4f maxZ = min(max(prevZ, curZ), sGridLength);
+
+ ShapeMask result = getShapeMask(maxX, mSphereGrid, mConeGrid);
+ result &= getShapeMask(maxY, mSphereGrid + 2, mConeGrid + 2);
+ result &= getShapeMask(maxZ, mSphereGrid + 4, mConeGrid + 4);
+
+ // segment lower extent, looked up in the "last" half of the grid
+ Simd4f zero = gSimd4fZero;
+ Simd4f minX = max(min(prevX, curX), zero);
+ Simd4f minY = max(min(prevY, curY), zero);
+ Simd4f minZ = max(min(prevZ, curZ), zero);
+
+ result &= getShapeMask(minX, mSphereGrid + 6, mConeGrid + 6);
+ result &= getShapeMask(minY, mSphereGrid + 8, mConeGrid + 8);
+ result &= getShapeMask(minZ, mSphereGrid + 10, mConeGrid + 10);
+
+ return result;
+}
+
+// accumulates collision displacement (mDelta*) and shape velocity (mVel*)
+// per particle lane; impulses are averaged over mNumCollisions by the caller
+template <typename Simd4f>
+struct cloth::SwCollision<Simd4f>::ImpulseAccumulator
+{
+ ImpulseAccumulator()
+ : mDeltaX(gSimd4fZero)
+ , mDeltaY(mDeltaX)
+ , mDeltaZ(mDeltaX)
+ , mVelX(mDeltaX)
+ , mVelY(mDeltaX)
+ , mVelZ(mDeltaX)
+ // epsilon instead of zero: presumably guards a later divide by the
+ // collision count — confirm at the call sites
+ , mNumCollisions(gSimd4fEpsilon)
+ {
+ }
+
+ void add(const Simd4f& x, const Simd4f& y, const Simd4f& z, const Simd4f& scale, const Simd4f& mask)
+ {
+ // v == v is false only for NaN, so these assert masked lanes are not NaN
+ PX_ASSERT(allTrue((mask & x) == (mask & x)));
+ PX_ASSERT(allTrue((mask & y) == (mask & y)));
+ PX_ASSERT(allTrue((mask & z) == (mask & z)));
+ PX_ASSERT(allTrue((mask & scale) == (mask & scale)));
+
+ Simd4f maskedScale = scale & mask;
+ mDeltaX = mDeltaX + x * maskedScale;
+ mDeltaY = mDeltaY + y * maskedScale;
+ mDeltaZ = mDeltaZ + z * maskedScale;
+ mNumCollisions = mNumCollisions + (gSimd4fOne & mask);
+ }
+
+ void addVelocity(const Simd4f& vx, const Simd4f& vy, const Simd4f& vz, const Simd4f& mask)
+ {
+ PX_ASSERT(allTrue((mask & vx) == (mask & vx)));
+ PX_ASSERT(allTrue((mask & vy) == (mask & vy)));
+ PX_ASSERT(allTrue((mask & vz) == (mask & vz)));
+
+ mVelX = mVelX + (vx & mask);
+ mVelY = mVelY + (vy & mask);
+ mVelZ = mVelZ + (vz & mask);
+ }
+
+ // same as add() with negated direction (used for push-out from spheres)
+ void subtract(const Simd4f& x, const Simd4f& y, const Simd4f& z, const Simd4f& scale, const Simd4f& mask)
+ {
+ PX_ASSERT(allTrue((mask & x) == (mask & x)));
+ PX_ASSERT(allTrue((mask & y) == (mask & y)));
+ PX_ASSERT(allTrue((mask & z) == (mask & z)));
+ PX_ASSERT(allTrue((mask & scale) == (mask & scale)));
+
+ Simd4f maskedScale = scale & mask;
+ mDeltaX = mDeltaX - x * maskedScale;
+ mDeltaY = mDeltaY - y * maskedScale;
+ mDeltaZ = mDeltaZ - z * maskedScale;
+ mNumCollisions = mNumCollisions + (gSimd4fOne & mask);
+ }
+
+ Simd4f mDeltaX, mDeltaY, mDeltaZ;
+ Simd4f mVelX, mVelY, mVelZ;
+ Simd4f mNumCollisions;
+};
+
+// discrete sphere collision for 4 particles (SoA in positions[0..2]):
+// iterate candidate spheres by peeling bits off sphereMask and push
+// penetrating particles out along the center-to-particle direction
+template <typename Simd4f>
+FORCE_INLINE void cloth::SwCollision<Simd4f>::collideSpheres(const Simd4i& sphereMask, const Simd4f* positions,
+ ImpulseAccumulator& accum) const
+{
+ const float* __restrict spherePtr = array(mCurData.mSpheres->center);
+
+ bool frictionEnabled = mClothData.mFrictionScale > 0.0f;
+
+ // collapse the per-lane masks into one scalar candidate bit set
+ Simd4i mask4 = horizontalOr(sphereMask);
+ uint32_t mask = uint32_t(array(mask4)[0]);
+ while(mask)
+ {
+ // extract and clear the lowest set bit -> byte offset of that sphere
+ uint32_t test = mask - 1;
+ uint32_t offset = findBitSet(mask & ~test) * sizeof(SphereData);
+ mask = mask & test;
+
+ Simd4f sphere = loadAligned(spherePtr, offset);
+
+ Simd4f deltaX = positions[0] - splat<0>(sphere);
+ Simd4f deltaY = positions[1] - splat<1>(sphere);
+ Simd4f deltaZ = positions[2] - splat<2>(sphere);
+
+ Simd4f sqrDistance = gSimd4fEpsilon + deltaX * deltaX + deltaY * deltaY + deltaZ * deltaZ;
+ // negative where the particle is inside the sphere (distance < radius)
+ Simd4f negativeScale = gSimd4fOne - rsqrt(sqrDistance) * splat<3>(sphere);
+
+ Simd4f contactMask;
+ if(!anyGreater(gSimd4fZero, negativeScale, contactMask))
+ continue;
+
+ accum.subtract(deltaX, deltaY, deltaZ, negativeScale, contactMask);
+
+ if(frictionEnabled)
+ {
+ // load previous sphere pos
+ const float* __restrict prevSpherePtr = array(mPrevData.mSpheres->center);
+
+ Simd4f prevSphere = loadAligned(prevSpherePtr, offset);
+ Simd4f velocity = sphere - prevSphere;
+
+ accum.addVelocity(splat<0>(velocity), splat<1>(velocity), splat<2>(velocity), contactMask);
+ }
+ }
+}
+
+// discrete cone (tapered capsule) collision for 4 particles; returns the
+// sphere candidate mask with bits cleared for end spheres the cone test
+// already ruled out, so the caller only tests the remaining spheres
+template <typename Simd4f>
+FORCE_INLINE typename cloth::SwCollision<Simd4f>::Simd4i
+cloth::SwCollision<Simd4f>::collideCones(const Simd4f* __restrict positions, ImpulseAccumulator& accum) const
+{
+ const float* __restrict centerPtr = array(mCurData.mCones->center);
+ const float* __restrict axisPtr = array(mCurData.mCones->axis);
+ const int32_t* __restrict auxiliaryPtr = reinterpret_cast<const int32_t*>(&mCurData.mCones->sqrCosine);
+
+ bool frictionEnabled = mClothData.mFrictionScale > 0.0f;
+
+ ShapeMask shapeMask = getShapeMask(positions);
+ Simd4i mask4 = horizontalOr(shapeMask.mCones);
+ uint32_t mask = uint32_t(array(mask4)[0]);
+ while(mask)
+ {
+ // peel off the lowest candidate cone bit
+ uint32_t test = mask - 1;
+ uint32_t coneIndex = findBitSet(mask & ~test);
+ uint32_t offset = coneIndex * sizeof(ConeData);
+ mask = mask & test;
+
+ // lanes whose own mask did not contain this cone bit are "culled"
+ Simd4i test4 = mask4 - gSimd4iOne;
+ Simd4f culled = simd4f(andNotIsZero(shapeMask.mCones, test4));
+ mask4 = mask4 & test4;
+
+ Simd4f center = loadAligned(centerPtr, offset);
+
+ Simd4f deltaX = positions[0] - splat<0>(center);
+ Simd4f deltaY = positions[1] - splat<1>(center);
+ Simd4f deltaZ = positions[2] - splat<2>(center);
+
+ Simd4f axis = loadAligned(axisPtr, offset);
+
+ Simd4f axisX = splat<0>(axis);
+ Simd4f axisY = splat<1>(axis);
+ Simd4f axisZ = splat<2>(axis);
+ Simd4f slope = splat<3>(axis);
+
+ // dot = signed distance along the cone axis from its center
+ Simd4f dot = deltaX * axisX + deltaY * axisY + deltaZ * axisZ;
+ Simd4f radius = dot * slope + splat<3>(center);
+
+ // set radius to zero if cone is culled
+ radius = max(radius, gSimd4fZero) & ~culled;
+
+ // squared distance from the axis line
+ Simd4f sqrDistance = deltaX * deltaX + deltaY * deltaY + deltaZ * deltaZ - dot * dot;
+
+ // auxiliary = [sqrCosine, halfLength, firstMask, bothMask] (see ConeData)
+ Simd4i auxiliary = loadAligned(auxiliaryPtr, offset);
+ Simd4i bothMask = splat<3>(auxiliary);
+
+ Simd4f contactMask;
+ if(!anyGreater(radius * radius, sqrDistance, contactMask))
+ {
+ // cone only culled when spheres culled, ok to clear those too
+ shapeMask.mSpheres = shapeMask.mSpheres & ~bothMask;
+ continue;
+ }
+
+ // clamp to a small positive epsilon to avoid numerical error
+ // making sqrDistance negative when point lies on the cone axis
+ sqrDistance = max(sqrDistance, gSimd4fEpsilon);
+
+ Simd4f invDistance = rsqrt(sqrDistance);
+ // axial coordinate of the closest point on the cone surface
+ Simd4f base = dot + slope * sqrDistance * invDistance;
+
+ // force left/rightMask to false if not inside cone
+ base = base & contactMask;
+
+ Simd4f halfLength = splat<1>(simd4f(auxiliary));
+ Simd4i leftMask = simd4i(base < -halfLength);
+ Simd4i rightMask = simd4i(base > halfLength);
+
+ // we use both mask because of the early out above.
+ // particles beyond an end cap are handed to the sphere test instead
+ Simd4i firstMask = splat<2>(auxiliary);
+ Simd4i secondMask = firstMask ^ bothMask;
+ shapeMask.mSpheres = shapeMask.mSpheres & ~(firstMask & ~leftMask);
+ shapeMask.mSpheres = shapeMask.mSpheres & ~(secondMask & ~rightMask);
+
+ // delta becomes the radial vector from the axis to the particle
+ deltaX = deltaX - base * axisX;
+ deltaY = deltaY - base * axisY;
+ deltaZ = deltaZ - base * axisZ;
+
+ Simd4f sqrCosine = splat<0>(simd4f(auxiliary));
+ Simd4f scale = radius * invDistance * sqrCosine - sqrCosine;
+
+ contactMask = contactMask & ~simd4f(leftMask | rightMask);
+
+ if(!anyTrue(contactMask))
+ continue;
+
+ accum.add(deltaX, deltaY, deltaZ, scale, contactMask);
+
+ if(frictionEnabled)
+ {
+ uint32_t s0 = mClothData.mCapsuleIndices[coneIndex].first;
+ uint32_t s1 = mClothData.mCapsuleIndices[coneIndex].second;
+
+ float* prevSpheres = reinterpret_cast<float*>(mPrevData.mSpheres);
+ float* curSpheres = reinterpret_cast<float*>(mCurData.mSpheres);
+
+ // todo: could pre-compute sphere velocities or it might be
+ // faster to compute cur/prev sphere positions directly
+ Simd4f s0p0 = loadAligned(prevSpheres, s0 * sizeof(SphereData));
+ Simd4f s0p1 = loadAligned(curSpheres, s0 * sizeof(SphereData));
+
+ Simd4f s1p0 = loadAligned(prevSpheres, s1 * sizeof(SphereData));
+ Simd4f s1p1 = loadAligned(curSpheres, s1 * sizeof(SphereData));
+
+ Simd4f v0 = s0p1 - s0p0;
+ Simd4f v1 = s1p1 - s1p0;
+ Simd4f vd = v1 - v0;
+
+ // dot is in the range -1 to 1, scale and bias to 0 to 1
+ // NOTE(review): relies on dot being normalized by the axis
+ // half-length at this point — verify against ConeData units
+ dot = dot * gSimd4fHalf + gSimd4fHalf;
+
+ // interpolate velocity at contact points
+ Simd4f vx = splat<0>(v0) + dot * splat<0>(vd);
+ Simd4f vy = splat<1>(v0) + dot * splat<1>(vd);
+ Simd4f vz = splat<2>(v0) + dot * splat<2>(vd);
+
+ accum.addVelocity(vx, vy, vz, contactMask);
+ }
+ }
+
+ return shapeMask.mSpheres;
+}
+
+// continuous (swept) sphere collision: solve the quadratic for the time of
+// impact of each particle's prev->cur trajectory against the moving, possibly
+// growing sphere, rewind curPos to the impact point, then apply the same
+// discrete push-out as the non-ccd path
+template <typename Simd4f>
+FORCE_INLINE void cloth::SwCollision<Simd4f>::collideSpheres(const Simd4i& sphereMask, const Simd4f* __restrict prevPos,
+ Simd4f* __restrict curPos, ImpulseAccumulator& accum) const
+{
+ const float* __restrict prevSpheres = array(mPrevData.mSpheres->center);
+ const float* __restrict curSpheres = array(mCurData.mSpheres->center);
+
+ bool frictionEnabled = mClothData.mFrictionScale > 0.0f;
+
+ Simd4i mask4 = horizontalOr(sphereMask);
+ uint32_t mask = uint32_t(array(mask4)[0]);
+ while(mask)
+ {
+ // peel off the lowest candidate sphere bit
+ uint32_t test = mask - 1;
+ uint32_t offset = findBitSet(mask & ~test) * sizeof(SphereData);
+ mask = mask & test;
+
+ // particle position relative to the sphere at both ends of the step
+ Simd4f prevSphere = loadAligned(prevSpheres, offset);
+ Simd4f prevX = prevPos[0] - splat<0>(prevSphere);
+ Simd4f prevY = prevPos[1] - splat<1>(prevSphere);
+ Simd4f prevZ = prevPos[2] - splat<2>(prevSphere);
+ Simd4f prevRadius = splat<3>(prevSphere);
+
+ Simd4f curSphere = loadAligned(curSpheres, offset);
+ Simd4f curX = curPos[0] - splat<0>(curSphere);
+ Simd4f curY = curPos[1] - splat<1>(curSphere);
+ Simd4f curZ = curPos[2] - splat<2>(curSphere);
+ Simd4f curRadius = splat<3>(curSphere);
+
+ Simd4f sqrDistance = gSimd4fEpsilon + curX * curX + curY * curY + curZ * curZ;
+
+ // quadratic coefficients of |p(t)| - r(t) = 0 in radius-relative form
+ Simd4f dotPrevPrev = prevX * prevX + prevY * prevY + prevZ * prevZ - prevRadius * prevRadius;
+ Simd4f dotPrevCur = prevX * curX + prevY * curY + prevZ * curZ - prevRadius * curRadius;
+ Simd4f dotCurCur = sqrDistance - curRadius * curRadius;
+
+ Simd4f discriminant = dotPrevCur * dotPrevCur - dotCurCur * dotPrevPrev;
+ Simd4f sqrtD = sqrt(discriminant);
+ Simd4f halfB = dotPrevCur - dotPrevPrev;
+ Simd4f minusA = dotPrevCur - dotCurCur + halfB;
+
+ // time of impact or 0 if prevPos inside sphere
+ Simd4f toi = recip(minusA) * min(gSimd4fZero, halfB + sqrtD);
+ Simd4f collisionMask = (toi < gSimd4fOne) & (halfB < sqrtD);
+
+ // skip continuous collision if the (un-clamped) particle
+ // trajectory only touches the outer skin of the cone.
+ Simd4f rMin = prevRadius + halfB * minusA * (curRadius - prevRadius);
+ collisionMask = collisionMask & (discriminant > minusA * rMin * rMin * sSkeletonWidth);
+
+ // a is negative when one sphere is contained in the other,
+ // which is already handled by discrete collision.
+ collisionMask = collisionMask & (minusA < -static_cast<Simd4f>(gSimd4fEpsilon));
+
+ if(!allEqual(collisionMask, gSimd4fZero))
+ {
+ Simd4f deltaX = prevX - curX;
+ Simd4f deltaY = prevY - curY;
+ Simd4f deltaZ = prevZ - curZ;
+
+ Simd4f oneMinusToi = (gSimd4fOne - toi) & collisionMask;
+
+ // reduce ccd impulse if (clamped) particle trajectory stays in sphere skin,
+ // i.e. scale by exp2(-k) or 1/(1+k) with k = (tmin - toi) / (1 - toi)
+ Simd4f minusK = sqrtD * recip(minusA * oneMinusToi) & (oneMinusToi > gSimd4fEpsilon);
+ oneMinusToi = oneMinusToi * recip(gSimd4fOne - minusK);
+
+ // rewind colliding lanes toward prevPos by (1 - toi) of the motion
+ curX = curX + deltaX * oneMinusToi;
+ curY = curY + deltaY * oneMinusToi;
+ curZ = curZ + deltaZ * oneMinusToi;
+
+ // write the clamped positions back (curPos is not const here)
+ curPos[0] = splat<0>(curSphere) + curX;
+ curPos[1] = splat<1>(curSphere) + curY;
+ curPos[2] = splat<2>(curSphere) + curZ;
+
+ sqrDistance = gSimd4fEpsilon + curX * curX + curY * curY + curZ * curZ;
+ }
+
+ // discrete push-out on the (possibly rewound) positions
+ Simd4f negativeScale = gSimd4fOne - rsqrt(sqrDistance) * curRadius;
+
+ Simd4f contactMask;
+ if(!anyGreater(gSimd4fZero, negativeScale, contactMask))
+ continue;
+
+ accum.subtract(curX, curY, curZ, negativeScale, contactMask);
+
+ if(frictionEnabled)
+ {
+ Simd4f velocity = curSphere - prevSphere;
+ accum.addVelocity(splat<0>(velocity), splat<1>(velocity), splat<2>(velocity), contactMask);
+ }
+ }
+}
+
+// Continuous + discrete collision of four particles (SoA layout in
+// prevPos/curPos) against all cone (tapered capsule) shapes.
+// Impulses are accumulated into 'accum'. Returns the sphere mask with
+// lanes culled where the cone handling makes the corresponding
+// sphere-cap tests redundant or undesirable.
+template <typename Simd4f>
+FORCE_INLINE typename cloth::SwCollision<Simd4f>::Simd4i
+cloth::SwCollision<Simd4f>::collideCones(const Simd4f* __restrict prevPos, Simd4f* __restrict curPos,
+ ImpulseAccumulator& accum) const
+{
+ const float* __restrict prevCenterPtr = array(mPrevData.mCones->center);
+ const float* __restrict prevAxisPtr = array(mPrevData.mCones->axis);
+ const int32_t* __restrict prevAuxiliaryPtr = reinterpret_cast<const int32_t*>(&mPrevData.mCones->sqrCosine);
+
+ const float* __restrict curCenterPtr = array(mCurData.mCones->center);
+ const float* __restrict curAxisPtr = array(mCurData.mCones->axis);
+ const int32_t* __restrict curAuxiliaryPtr = reinterpret_cast<const int32_t*>(&mCurData.mCones->sqrCosine);
+
+ bool frictionEnabled = mClothData.mFrictionScale > 0.0f;
+
+ // broad phase: bit i of 'mask' is set if any of the four particles
+ // potentially overlaps cone i
+ ShapeMask shapeMask = getShapeMask(prevPos, curPos);
+ Simd4i mask4 = horizontalOr(shapeMask.mCones);
+ uint32_t mask = uint32_t(array(mask4)[0]);
+ while(mask)
+ {
+ // extract and clear the lowest set bit -> next cone to test
+ uint32_t test = mask - 1;
+ uint32_t coneIndex = findBitSet(mask & ~test);
+ uint32_t offset = coneIndex * sizeof(ConeData);
+ mask = mask & test;
+
+ // per-lane mask of particles for which this cone's bit is NOT set
+ // in the per-particle broad-phase mask, i.e. lanes culled for this cone
+ Simd4i test4 = mask4 - gSimd4iOne;
+ Simd4f culled = simd4f(andNotIsZero(shapeMask.mCones, test4));
+ mask4 = mask4 & test4;
+
+ // particle relative to previous cone frame:
+ // (t,u,v) = cross(p - center, axis), dot = (p - center) . axis
+ Simd4f prevCenter = loadAligned(prevCenterPtr, offset);
+ Simd4f prevAxis = loadAligned(prevAxisPtr, offset);
+ Simd4f prevAxisX = splat<0>(prevAxis);
+ Simd4f prevAxisY = splat<1>(prevAxis);
+ Simd4f prevAxisZ = splat<2>(prevAxis);
+ Simd4f prevSlope = splat<3>(prevAxis);
+
+ Simd4f prevX = prevPos[0] - splat<0>(prevCenter);
+ Simd4f prevY = prevPos[1] - splat<1>(prevCenter);
+ Simd4f prevZ = prevPos[2] - splat<2>(prevCenter);
+ Simd4f prevT = prevY * prevAxisZ - prevZ * prevAxisY;
+ Simd4f prevU = prevZ * prevAxisX - prevX * prevAxisZ;
+ Simd4f prevV = prevX * prevAxisY - prevY * prevAxisX;
+ Simd4f prevDot = prevX * prevAxisX + prevY * prevAxisY + prevZ * prevAxisZ;
+ // cone radius at the particle's axial coordinate (w of center = base radius)
+ Simd4f prevRadius = prevDot * prevSlope + splat<3>(prevCenter);
+
+ // same quantities for the current cone pose
+ Simd4f curCenter = loadAligned(curCenterPtr, offset);
+ Simd4f curAxis = loadAligned(curAxisPtr, offset);
+ Simd4f curAxisX = splat<0>(curAxis);
+ Simd4f curAxisY = splat<1>(curAxis);
+ Simd4f curAxisZ = splat<2>(curAxis);
+ Simd4f curSlope = splat<3>(curAxis);
+ Simd4i curAuxiliary = loadAligned(curAuxiliaryPtr, offset);
+
+ Simd4f curX = curPos[0] - splat<0>(curCenter);
+ Simd4f curY = curPos[1] - splat<1>(curCenter);
+ Simd4f curZ = curPos[2] - splat<2>(curCenter);
+ Simd4f curT = curY * curAxisZ - curZ * curAxisY;
+ Simd4f curU = curZ * curAxisX - curX * curAxisZ;
+ Simd4f curV = curX * curAxisY - curY * curAxisX;
+ Simd4f curDot = curX * curAxisX + curY * curAxisY + curZ * curAxisZ;
+ Simd4f curRadius = curDot * curSlope + splat<3>(curCenter);
+
+ Simd4f curSqrDistance = gSimd4fEpsilon + curT * curT + curU * curU + curV * curV;
+
+ // set radius to zero if cone is culled
+ prevRadius = max(prevRadius, gSimd4fZero) & ~culled;
+ curRadius = max(curRadius, gSimd4fZero) & ~culled;
+
+ // coefficients of the quadratic in t whose root is the time of impact
+ Simd4f dotPrevPrev = prevT * prevT + prevU * prevU + prevV * prevV - prevRadius * prevRadius;
+ Simd4f dotPrevCur = prevT * curT + prevU * curU + prevV * curV - prevRadius * curRadius;
+ Simd4f dotCurCur = curSqrDistance - curRadius * curRadius;
+
+ Simd4f discriminant = dotPrevCur * dotPrevCur - dotCurCur * dotPrevPrev;
+ Simd4f sqrtD = sqrt(discriminant);
+ Simd4f halfB = dotPrevCur - dotPrevPrev;
+ Simd4f minusA = dotPrevCur - dotCurCur + halfB;
+
+ // time of impact or 0 if prevPos inside cone
+ Simd4f toi = recip(minusA) * min(gSimd4fZero, halfB + sqrtD);
+ Simd4f collisionMask = (toi < gSimd4fOne) & (halfB < sqrtD);
+
+ // skip continuous collision if the (un-clamped) particle
+ // trajectory only touches the outer skin of the cone.
+ Simd4f rMin = prevRadius + halfB * minusA * (curRadius - prevRadius);
+ collisionMask = collisionMask & (discriminant > minusA * rMin * rMin * sSkeletonWidth);
+
+ // a is negative when one cone is contained in the other,
+ // which is already handled by discrete collision.
+ collisionMask = collisionMask & (minusA < -static_cast<Simd4f>(gSimd4fEpsilon));
+
+ // test if any particle hits infinite cone (and 0<time of impact<1)
+ if(!allEqual(collisionMask, gSimd4fZero))
+ {
+ Simd4f deltaX = prevX - curX;
+ Simd4f deltaY = prevY - curY;
+ Simd4f deltaZ = prevZ - curZ;
+
+ // interpolate delta at toi
+ Simd4f posX = prevX - deltaX * toi;
+ Simd4f posY = prevY - deltaY * toi;
+ Simd4f posZ = prevZ - deltaZ * toi;
+
+ // auxiliary.y holds the cone half length (see halfLength below)
+ Simd4f curScaledAxis = curAxis * splat<1>(simd4f(curAuxiliary));
+ Simd4i prevAuxiliary = loadAligned(prevAuxiliaryPtr, offset);
+ Simd4f deltaScaledAxis = curScaledAxis - prevAxis * splat<1>(simd4f(prevAuxiliary));
+
+ Simd4f oneMinusToi = gSimd4fOne - toi;
+
+ // interpolate axis at toi
+ Simd4f axisX = splat<0>(curScaledAxis) - splat<0>(deltaScaledAxis) * oneMinusToi;
+ Simd4f axisY = splat<1>(curScaledAxis) - splat<1>(deltaScaledAxis) * oneMinusToi;
+ Simd4f axisZ = splat<2>(curScaledAxis) - splat<2>(deltaScaledAxis) * oneMinusToi;
+ Simd4f slope = (prevSlope * oneMinusToi + curSlope * toi);
+
+ Simd4f sqrHalfLength = axisX * axisX + axisY * axisY + axisZ * axisZ;
+ Simd4f invHalfLength = rsqrt(sqrHalfLength);
+ Simd4f dot = (posX * axisX + posY * axisY + posZ * axisZ) * invHalfLength;
+
+ Simd4f sqrDistance = posX * posX + posY * posY + posZ * posZ - dot * dot;
+ // guard against rsqrt(0) for particles exactly on the axis
+ Simd4f invDistance = rsqrt(sqrDistance) & (sqrDistance > gSimd4fZero);
+
+ Simd4f base = dot + slope * sqrDistance * invDistance;
+ Simd4f scale = base * invHalfLength & collisionMask;
+
+ Simd4f cullMask = (abs(scale) < gSimd4fOne) & collisionMask;
+
+ // test if any impact position is in cone section
+ if(!allEqual(cullMask, gSimd4fZero))
+ {
+ deltaX = deltaX + splat<0>(deltaScaledAxis) * scale;
+ deltaY = deltaY + splat<1>(deltaScaledAxis) * scale;
+ deltaZ = deltaZ + splat<2>(deltaScaledAxis) * scale;
+
+ oneMinusToi = oneMinusToi & cullMask;
+
+ // reduce ccd impulse if (clamped) particle trajectory stays in cone skin,
+ // i.e. scale by exp2(-k) or 1/(1+k) with k = (tmin - toi) / (1 - toi)
+ // oneMinusToi = oneMinusToi * recip(gSimd4fOne - sqrtD * recip(minusA * oneMinusToi));
+ Simd4f minusK = sqrtD * recip(minusA * oneMinusToi) & (oneMinusToi > gSimd4fEpsilon);
+ oneMinusToi = oneMinusToi * recip(gSimd4fOne - minusK);
+
+ // advance the current position along the remaining trajectory
+ curX = curX + deltaX * oneMinusToi;
+ curY = curY + deltaY * oneMinusToi;
+ curZ = curZ + deltaZ * oneMinusToi;
+
+ // re-derive the discrete-collision quantities at the moved position
+ curDot = curX * curAxisX + curY * curAxisY + curZ * curAxisZ;
+ curRadius = curDot * curSlope + splat<3>(curCenter);
+ curRadius = max(curRadius, gSimd4fZero) & ~culled;
+ curSqrDistance = curX * curX + curY * curY + curZ * curZ - curDot * curDot;
+
+ curPos[0] = splat<0>(curCenter) + curX;
+ curPos[1] = splat<1>(curCenter) + curY;
+ curPos[2] = splat<2>(curCenter) + curZ;
+ }
+ }
+
+ // curPos inside cone (discrete collision)
+ Simd4f contactMask;
+ int anyContact = anyGreater(curRadius * curRadius, curSqrDistance, contactMask);
+
+ // auxiliary.w: combined bit mask of this capsule's two end-cap spheres
+ Simd4i bothMask = splat<3>(curAuxiliary);
+
+ // instead of culling continuous collision for ~collisionMask, and discrete
+ // collision for ~contactMask, disable both if ~collisionMask & ~contactMask
+ Simd4i cullMask = bothMask & ~simd4i(collisionMask | contactMask);
+ shapeMask.mSpheres = shapeMask.mSpheres & ~cullMask;
+
+ if(!anyContact)
+ continue;
+
+ Simd4f invDistance = rsqrt(curSqrDistance) & (curSqrDistance > gSimd4fZero);
+ Simd4f base = curDot + curSlope * curSqrDistance * invDistance;
+
+ // classify particles beyond either end of the cone section
+ Simd4f halfLength = splat<1>(simd4f(curAuxiliary));
+ Simd4i leftMask = simd4i(base < -halfLength);
+ Simd4i rightMask = simd4i(base > halfLength);
+
+ // can only skip continuous sphere collision if post-ccd position
+ // is on cone side *and* particle had cone-ccd collision.
+ Simd4i firstMask = splat<2>(curAuxiliary);
+ Simd4i secondMask = firstMask ^ bothMask;
+ cullMask = (firstMask & ~leftMask) | (secondMask & ~rightMask);
+ shapeMask.mSpheres = shapeMask.mSpheres & ~(cullMask & simd4i(collisionMask));
+
+ // push-out direction: radial component of the particle position
+ Simd4f deltaX = curX - base * curAxisX;
+ Simd4f deltaY = curY - base * curAxisY;
+ Simd4f deltaZ = curZ - base * curAxisZ;
+
+ Simd4f sqrCosine = splat<0>(simd4f(curAuxiliary));
+ Simd4f scale = curRadius * invDistance * sqrCosine - sqrCosine;
+
+ // discrete contact only applies inside the cone section, not past the caps
+ contactMask = contactMask & ~simd4f(leftMask | rightMask);
+
+ if(!anyTrue(contactMask))
+ continue;
+
+ accum.add(deltaX, deltaY, deltaZ, scale, contactMask);
+
+ if(frictionEnabled)
+ {
+ uint32_t s0 = mClothData.mCapsuleIndices[coneIndex].first;
+ uint32_t s1 = mClothData.mCapsuleIndices[coneIndex].second;
+
+ float* prevSpheres = reinterpret_cast<float*>(mPrevData.mSpheres);
+ float* curSpheres = reinterpret_cast<float*>(mCurData.mSpheres);
+
+ // todo: could pre-compute sphere velocities or it might be
+ // faster to compute cur/prev sphere positions directly
+ Simd4f s0p0 = loadAligned(prevSpheres, s0 * sizeof(SphereData));
+ Simd4f s0p1 = loadAligned(curSpheres, s0 * sizeof(SphereData));
+
+ Simd4f s1p0 = loadAligned(prevSpheres, s1 * sizeof(SphereData));
+ Simd4f s1p1 = loadAligned(curSpheres, s1 * sizeof(SphereData));
+
+ Simd4f v0 = s0p1 - s0p0;
+ Simd4f v1 = s1p1 - s1p0;
+ Simd4f vd = v1 - v0;
+
+ // dot is in the range -1 to 1, scale and bias to 0 to 1
+ curDot = curDot * gSimd4fHalf + gSimd4fHalf;
+
+ // interpolate velocity at contact points
+ Simd4f vx = splat<0>(v0) + curDot * splat<0>(vd);
+ Simd4f vy = splat<1>(v0) + curDot * splat<1>(vd);
+ Simd4f vz = splat<2>(v0) + curDot * splat<2>(vd);
+
+ accum.addVelocity(vx, vy, vz, contactMask);
+ }
+ }
+
+ return shapeMask.mSpheres;
+}
+
+namespace
+{
+
+// Computes a friction impulse from the accumulated collision delta
+// (which defines the contact normal direction) and the collision-shape
+// velocity at the contact. The result is written to impulse[0..2];
+// all other arguments are read-only. 'scale' is the reciprocal of the
+// collision count so that per-contact quantities are properly averaged,
+// and 'mask' zeroes the impulse for lanes without a contact.
+template <typename Simd4f>
+PX_INLINE void calculateFrictionImpulse(const Simd4f& deltaX, const Simd4f& deltaY, const Simd4f& deltaZ,
+ const Simd4f& velX, const Simd4f& velY, const Simd4f& velZ,
+ const Simd4f* curPos, const Simd4f* prevPos, const Simd4f& scale,
+ const Simd4f& coefficient, const Simd4f& mask, Simd4f* impulse)
+{
+ // relative velocity of the particle w.r.t. the shape, scaled by collision count
+ Simd4f relVelX = curPos[0] - prevPos[0] - velX * scale;
+ Simd4f relVelY = curPos[1] - prevPos[1] - velY * scale;
+ Simd4f relVelZ = curPos[2] - prevPos[2] - velZ * scale;
+
+ // contact normal from the accumulated collision delta
+ Simd4f lenSq = deltaX * deltaX + deltaY * deltaY + deltaZ * deltaZ;
+ Simd4f invLen = rsqrt(lenSq + gSimd4fEpsilon);
+ Simd4f normalX = deltaX * invLen;
+ Simd4f normalY = deltaY * invLen;
+ Simd4f normalZ = deltaZ * invLen;
+
+ // normal component of the relative velocity
+ Simd4f normalVel = relVelX * normalX + relVelY * normalY + relVelZ * normalZ;
+
+ // tangential component of the relative velocity
+ Simd4f tangentX = relVelX - normalVel * normalX;
+ Simd4f tangentY = relVelY - normalVel * normalY;
+ Simd4f tangentZ = relVelZ - normalVel * normalZ;
+
+ // reciprocal tangential speed (epsilon keeps it finite at rest)
+ Simd4f invTangentLen = rsqrt(tangentX * tangentX + tangentY * tangentY + tangentZ * tangentZ + gSimd4fEpsilon);
+
+ // impulse magnitude, clamped so it can never exceed the tangential velocity
+ Simd4f magnitude = max(-coefficient * lenSq * invLen * invTangentLen, gSimd4fMinusOne) & mask;
+
+ impulse[0] = tangentX * magnitude;
+ impulse[1] = tangentY * magnitude;
+ impulse[2] = tangentZ * magnitude;
+}
+
+} // anonymous namespace
+
+// Discrete collision of the simulated particles against the capsule
+// shapes. Four particles are processed per iteration: positions are
+// transposed to SoA form, cone and sphere impulses are accumulated, and
+// the averaged displacement (plus optional friction and mass scaling)
+// is written back to mCurParticles / mPrevParticles.
+template <typename Simd4f>
+void cloth::SwCollision<Simd4f>::collideParticles()
+{
+ const bool massScalingEnabled = mClothData.mCollisionMassScale > 0.0f;
+ const Simd4f massScale = simd4f(mClothData.mCollisionMassScale);
+
+ const bool frictionEnabled = mClothData.mFrictionScale > 0.0f;
+ const Simd4f frictionScale = simd4f(mClothData.mFrictionScale);
+
+ Simd4f curPos[4];
+ Simd4f prevPos[4];
+
+ float* __restrict prevIt = mClothData.mPrevParticles;
+ float* __restrict pIt = mClothData.mCurParticles;
+ float* __restrict pEnd = pIt + mClothData.mNumParticles * 4;
+ for(; pIt < pEnd; pIt += 16, prevIt += 16)
+ {
+ // load four xyzw particles and transpose to SoA (x,y,z,w registers)
+ curPos[0] = loadAligned(pIt, 0);
+ curPos[1] = loadAligned(pIt, 16);
+ curPos[2] = loadAligned(pIt, 32);
+ curPos[3] = loadAligned(pIt, 48);
+ transpose(curPos[0], curPos[1], curPos[2], curPos[3]);
+
+ // cones first; the returned mask culls redundant sphere-cap tests
+ ImpulseAccumulator accum;
+ Simd4i sphereMask = collideCones(curPos, accum);
+ collideSpheres(sphereMask, curPos, accum);
+
+ Simd4f mask;
+ if(!anyGreater(accum.mNumCollisions, gSimd4fEpsilon, mask))
+ continue;
+
+ Simd4f invNumCollisions = recip(accum.mNumCollisions);
+
+ if(frictionEnabled)
+ {
+ prevPos[0] = loadAligned(prevIt, 0);
+ prevPos[1] = loadAligned(prevIt, 16);
+ prevPos[2] = loadAligned(prevIt, 32);
+ prevPos[3] = loadAligned(prevIt, 48);
+ transpose(prevPos[0], prevPos[1], prevPos[2], prevPos[3]);
+
+ Simd4f frictionImpulse[3];
+ calculateFrictionImpulse(accum.mDeltaX, accum.mDeltaY, accum.mDeltaZ, accum.mVelX, accum.mVelY, accum.mVelZ,
+ curPos, prevPos, invNumCollisions, frictionScale, mask, frictionImpulse);
+
+ // friction is applied by shifting the previous positions,
+ // which changes the implied particle velocity
+ prevPos[0] = prevPos[0] - frictionImpulse[0];
+ prevPos[1] = prevPos[1] - frictionImpulse[1];
+ prevPos[2] = prevPos[2] - frictionImpulse[2];
+
+ transpose(prevPos[0], prevPos[1], prevPos[2], prevPos[3]);
+ storeAligned(prevIt, 0, prevPos[0]);
+ storeAligned(prevIt, 16, prevPos[1]);
+ storeAligned(prevIt, 32, prevPos[2]);
+ storeAligned(prevIt, 48, prevPos[3]);
+ }
+
+ if(massScalingEnabled)
+ {
+ // calculate the inverse mass scale based on the collision impulse magnitude
+ Simd4f dSq = invNumCollisions * invNumCollisions *
+ (accum.mDeltaX * accum.mDeltaX + accum.mDeltaY * accum.mDeltaY + accum.mDeltaZ * accum.mDeltaZ);
+
+ Simd4f scale = recip(gSimd4fOne + massScale * dSq);
+
+ // scale invmass (stored in the w component) only for colliding lanes
+ curPos[3] = select(mask, curPos[3] * scale, curPos[3]);
+ }
+
+ // apply the displacement averaged over the number of collisions
+ curPos[0] = curPos[0] + accum.mDeltaX * invNumCollisions;
+ curPos[1] = curPos[1] + accum.mDeltaY * invNumCollisions;
+ curPos[2] = curPos[2] + accum.mDeltaZ * invNumCollisions;
+
+ // transpose back to AoS and store
+ transpose(curPos[0], curPos[1], curPos[2], curPos[3]);
+ storeAligned(pIt, 0, curPos[0]);
+ storeAligned(pIt, 16, curPos[1]);
+ storeAligned(pIt, 32, curPos[2]);
+ storeAligned(pIt, 48, curPos[3]);
+
+#if PX_PROFILE || PX_DEBUG
+ mNumCollisions += horizontalSum(accum.mNumCollisions);
+#endif
+ }
+}
+
+// Discrete collision for virtual particles. Each virtual particle is a
+// weighted combination of three mesh particles; the weight vector's w
+// component stores 1/dot(w,w) (see "scale weights by 1/dot(w,w)" below).
+// Four virtual particles are handled per iteration: each consumes four
+// uint16 indices (three particle indices + one weight index), 16 total.
+// Collision displacement is distributed back onto the source particles
+// in proportion to their weights.
+template <typename Simd4f>
+void cloth::SwCollision<Simd4f>::collideVirtualParticles()
+{
+ const bool massScalingEnabled = mClothData.mCollisionMassScale > 0.0f;
+ const Simd4f massScale = simd4f(mClothData.mCollisionMassScale);
+
+ const bool frictionEnabled = mClothData.mFrictionScale > 0.0f;
+ const Simd4f frictionScale = simd4f(mClothData.mFrictionScale);
+
+ Simd4f curPos[3];
+
+ const float* __restrict weights = mClothData.mVirtualParticleWeights;
+ float* __restrict particles = mClothData.mCurParticles;
+ float* __restrict prevParticles = mClothData.mPrevParticles;
+
+ // move dummy particles outside of collision range
+ // (the padding slot at index mNumParticles is set to the position that
+ // maps outside the collision grid, so indices referencing it never hit)
+ Simd4f* __restrict dummy = mClothData.mNumParticles + reinterpret_cast<Simd4f*>(mClothData.mCurParticles);
+ Simd4f invGridScale = recip(mGridScale) & (mGridScale > gSimd4fEpsilon);
+ dummy[0] = dummy[1] = dummy[2] = invGridScale * mGridBias - invGridScale;
+
+ const uint16_t* __restrict vpIt = mClothData.mVirtualParticlesBegin;
+ const uint16_t* __restrict vpEnd = mClothData.mVirtualParticlesEnd;
+ for(; vpIt != vpEnd; vpIt += 16)
+ {
+ // load 12 particles and 4 weights
+ Simd4f p0v0 = loadAligned(particles, vpIt[0] * sizeof(PxVec4));
+ Simd4f p0v1 = loadAligned(particles, vpIt[1] * sizeof(PxVec4));
+ Simd4f p0v2 = loadAligned(particles, vpIt[2] * sizeof(PxVec4));
+ Simd4f w0 = loadAligned(weights, vpIt[3] * sizeof(PxVec4));
+
+ Simd4f p1v0 = loadAligned(particles, vpIt[4] * sizeof(PxVec4));
+ Simd4f p1v1 = loadAligned(particles, vpIt[5] * sizeof(PxVec4));
+ Simd4f p1v2 = loadAligned(particles, vpIt[6] * sizeof(PxVec4));
+ Simd4f w1 = loadAligned(weights, vpIt[7] * sizeof(PxVec4));
+
+ Simd4f p2v0 = loadAligned(particles, vpIt[8] * sizeof(PxVec4));
+ Simd4f p2v1 = loadAligned(particles, vpIt[9] * sizeof(PxVec4));
+ Simd4f p2v2 = loadAligned(particles, vpIt[10] * sizeof(PxVec4));
+ Simd4f w2 = loadAligned(weights, vpIt[11] * sizeof(PxVec4));
+
+ Simd4f p3v1 = loadAligned(particles, vpIt[13] * sizeof(PxVec4));
+ Simd4f p3v0 = loadAligned(particles, vpIt[12] * sizeof(PxVec4));
+ Simd4f p3v2 = loadAligned(particles, vpIt[14] * sizeof(PxVec4));
+ Simd4f w3 = loadAligned(weights, vpIt[15] * sizeof(PxVec4));
+
+ // interpolate particles and transpose
+ Simd4f px = p0v0 * splat<0>(w0) + p0v1 * splat<1>(w0) + p0v2 * splat<2>(w0);
+ Simd4f py = p1v0 * splat<0>(w1) + p1v1 * splat<1>(w1) + p1v2 * splat<2>(w1);
+ Simd4f pz = p2v0 * splat<0>(w2) + p2v1 * splat<1>(w2) + p2v2 * splat<2>(w2);
+ Simd4f pw = p3v0 * splat<0>(w3) + p3v1 * splat<1>(w3) + p3v2 * splat<2>(w3);
+ transpose(px, py, pz, pw);
+
+ curPos[0] = px;
+ curPos[1] = py;
+ curPos[2] = pz;
+
+ // collide the four interpolated positions like regular particles
+ ImpulseAccumulator accum;
+ Simd4i sphereMask = collideCones(curPos, accum);
+ collideSpheres(sphereMask, curPos, accum);
+
+ Simd4f mask;
+ if(!anyGreater(accum.mNumCollisions, gSimd4fEpsilon, mask))
+ continue;
+
+ Simd4f invNumCollisions = recip(accum.mNumCollisions);
+
+ // displacement and transpose back
+ Simd4f d0 = accum.mDeltaX * invNumCollisions;
+ Simd4f d1 = accum.mDeltaY * invNumCollisions;
+ Simd4f d2 = accum.mDeltaZ * invNumCollisions;
+ Simd4f d3 = gSimd4fZero;
+ transpose(d0, d1, d2, d3);
+
+ // scale weights by 1/dot(w,w)
+ Simd4f rw0 = w0 * splat<3>(w0);
+ Simd4f rw1 = w1 * splat<3>(w1);
+ Simd4f rw2 = w2 * splat<3>(w2);
+ Simd4f rw3 = w3 * splat<3>(w3);
+
+ if(frictionEnabled)
+ {
+ Simd4f q0v0 = loadAligned(prevParticles, vpIt[0] * sizeof(PxVec4));
+ Simd4f q0v1 = loadAligned(prevParticles, vpIt[1] * sizeof(PxVec4));
+ Simd4f q0v2 = loadAligned(prevParticles, vpIt[2] * sizeof(PxVec4));
+
+ Simd4f q1v0 = loadAligned(prevParticles, vpIt[4] * sizeof(PxVec4));
+ Simd4f q1v1 = loadAligned(prevParticles, vpIt[5] * sizeof(PxVec4));
+ Simd4f q1v2 = loadAligned(prevParticles, vpIt[6] * sizeof(PxVec4));
+
+ Simd4f q2v0 = loadAligned(prevParticles, vpIt[8] * sizeof(PxVec4));
+ Simd4f q2v1 = loadAligned(prevParticles, vpIt[9] * sizeof(PxVec4));
+ Simd4f q2v2 = loadAligned(prevParticles, vpIt[10] * sizeof(PxVec4));
+
+ Simd4f q3v0 = loadAligned(prevParticles, vpIt[12] * sizeof(PxVec4));
+ Simd4f q3v1 = loadAligned(prevParticles, vpIt[13] * sizeof(PxVec4));
+ Simd4f q3v2 = loadAligned(prevParticles, vpIt[14] * sizeof(PxVec4));
+
+ // calculate previous interpolated positions
+ Simd4f qx = q0v0 * splat<0>(w0) + q0v1 * splat<1>(w0) + q0v2 * splat<2>(w0);
+ Simd4f qy = q1v0 * splat<0>(w1) + q1v1 * splat<1>(w1) + q1v2 * splat<2>(w1);
+ Simd4f qz = q2v0 * splat<0>(w2) + q2v1 * splat<1>(w2) + q2v2 * splat<2>(w2);
+ Simd4f qw = q3v0 * splat<0>(w3) + q3v1 * splat<1>(w3) + q3v2 * splat<2>(w3);
+ transpose(qx, qy, qz, qw);
+
+ Simd4f prevPos[3] = { qx, qy, qz };
+ Simd4f frictionImpulse[4];
+ frictionImpulse[3] = gSimd4fZero;
+
+ calculateFrictionImpulse(accum.mDeltaX, accum.mDeltaY, accum.mDeltaZ, accum.mVelX, accum.mVelY, accum.mVelZ,
+ curPos, prevPos, invNumCollisions, frictionScale, mask, frictionImpulse);
+
+ transpose(frictionImpulse[0], frictionImpulse[1], frictionImpulse[2], frictionImpulse[3]);
+
+ // distribute the friction impulse onto the three source particles
+ // of each virtual particle, weighted by the normalized weights
+ q0v0 = q0v0 - (splat<0>(rw0) * frictionImpulse[0]);
+ q0v1 = q0v1 - (splat<1>(rw0) * frictionImpulse[0]);
+ q0v2 = q0v2 - (splat<2>(rw0) * frictionImpulse[0]);
+
+ q1v0 = q1v0 - (splat<0>(rw1) * frictionImpulse[1]);
+ q1v1 = q1v1 - (splat<1>(rw1) * frictionImpulse[1]);
+ q1v2 = q1v2 - (splat<2>(rw1) * frictionImpulse[1]);
+
+ q2v0 = q2v0 - (splat<0>(rw2) * frictionImpulse[2]);
+ q2v1 = q2v1 - (splat<1>(rw2) * frictionImpulse[2]);
+ q2v2 = q2v2 - (splat<2>(rw2) * frictionImpulse[2]);
+
+ q3v0 = q3v0 - (splat<0>(rw3) * frictionImpulse[3]);
+ q3v1 = q3v1 - (splat<1>(rw3) * frictionImpulse[3]);
+ q3v2 = q3v2 - (splat<2>(rw3) * frictionImpulse[3]);
+
+ // write back prev particles
+ storeAligned(prevParticles, vpIt[0] * sizeof(PxVec4), q0v0);
+ storeAligned(prevParticles, vpIt[1] * sizeof(PxVec4), q0v1);
+ storeAligned(prevParticles, vpIt[2] * sizeof(PxVec4), q0v2);
+
+ storeAligned(prevParticles, vpIt[4] * sizeof(PxVec4), q1v0);
+ storeAligned(prevParticles, vpIt[5] * sizeof(PxVec4), q1v1);
+ storeAligned(prevParticles, vpIt[6] * sizeof(PxVec4), q1v2);
+
+ storeAligned(prevParticles, vpIt[8] * sizeof(PxVec4), q2v0);
+ storeAligned(prevParticles, vpIt[9] * sizeof(PxVec4), q2v1);
+ storeAligned(prevParticles, vpIt[10] * sizeof(PxVec4), q2v2);
+
+ storeAligned(prevParticles, vpIt[12] * sizeof(PxVec4), q3v0);
+ storeAligned(prevParticles, vpIt[13] * sizeof(PxVec4), q3v1);
+ storeAligned(prevParticles, vpIt[14] * sizeof(PxVec4), q3v2);
+ }
+
+ if(massScalingEnabled)
+ {
+ // calculate the inverse mass scale based on the collision impulse
+ Simd4f dSq = invNumCollisions * invNumCollisions *
+ (accum.mDeltaX * accum.mDeltaX + accum.mDeltaY * accum.mDeltaY + accum.mDeltaZ * accum.mDeltaZ);
+
+ Simd4f weightScale = recip(gSimd4fOne + massScale * dSq);
+
+ // per-source-particle scale factors, active only for colliding lanes
+ weightScale = weightScale - gSimd4fOne;
+ Simd4f s0 = gSimd4fOne + splat<0>(weightScale) * (w0 & splat<0>(mask));
+ Simd4f s1 = gSimd4fOne + splat<1>(weightScale) * (w1 & splat<1>(mask));
+ Simd4f s2 = gSimd4fOne + splat<2>(weightScale) * (w2 & splat<2>(mask));
+ Simd4f s3 = gSimd4fOne + splat<3>(weightScale) * (w3 & splat<3>(mask));
+
+ // scale only the w (inv mass) component of each source particle
+ p0v0 = p0v0 * (gSimd4fOneXYZ | (splat<0>(s0) & sMaskW));
+ p0v1 = p0v1 * (gSimd4fOneXYZ | (splat<1>(s0) & sMaskW));
+ p0v2 = p0v2 * (gSimd4fOneXYZ | (splat<2>(s0) & sMaskW));
+
+ p1v0 = p1v0 * (gSimd4fOneXYZ | (splat<0>(s1) & sMaskW));
+ p1v1 = p1v1 * (gSimd4fOneXYZ | (splat<1>(s1) & sMaskW));
+ p1v2 = p1v2 * (gSimd4fOneXYZ | (splat<2>(s1) & sMaskW));
+
+ p2v0 = p2v0 * (gSimd4fOneXYZ | (splat<0>(s2) & sMaskW));
+ p2v1 = p2v1 * (gSimd4fOneXYZ | (splat<1>(s2) & sMaskW));
+ p2v2 = p2v2 * (gSimd4fOneXYZ | (splat<2>(s2) & sMaskW));
+
+ p3v0 = p3v0 * (gSimd4fOneXYZ | (splat<0>(s3) & sMaskW));
+ p3v1 = p3v1 * (gSimd4fOneXYZ | (splat<1>(s3) & sMaskW));
+ p3v2 = p3v2 * (gSimd4fOneXYZ | (splat<2>(s3) & sMaskW));
+ }
+
+ // distribute the collision displacement onto the source particles
+ p0v0 = p0v0 + (splat<0>(rw0) * d0);
+ p0v1 = p0v1 + (splat<1>(rw0) * d0);
+ p0v2 = p0v2 + (splat<2>(rw0) * d0);
+
+ p1v0 = p1v0 + (splat<0>(rw1) * d1);
+ p1v1 = p1v1 + (splat<1>(rw1) * d1);
+ p1v2 = p1v2 + (splat<2>(rw1) * d1);
+
+ p2v0 = p2v0 + (splat<0>(rw2) * d2);
+ p2v1 = p2v1 + (splat<1>(rw2) * d2);
+ p2v2 = p2v2 + (splat<2>(rw2) * d2);
+
+ p3v0 = p3v0 + (splat<0>(rw3) * d3);
+ p3v1 = p3v1 + (splat<1>(rw3) * d3);
+ p3v2 = p3v2 + (splat<2>(rw3) * d3);
+
+ // write back particles
+ storeAligned(particles, vpIt[0] * sizeof(PxVec4), p0v0);
+ storeAligned(particles, vpIt[1] * sizeof(PxVec4), p0v1);
+ storeAligned(particles, vpIt[2] * sizeof(PxVec4), p0v2);
+
+ storeAligned(particles, vpIt[4] * sizeof(PxVec4), p1v0);
+ storeAligned(particles, vpIt[5] * sizeof(PxVec4), p1v1);
+ storeAligned(particles, vpIt[6] * sizeof(PxVec4), p1v2);
+
+ storeAligned(particles, vpIt[8] * sizeof(PxVec4), p2v0);
+ storeAligned(particles, vpIt[9] * sizeof(PxVec4), p2v1);
+ storeAligned(particles, vpIt[10] * sizeof(PxVec4), p2v2);
+
+ storeAligned(particles, vpIt[12] * sizeof(PxVec4), p3v0);
+ storeAligned(particles, vpIt[13] * sizeof(PxVec4), p3v1);
+ storeAligned(particles, vpIt[14] * sizeof(PxVec4), p3v2);
+
+#if PX_PROFILE || PX_DEBUG
+ mNumCollisions += horizontalSum(accum.mNumCollisions);
+#endif
+ }
+}
+
+// Continuous (swept) collision of the simulated particles against the
+// capsule shapes, using previous and current particle positions.
+template <typename Simd4f>
+void cloth::SwCollision<Simd4f>::collideContinuousParticles()
+{
+ Simd4f curPos[4];
+ Simd4f prevPos[4];
+
+ const bool massScalingEnabled = mClothData.mCollisionMassScale > 0.0f;
+ const Simd4f massScale = simd4f(mClothData.mCollisionMassScale);
+ const bool frictionEnabled = mClothData.mFrictionScale > 0.0f;
+ const Simd4f frictionScale = simd4f(mClothData.mFrictionScale);
+
+ float* __restrict prevPtr = mClothData.mPrevParticles;
+ float* __restrict curPtr = mClothData.mCurParticles;
+ float* __restrict stop = curPtr + mClothData.mNumParticles * 4;
+
+ for(; curPtr < stop; curPtr += 16, prevPtr += 16)
+ {
+ // gather four particles from both position buffers into SoA form
+ prevPos[0] = loadAligned(prevPtr, 0);
+ prevPos[1] = loadAligned(prevPtr, 16);
+ prevPos[2] = loadAligned(prevPtr, 32);
+ prevPos[3] = loadAligned(prevPtr, 48);
+ transpose(prevPos[0], prevPos[1], prevPos[2], prevPos[3]);
+
+ curPos[0] = loadAligned(curPtr, 0);
+ curPos[1] = loadAligned(curPtr, 16);
+ curPos[2] = loadAligned(curPtr, 32);
+ curPos[3] = loadAligned(curPtr, 48);
+ transpose(curPos[0], curPos[1], curPos[2], curPos[3]);
+
+ // cones first; the returned mask culls redundant sphere-cap tests
+ ImpulseAccumulator accum;
+ Simd4i sphereMask = collideCones(prevPos, curPos, accum);
+ collideSpheres(sphereMask, prevPos, curPos, accum);
+
+ Simd4f contactMask;
+ if(!anyGreater(accum.mNumCollisions, gSimd4fEpsilon, contactMask))
+ continue;
+
+ Simd4f rcpNumCollisions = recip(accum.mNumCollisions);
+
+ if(frictionEnabled)
+ {
+ // friction is applied by shifting the previous positions,
+ // which changes the implied particle velocity
+ Simd4f impulse[3];
+ calculateFrictionImpulse(accum.mDeltaX, accum.mDeltaY, accum.mDeltaZ, accum.mVelX, accum.mVelY, accum.mVelZ,
+ curPos, prevPos, rcpNumCollisions, frictionScale, contactMask, impulse);
+
+ prevPos[0] = prevPos[0] - impulse[0];
+ prevPos[1] = prevPos[1] - impulse[1];
+ prevPos[2] = prevPos[2] - impulse[2];
+
+ transpose(prevPos[0], prevPos[1], prevPos[2], prevPos[3]);
+ storeAligned(prevPtr, 0, prevPos[0]);
+ storeAligned(prevPtr, 16, prevPos[1]);
+ storeAligned(prevPtr, 32, prevPos[2]);
+ storeAligned(prevPtr, 48, prevPos[3]);
+ }
+
+ if(massScalingEnabled)
+ {
+ // attenuate the inverse mass (w component) by the squared
+ // magnitude of the averaged collision impulse
+ Simd4f deltaSq = rcpNumCollisions * rcpNumCollisions *
+ (accum.mDeltaX * accum.mDeltaX + accum.mDeltaY * accum.mDeltaY + accum.mDeltaZ * accum.mDeltaZ);
+
+ Simd4f invMassScale = recip(gSimd4fOne + massScale * deltaSq);
+ curPos[3] = select(contactMask, curPos[3] * invMassScale, curPos[3]);
+ }
+
+ // apply the averaged displacement, then scatter back to AoS layout
+ curPos[0] = curPos[0] + accum.mDeltaX * rcpNumCollisions;
+ curPos[1] = curPos[1] + accum.mDeltaY * rcpNumCollisions;
+ curPos[2] = curPos[2] + accum.mDeltaZ * rcpNumCollisions;
+
+ transpose(curPos[0], curPos[1], curPos[2], curPos[3]);
+ storeAligned(curPtr, 0, curPos[0]);
+ storeAligned(curPtr, 16, curPos[1]);
+ storeAligned(curPtr, 32, curPos[2]);
+ storeAligned(curPtr, 48, curPos[3]);
+
+#if PX_PROFILE || PX_DEBUG
+ mNumCollisions += horizontalSum(accum.mNumCollisions);
+#endif
+ }
+}
+
+// Collides all particles against the convex shapes (intersections of
+// half-spaces). Plane equations are interpolated toward the targets
+// unless this is the last iteration. The scratch allocation holds the
+// plane equations plus an equally sized area used by the inner routine
+// to cache per-particle plane distances (hence the factor of 2).
+template <typename Simd4f>
+void cloth::SwCollision<Simd4f>::collideConvexes(const IterationState<Simd4f>& state)
+{
+ if(!mClothData.mNumConvexes)
+ return;
+
+ // times 2 for plane equation result buffer
+ Simd4f* planes = static_cast<Simd4f*>(mAllocator.allocate(sizeof(Simd4f) * mClothData.mNumPlanes * 2));
+
+ const Simd4f* targetPlanes = reinterpret_cast<const Simd4f*>(mClothData.mTargetCollisionPlanes);
+
+ // generate plane collision data
+ if(state.mRemainingIterations != 1)
+ {
+ // interpolate planes
+ LerpIterator<Simd4f, const Simd4f*> planeIter(reinterpret_cast<const Simd4f*>(mClothData.mStartCollisionPlanes),
+ targetPlanes, state.getCurrentAlpha());
+
+ // todo: normalize plane equations
+ generatePlanes(planes, planeIter, mClothData.mNumPlanes);
+ }
+ else
+ {
+ // otherwise use the target planes directly
+ generatePlanes(planes, targetPlanes, mClothData.mNumPlanes);
+ }
+
+ Simd4f curPos[4], prevPos[4];
+
+ const bool frictionEnabled = mClothData.mFrictionScale > 0.0f;
+ const Simd4f frictionScale = simd4f(mClothData.mFrictionScale);
+
+ float* __restrict curIt = mClothData.mCurParticles;
+ float* __restrict curEnd = curIt + mClothData.mNumParticles * 4;
+ float* __restrict prevIt = mClothData.mPrevParticles;
+ for(; curIt < curEnd; curIt += 16, prevIt += 16)
+ {
+ // load four particles and transpose to SoA layout
+ curPos[0] = loadAligned(curIt, 0);
+ curPos[1] = loadAligned(curIt, 16);
+ curPos[2] = loadAligned(curIt, 32);
+ curPos[3] = loadAligned(curIt, 48);
+ transpose(curPos[0], curPos[1], curPos[2], curPos[3]);
+
+ ImpulseAccumulator accum;
+ collideConvexes(planes, curPos, accum);
+
+ Simd4f mask;
+ if(!anyGreater(accum.mNumCollisions, gSimd4fEpsilon, mask))
+ continue;
+
+ Simd4f invNumCollisions = recip(accum.mNumCollisions);
+
+ if(frictionEnabled)
+ {
+ prevPos[0] = loadAligned(prevIt, 0);
+ prevPos[1] = loadAligned(prevIt, 16);
+ prevPos[2] = loadAligned(prevIt, 32);
+ prevPos[3] = loadAligned(prevIt, 48);
+ transpose(prevPos[0], prevPos[1], prevPos[2], prevPos[3]);
+
+ Simd4f frictionImpulse[3];
+ calculateFrictionImpulse(accum.mDeltaX, accum.mDeltaY, accum.mDeltaZ, accum.mVelX, accum.mVelY, accum.mVelZ,
+ curPos, prevPos, invNumCollisions, frictionScale, mask, frictionImpulse);
+
+ // friction is applied by shifting the previous positions,
+ // which changes the implied particle velocity
+ prevPos[0] = prevPos[0] - frictionImpulse[0];
+ prevPos[1] = prevPos[1] - frictionImpulse[1];
+ prevPos[2] = prevPos[2] - frictionImpulse[2];
+
+ transpose(prevPos[0], prevPos[1], prevPos[2], prevPos[3]);
+ storeAligned(prevIt, 0, prevPos[0]);
+ storeAligned(prevIt, 16, prevPos[1]);
+ storeAligned(prevIt, 32, prevPos[2]);
+ storeAligned(prevIt, 48, prevPos[3]);
+ }
+
+ // apply the averaged displacement and store back
+ curPos[0] = curPos[0] + accum.mDeltaX * invNumCollisions;
+ curPos[1] = curPos[1] + accum.mDeltaY * invNumCollisions;
+ curPos[2] = curPos[2] + accum.mDeltaZ * invNumCollisions;
+
+ transpose(curPos[0], curPos[1], curPos[2], curPos[3]);
+ storeAligned(curIt, 0, curPos[0]);
+ storeAligned(curIt, 16, curPos[1]);
+ storeAligned(curIt, 32, curPos[2]);
+ storeAligned(curIt, 48, curPos[3]);
+
+#if PX_PROFILE || PX_DEBUG
+ mNumCollisions += horizontalSum(accum.mNumCollisions);
+#endif
+ }
+
+ mAllocator.deallocate(planes);
+}
+
+// Discrete collision of four particles (SoA in curPos) against the convex
+// shapes. 'planes' holds mNumPlanes plane equations; the second half of
+// the allocation (starting at pEnd) caches each particle's signed
+// distance to each plane.
+template <typename Simd4f>
+void cloth::SwCollision<Simd4f>::collideConvexes(const Simd4f* __restrict planes, Simd4f* __restrict curPos,
+ ImpulseAccumulator& accum)
+{
+ Simd4i result = gSimd4iZero;
+ Simd4i mask4 = gSimd4iOne;
+
+ // evaluate all plane equations; bit i of each lane of 'result' is set
+ // if that particle lies behind plane i (negative signed distance)
+ const Simd4f* __restrict pIt, *pEnd = planes + mClothData.mNumPlanes;
+ Simd4f* __restrict dIt = const_cast<Simd4f*>(pEnd);
+ for(pIt = planes; pIt != pEnd; ++pIt, ++dIt)
+ {
+ *dIt = splat<3>(*pIt) + curPos[2] * splat<2>(*pIt) + curPos[1] * splat<1>(*pIt) + curPos[0] * splat<0>(*pIt);
+ result = result | (mask4 & simd4i(*dIt < gSimd4fZero));
+ mask4 = mask4 << 1; // todo: shift by Simd4i on consoles
+ }
+
+ // no particle behind any plane -> nothing can be inside a convex
+ if(allEqual(result, gSimd4iZero))
+ return;
+
+ const uint32_t* __restrict cIt = mClothData.mConvexMasks;
+ const uint32_t* __restrict cEnd = cIt + mClothData.mNumConvexes;
+ for(; cIt != cEnd; ++cIt)
+ {
+ // a particle is inside the convex iff it is behind every one of the
+ // convex's planes; anyEqual also yields the per-lane equality mask
+ uint32_t mask = *cIt;
+ mask4 = simd4i(int(mask));
+ if(!anyEqual(mask4 & result, mask4, mask4))
+ continue;
+
+ // among the convex's planes, keep the one with the largest signed
+ // distance (shallowest penetration) as the contact plane
+ uint32_t test = mask - 1;
+ uint32_t planeIndex = findBitSet(mask & ~test);
+ Simd4f plane = planes[planeIndex];
+ Simd4f planeX = splat<0>(plane);
+ Simd4f planeY = splat<1>(plane);
+ Simd4f planeZ = splat<2>(plane);
+ Simd4f planeD = pEnd[planeIndex];
+ while(mask &= test) // note: assignment - iterates remaining set bits
+ {
+ test = mask - 1;
+ planeIndex = findBitSet(mask & ~test);
+ plane = planes[planeIndex];
+ Simd4f dist = pEnd[planeIndex];
+ Simd4f closer = dist > planeD;
+ planeX = select(closer, splat<0>(plane), planeX);
+ planeY = select(closer, splat<1>(plane), planeY);
+ planeZ = select(closer, splat<2>(plane), planeZ);
+ planeD = max(dist, planeD);
+ }
+
+ accum.subtract(planeX, planeY, planeZ, planeD, simd4f(mask4));
+ }
+}
+
+// Collides all particles against the collision triangles. Triangle data
+// is interpolated toward the targets unless this is the last iteration.
+template <typename Simd4f>
+void cloth::SwCollision<Simd4f>::collideTriangles(const IterationState<Simd4f>& state)
+{
+ if(!mClothData.mNumCollisionTriangles)
+ return;
+
+ // scratch buffer holding the pre-processed triangle data for this pass
+ TriangleData* triangleData =
+ static_cast<TriangleData*>(mAllocator.allocate(sizeof(TriangleData) * mClothData.mNumCollisionTriangles));
+
+ UnalignedIterator<Simd4f, 3> targetTriangles(mClothData.mTargetCollisionTriangles);
+
+ if(state.mRemainingIterations == 1)
+ {
+ // last iteration: use the target triangles directly
+ generateTriangles<Simd4f>(triangleData, targetTriangles, mClothData.mNumCollisionTriangles);
+ }
+ else
+ {
+ // interpolate between start and target triangles for this iteration
+ LerpIterator<Simd4f, UnalignedIterator<Simd4f, 3> > triangleIter(mClothData.mStartCollisionTriangles,
+ targetTriangles, state.getCurrentAlpha());
+
+ generateTriangles<Simd4f>(triangleData, triangleIter, mClothData.mNumCollisionTriangles);
+ }
+
+ Simd4f pos[4];
+
+ float* __restrict particle = mClothData.mCurParticles;
+ float* __restrict particleEnd = particle + mClothData.mNumParticles * 4;
+ for(; particle < particleEnd; particle += 16)
+ {
+ // load four particles and transpose to SoA layout
+ pos[0] = loadAligned(particle, 0);
+ pos[1] = loadAligned(particle, 16);
+ pos[2] = loadAligned(particle, 32);
+ pos[3] = loadAligned(particle, 48);
+ transpose(pos[0], pos[1], pos[2], pos[3]);
+
+ ImpulseAccumulator accum;
+ collideTriangles(triangleData, pos, accum);
+
+ Simd4f hitMask;
+ if(!anyGreater(accum.mNumCollisions, gSimd4fEpsilon, hitMask))
+ continue;
+
+ // apply the displacement averaged over the number of collisions
+ Simd4f rcpCollisions = recip(accum.mNumCollisions);
+ pos[0] = pos[0] + accum.mDeltaX * rcpCollisions;
+ pos[1] = pos[1] + accum.mDeltaY * rcpCollisions;
+ pos[2] = pos[2] + accum.mDeltaZ * rcpCollisions;
+
+ // transpose back to AoS and store
+ transpose(pos[0], pos[1], pos[2], pos[3]);
+ storeAligned(particle, 0, pos[0]);
+ storeAligned(particle, 16, pos[1]);
+ storeAligned(particle, 32, pos[2]);
+ storeAligned(particle, 48, pos[3]);
+
+#if PX_PROFILE || PX_DEBUG
+ mNumCollisions += horizontalSum(accum.mNumCollisions);
+#endif
+ }
+
+ mAllocator.deallocate(triangleData);
+}
+
+// Find, for each of four particles, the closest collision triangle and push
+// penetrating particles out along that triangle's normal.
+//
+// curPos is SoA: curPos[0]/[1]/[2] hold x/y/z of four particles (one per SIMD
+// lane). Impulses for lanes whose nearest-triangle plane distance is negative
+// are subtracted into 'accum'; other lanes are left untouched.
+template <typename Simd4f>
+void cloth::SwCollision<Simd4f>::collideTriangles(const TriangleData* __restrict triangles, Simd4f* __restrict curPos,
+ ImpulseAccumulator& accum)
+{
+ // per-lane normal and signed plane distance of the nearest triangle so far
+ Simd4f normalX, normalY, normalZ, normalD;
+ normalX = normalY = normalZ = normalD = gSimd4fZero;
+ Simd4f minSqrLength = gSimd4fFloatMax;
+
+ const TriangleData* __restrict tIt, *tEnd = triangles + mClothData.mNumCollisionTriangles;
+ for(tIt = triangles; tIt != tEnd; ++tIt)
+ {
+ Simd4f base = loadAligned(&tIt->base.x);
+ Simd4f edge0 = loadAligned(&tIt->edge0.x);
+ Simd4f edge1 = loadAligned(&tIt->edge1.x);
+ Simd4f normal = loadAligned(&tIt->normal.x);
+ Simd4f aux = loadAligned(&tIt->det);
+
+ // vector from the triangle base vertex to each particle
+ Simd4f dx = curPos[0] - splat<0>(base);
+ Simd4f dy = curPos[1] - splat<1>(base);
+ Simd4f dz = curPos[2] - splat<2>(base);
+
+ Simd4f e0x = splat<0>(edge0);
+ Simd4f e0y = splat<1>(edge0);
+ Simd4f e0z = splat<2>(edge0);
+
+ Simd4f e1x = splat<0>(edge1);
+ Simd4f e1y = splat<1>(edge1);
+ Simd4f e1z = splat<2>(edge1);
+
+ Simd4f nx = splat<0>(normal);
+ Simd4f ny = splat<1>(normal);
+ Simd4f nz = splat<2>(normal);
+
+ Simd4f deltaDotEdge0 = dx * e0x + dy * e0y + dz * e0z;
+ Simd4f deltaDotEdge1 = dx * e1x + dy * e1y + dz * e1z;
+ Simd4f deltaDotNormal = dx * nx + dy * ny + dz * nz;
+
+ // scalars precomputed by generateTriangles() and stashed in the w lanes
+ Simd4f edge0DotEdge1 = splat<3>(base);
+ Simd4f edge0SqrLength = splat<3>(edge0);
+ Simd4f edge1SqrLength = splat<3>(edge1);
+
+ // unnormalized barycentric coordinates of the in-plane projection
+ Simd4f s = edge1SqrLength * deltaDotEdge0 - edge0DotEdge1 * deltaDotEdge1;
+ Simd4f t = edge0SqrLength * deltaDotEdge1 - edge0DotEdge1 * deltaDotEdge0;
+
+ Simd4f sPositive = s > gSimd4fZero;
+ Simd4f tPositive = t > gSimd4fZero;
+
+ Simd4f det = splat<0>(aux);
+
+ // region selection: when one coordinate is negative the closest point lies
+ // on the opposing edge, so fall back to the single-edge projection.
+ // NOTE(review): aux lanes are assumed to hold reciprocal normalization
+ // factors computed in generateTriangles() — confirm against that function.
+ s = select(tPositive, s * det, deltaDotEdge0 * splat<2>(aux));
+ t = select(sPositive, t * det, deltaDotEdge1 * splat<3>(aux));
+
+ // s + t > 1: closest point lies on the edge connecting the two tips
+ Simd4f clamp = gSimd4fOne < s + t;
+ Simd4f numerator = edge1SqrLength - edge0DotEdge1 + deltaDotEdge0 - deltaDotEdge1;
+
+ s = select(clamp, numerator * splat<1>(aux), s);
+
+ // clamp (s, t) to the triangle: 0 <= s <= 1, 0 <= t <= 1 - s
+ s = max(gSimd4fZero, min(gSimd4fOne, s));
+ t = max(gSimd4fZero, min(gSimd4fOne - s, t));
+
+ // vector from the closest point on the triangle to the particle
+ dx = dx - e0x * s - e1x * t;
+ dy = dy - e0y * s - e1y * t;
+ dz = dz - e0z * s - e1z * t;
+
+ Simd4f sqrLength = dx * dx + dy * dy + dz * dz;
+
+ // slightly increase distance for colliding triangles (deltaDotNormal < 0)
+ // so that, on a near-tie, a triangle the particle is in front of wins
+ Simd4f slack = (gSimd4fZero > deltaDotNormal) & simd4f(1e-4f);
+ sqrLength = sqrLength + sqrLength * slack;
+
+ // lanes for which this triangle is the new nearest
+ Simd4f mask = sqrLength < minSqrLength;
+
+ normalX = select(mask, nx, normalX);
+ normalY = select(mask, ny, normalY);
+ normalZ = select(mask, nz, normalZ);
+ normalD = select(mask, deltaDotNormal, normalD);
+
+ minSqrLength = min(sqrLength, minSqrLength);
+ }
+
+ // push out only lanes that ended up behind their nearest triangle's plane
+ Simd4f mask;
+ if(!anyGreater(gSimd4fZero, normalD, mask))
+ return;
+
+ accum.subtract(normalX, normalY, normalZ, normalD, mask);
+}
+
+// explicit template instantiation for each available SIMD backend
+#if NV_SIMD_SIMD
+template class cloth::SwCollision<Simd4f>;
+#endif
+#if NV_SIMD_SCALAR
+template class cloth::SwCollision<Scalar4f>;
+#endif
+
+/*
+namespace
+{
+ using namespace cloth;
+
+ int test()
+ {
+ Simd4f vertices[] = {
+ simd4f(0.0f, 0.0f, 0.0f, 0.0f),
+ simd4f(0.1f, 0.0f, 0.0f, 0.0f),
+ simd4f(0.0f, 0.1f, 0.0f, 0.0f)
+ };
+ TriangleData triangle;
+ generateTriangles<Simd4f>(&triangle, &*vertices, 1);
+
+ char buffer[1000];
+ SwKernelAllocator alloc(buffer, 1000);
+
+ SwClothData* cloth = static_cast<SwClothData*>(malloc(sizeof(SwClothData)));
+ memset(cloth, 0, sizeof(SwClothData));
+ cloth->mNumTriangles = 1;
+
+ SwCollision<Simd4f> collision(*cloth, alloc);
+ SwCollision<Simd4f>::ImpulseAccumulator accum;
+
+ Simd4f particles[4] = {};
+ for(float y=-0.1f; y < 0.0f; y += 0.2f)
+ {
+ for(float x=-0.1f; x < 0.0f; x += 0.2f)
+ {
+ particles[0] = simd4f(x);
+ particles[1] = simd4f(y);
+ particles[2] = simd4f(-1.0f);
+
+ collision.collideTriangles(&triangle, particles, accum);
+ }
+ }
+
+ return 0;
+ }
+
+ static int blah = test();
+}
+*/
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/SwCollision.h b/PhysX_3.4/Source/LowLevelCloth/src/SwCollision.h
new file mode 100644
index 00000000..bda3a57b
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/SwCollision.h
@@ -0,0 +1,138 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#include "Types.h"
+#include "StackAllocator.h"
+#include "Simd.h"
+
+namespace physx
+{
+namespace cloth
+{
+
+class SwCloth;
+struct SwClothData;
+template <typename>
+struct IterationState;
+struct IndexPair;
+struct SphereData;
+struct ConeData;
+struct TriangleData;
+
+typedef StackAllocator<16> SwKernelAllocator;
+
+/**
+ Collision handler for SwSolver: collides cloth particles against spheres,
+ capsules (cone segments), convexes and triangle meshes on the CPU.
+ */
+template <typename Simd4f>
+class SwCollision
+{
+ typedef typename Simd4fToSimd4i<Simd4f>::Type Simd4i;
+
+ public:
+ // cone/sphere selection masks produced by getShapeMask()
+ struct ShapeMask
+ {
+ Simd4i mCones;
+ Simd4i mSpheres;
+
+ ShapeMask& operator=(const ShapeMask&);
+ ShapeMask& operator&=(const ShapeMask&);
+ };
+
+ // sphere/cone shape arrays managed through allocate()/deallocate()
+ struct CollisionData
+ {
+ CollisionData();
+ SphereData* mSpheres;
+ ConeData* mCones;
+ };
+
+ struct ImpulseAccumulator;
+
+ public:
+ SwCollision(SwClothData& clothData, SwKernelAllocator& alloc);
+ ~SwCollision();
+
+ // run one collision iteration for the given solver state
+ void operator()(const IterationState<Simd4f>& state);
+
+ static size_t estimateTemporaryMemory(const SwCloth& cloth);
+ static size_t estimatePersistentMemory(const SwCloth& cloth);
+
+ private:
+ SwCollision& operator=(const SwCollision&); // not implemented
+ void allocate(CollisionData&);
+ void deallocate(const CollisionData&);
+
+ void computeBounds();
+
+ void buildSphereAcceleration(const SphereData*);
+ void buildConeAcceleration();
+ static void mergeAcceleration(uint32_t*);
+ bool buildAcceleration();
+
+ static ShapeMask getShapeMask(const Simd4f&, const Simd4i*, const Simd4i*);
+ ShapeMask getShapeMask(const Simd4f*) const;
+ ShapeMask getShapeMask(const Simd4f*, const Simd4f*) const;
+
+ void collideSpheres(const Simd4i&, const Simd4f*, ImpulseAccumulator&) const;
+ Simd4i collideCones(const Simd4f*, ImpulseAccumulator&) const;
+
+ void collideSpheres(const Simd4i&, const Simd4f*, Simd4f*, ImpulseAccumulator&) const;
+ Simd4i collideCones(const Simd4f*, Simd4f*, ImpulseAccumulator&) const;
+
+ void collideParticles();
+ void collideVirtualParticles();
+ void collideContinuousParticles();
+
+ void collideConvexes(const IterationState<Simd4f>&);
+ void collideConvexes(const Simd4f*, Simd4f*, ImpulseAccumulator&);
+
+ void collideTriangles(const IterationState<Simd4f>&);
+ void collideTriangles(const TriangleData*, Simd4f*, ImpulseAccumulator&);
+
+ public:
+ // acceleration structure
+ static const uint32_t sGridSize = 8;
+ // NOTE(review): presumably packed per-axis interval masks built by
+ // buildAcceleration() — confirm against buildSphereAcceleration()
+ Simd4i mSphereGrid[6 * sGridSize / 4];
+ Simd4i mConeGrid[6 * sGridSize / 4];
+ Simd4f mGridScale, mGridBias; // world-space to grid-cell transform
+
+ CollisionData mPrevData; // collision shape buffers (previous state)
+ CollisionData mCurData; // collision shape buffers (current state)
+
+ SwClothData& mClothData;
+ SwKernelAllocator& mAllocator;
+
+ uint32_t mNumCollisions; // statistics; updated in profile/debug builds
+
+ static const Simd4f sSkeletonWidth;
+};
+}
+}
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/SwCollisionHelpers.h b/PhysX_3.4/Source/LowLevelCloth/src/SwCollisionHelpers.h
new file mode 100644
index 00000000..230685bb
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/SwCollisionHelpers.h
@@ -0,0 +1,84 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#include "Simd.h"
+
+// platform specific helpers
+
+namespace physx
+{
+namespace cloth
+{
+
+// NOTE(review): presumably returns the index of a set bit in 'mask';
+// implementations live in the platform headers included below — confirm there.
+inline uint32_t findBitSet(uint32_t mask);
+
+// intFloor(-1.0f) returns -2 on SSE and NEON!
+inline Simd4i intFloor(const Simd4f& v);
+
+// NOTE(review): presumably ORs all lanes of 'mask' together — confirm in the
+// platform-specific implementation.
+inline Simd4i horizontalOr(const Simd4i& mask);
+
+// Per-lane table gather keyed by a Simd4i index; specialized per platform.
+template <typename>
+struct Gather;
+
+#if NV_SIMD_SIMD
+template <>
+struct Gather<Simd4i>
+{
+ inline Gather(const Simd4i& index);
+ inline Simd4i operator()(const Simd4i*) const;
+
+#if NV_SIMD_SSE2
+ Simd4i mSelectQ, mSelectD, mSelectW;
+ static const Simd4i sIntSignBit;
+ static const Simd4i sSignedMask;
+#elif NV_SIMD_NEON
+ Simd4i mPermute;
+ static const Simd4i sPack;
+ static const Simd4i sOffset;
+ static const Simd4i sShift;
+ static const Simd4i sMask;
+#endif
+ Simd4i mOutOfRange; // lanes whose index fell outside the table
+};
+#endif
+
+} // namespace cloth
+} // namespace physx
+
+#if NV_SIMD_SSE2
+#include "sse2/SwCollisionHelpers.h"
+#elif NV_SIMD_NEON
+#include "neon/SwCollisionHelpers.h"
+#endif
+
+#if NV_SIMD_SCALAR
+#include "scalar/SwCollisionHelpers.h"
+#endif
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/SwFabric.cpp b/PhysX_3.4/Source/LowLevelCloth/src/SwFabric.cpp
new file mode 100644
index 00000000..aa7f8356
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/SwFabric.cpp
@@ -0,0 +1,177 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#include "foundation/PxAssert.h"
+#include "SwFabric.h"
+#include "SwFactory.h"
+#include "PsSort.h"
+#include "limits.h" // for USHRT_MAX
+#include "PsUtilities.h"
+
+using namespace physx;
+using namespace shdfnd;
+
+// Tether constraint: anchor particle index plus rest length.
+cloth::SwTether::SwTether(uint16_t anchor, float length) : mAnchor(anchor), mLength(length)
+{
+}
+
+// Builds the CPU fabric: copies phase/set/restvalue/index data (padding each
+// constraint set up to a multiple of kSimdWidth so the solver can run aligned
+// SIMD batches), copies tether and triangle data, and registers the fabric
+// with its factory.
+cloth::SwFabric::SwFabric(SwFactory& factory, uint32_t numParticles, Range<const uint32_t> phases,
+ Range<const uint32_t> sets, Range<const float> restvalues, Range<const uint32_t> indices,
+ Range<const uint32_t> anchors, Range<const float> tetherLengths,
+ Range<const uint32_t> triangles, uint32_t id)
+: mFactory(factory), mNumParticles(numParticles), mTetherLengthScale(1.0f), mId(id)
+{
+ // should no longer be prefixed with 0
+ PX_ASSERT(sets.front() != 0);
+
+#if PX_WINDOWS
+ const uint32_t kSimdWidth = 8; // avx
+#else
+ const uint32_t kSimdWidth = 4;
+#endif
+
+ // consistency check
+ PX_ASSERT(sets.back() == restvalues.size());
+ PX_ASSERT(restvalues.size() * 2 == indices.size());
+ PX_ASSERT(mNumParticles > *maxElement(indices.begin(), indices.end()));
+ // indices (including dummy padding below) must fit in uint16_t
+ PX_ASSERT(mNumParticles + kSimdWidth - 1 <= USHRT_MAX);
+
+ mPhases.assign(phases.begin(), phases.end());
+ mSets.reserve(sets.size() + 1);
+ mSets.pushBack(0); // prefix with 0
+
+ // remember the unpadded count for getNumRestvalues()/getNumIndices()
+ mOriginalNumRestvalues = uint32_t(restvalues.size());
+
+ // pad indices for SIMD
+ const uint32_t* iBegin = indices.begin(), *iIt = iBegin;
+ const float* rBegin = restvalues.begin(), *rIt = rBegin;
+ const uint32_t* sIt, *sEnd = sets.end();
+ for(sIt = sets.begin(); sIt != sEnd; ++sIt)
+ {
+ const float* rEnd = rBegin + *sIt;
+ const uint32_t* iEnd = iBegin + *sIt * 2;
+ uint32_t numConstraints = uint32_t(rEnd - rIt);
+
+ for(; rIt != rEnd; ++rIt)
+ mRestvalues.pushBack(*rIt);
+
+ for(; iIt != iEnd; ++iIt)
+ mIndices.pushBack(uint16_t(*iIt));
+
+ // add dummy constraints to make each set a multiple of kSimdWidth
+ // (index >= mNumParticles, restvalue -FLT_MAX); extractFabricData()
+ // filters these back out
+ for(; numConstraints &= kSimdWidth - 1; ++numConstraints)
+ {
+ mRestvalues.pushBack(-FLT_MAX);
+ uint32_t index = mNumParticles + numConstraints - 1;
+ mIndices.pushBack(uint16_t(index));
+ mIndices.pushBack(uint16_t(index));
+ }
+
+ mSets.pushBack(uint32_t(mRestvalues.size()));
+ }
+
+ // trim overallocations (swap with exact-sized copies)
+ RestvalueContainer(mRestvalues.begin(), mRestvalues.end()).swap(mRestvalues);
+ Vector<uint16_t>::Type(mIndices.begin(), mIndices.end()).swap(mIndices);
+
+ // tethers
+ PX_ASSERT(anchors.size() == tetherLengths.size());
+
+ // pad to allow for direct 16 byte (unaligned) loads
+ mTethers.reserve(anchors.size() + 2);
+ for(; !anchors.empty(); anchors.popFront(), tetherLengths.popFront())
+ mTethers.pushBack(SwTether(uint16_t(anchors.front()), tetherLengths.front()));
+
+ // triangles, stored as flat uint16_t index triples
+ mTriangles.reserve(triangles.size());
+ const uint32_t* iEnd = triangles.end();
+ for(iIt = triangles.begin(); iIt != iEnd; ++iIt)
+ mTriangles.pushBack(uint16_t(*iIt));
+
+ mFactory.mFabrics.pushBack(this);
+}
+
+// Unregisters this fabric from its factory's fabric list.
+cloth::SwFabric::~SwFabric()
+{
+ Vector<SwFabric*>::Type::Iterator fIt = mFactory.mFabrics.find(this);
+ PX_ASSERT(fIt != mFactory.mFabrics.end());
+ mFactory.mFabrics.replaceWithLast(fIt);
+}
+
+// Returns the owning factory.
+cloth::Factory& physx::cloth::SwFabric::getFactory() const
+{
+ return mFactory;
+}
+
+// Number of simulation phases stored for this fabric.
+uint32_t cloth::SwFabric::getNumPhases() const
+{
+ return uint32_t(mPhases.size());
+}
+
+// Restvalue count as originally supplied (excludes the SIMD padding entries
+// that the constructor appends).
+uint32_t cloth::SwFabric::getNumRestvalues() const
+{
+ return mOriginalNumRestvalues;
+}
+
+// Number of constraint sets (mSets carries an extra 0 prefix entry).
+uint32_t cloth::SwFabric::getNumSets() const
+{
+ return uint32_t(mSets.size() - 1);
+}
+
+// Two particle indices per constraint; padding excluded.
+uint32_t cloth::SwFabric::getNumIndices() const
+{
+ return 2 * mOriginalNumRestvalues;
+}
+
+// Number of particles this fabric was built for.
+uint32_t cloth::SwFabric::getNumParticles() const
+{
+ return mNumParticles;
+}
+
+// Number of tether constraints.
+uint32_t physx::cloth::SwFabric::getNumTethers() const
+{
+ return uint32_t(mTethers.size());
+}
+
+// Triangles are stored as flat index triples, hence the division by 3.
+uint32_t physx::cloth::SwFabric::getNumTriangles() const
+{
+ return uint32_t(mTriangles.size()) / 3;
+}
+
+// Multiplies every stored rest value by 'scale'.
+// NOTE(review): this also scales the -FLT_MAX SIMD padding entries; harmless
+// as long as they remain strongly negative — confirm for scale <= 0.
+void physx::cloth::SwFabric::scaleRestvalues(float scale)
+{
+ RestvalueContainer::Iterator rIt, rEnd = mRestvalues.end();
+ for(rIt = mRestvalues.begin(); rIt != rEnd; ++rIt)
+ *rIt *= scale;
+}
+
+// Tether lengths are scaled lazily via a shared multiplier rather than by
+// rewriting every SwTether entry.
+void physx::cloth::SwFabric::scaleTetherLengths(float scale)
+{
+ mTetherLengthScale *= scale;
+}
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/SwFabric.h b/PhysX_3.4/Source/LowLevelCloth/src/SwFabric.h
new file mode 100644
index 00000000..b762bcb0
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/SwFabric.h
@@ -0,0 +1,109 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#include "foundation/PxVec4.h"
+#include "Allocator.h"
+#include "Fabric.h"
+#include "Types.h"
+#include "Range.h"
+
+namespace physx
+{
+
+namespace cloth
+{
+
+class SwFactory;
+
+// A single tether constraint: anchor particle index plus rest length
+// (the effective length is mLength * SwFabric::mTetherLengthScale).
+struct SwTether
+{
+ SwTether(uint16_t, float);
+ uint16_t mAnchor;
+ float mLength;
+};
+
+// CPU-side fabric: immutable constraint topology shared by SwCloth instances.
+class SwFabric : public UserAllocated, public Fabric
+{
+ public:
+#if PX_WINDOWS
+ typedef AlignedVector<float, 32>::Type RestvalueContainer; // avx
+#else
+ typedef AlignedVector<float, 16>::Type RestvalueContainer;
+#endif
+
+ SwFabric(SwFactory& factory, uint32_t numParticles, Range<const uint32_t> phases, Range<const uint32_t> sets,
+ Range<const float> restvalues, Range<const uint32_t> indices, Range<const uint32_t> anchors,
+ Range<const float> tetherLengths, Range<const uint32_t> triangles, uint32_t id);
+
+ SwFabric& operator=(const SwFabric&); // not implemented (reference member)
+
+ virtual ~SwFabric();
+
+ virtual Factory& getFactory() const;
+
+ virtual uint32_t getNumPhases() const;
+ virtual uint32_t getNumRestvalues() const;
+
+ virtual uint32_t getNumSets() const;
+ virtual uint32_t getNumIndices() const;
+
+ virtual uint32_t getNumParticles() const;
+
+ virtual uint32_t getNumTethers() const;
+
+ virtual uint32_t getNumTriangles() const;
+
+ virtual void scaleRestvalues(float);
+ virtual void scaleTetherLengths(float);
+
+ public:
+ SwFactory& mFactory;
+
+ uint32_t mNumParticles;
+
+ Vector<uint32_t>::Type mPhases; // index of set to use
+ Vector<uint32_t>::Type mSets; // offset of first restvalue, with 0 prefix
+
+ RestvalueContainer mRestvalues; // rest values (edge length), SIMD-padded
+ Vector<uint16_t>::Type mIndices; // particle index pairs
+
+ Vector<SwTether>::Type mTethers; // tether constraints
+ float mTetherLengthScale; // lazily applied, see scaleTetherLengths()
+
+ Vector<uint16_t>::Type mTriangles; // flat triples of particle indices
+
+ uint32_t mId; // factory-assigned fabric id
+
+ uint32_t mOriginalNumRestvalues; // restvalue count before SIMD padding
+
+} PX_ALIGN_SUFFIX(16);
+}
+}
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/SwFactory.cpp b/PhysX_3.4/Source/LowLevelCloth/src/SwFactory.cpp
new file mode 100644
index 00000000..92f17c98
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/SwFactory.cpp
@@ -0,0 +1,297 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#include "foundation/PxMemory.h"
+#include "SwFactory.h"
+#include "SwFabric.h"
+#include "SwCloth.h"
+#include "SwSolver.h"
+#include "ClothImpl.h"
+#include <string.h> // for memcpy
+
+using namespace physx;
+
+namespace physx
+{
+namespace cloth
+{
+// defined in Factory.cpp
+uint32_t getNextFabricId();
+}
+}
+
+// CPU (software) cloth factory.
+cloth::SwFactory::SwFactory() : Factory(CPU)
+{
+}
+
+// Nothing to release here; fabrics unregister themselves in ~SwFabric().
+cloth::SwFactory::~SwFactory()
+{
+}
+
+// Allocates a new SwFabric with the next process-wide fabric id
+// (getNextFabricId() is defined in Factory.cpp).
+cloth::Fabric* cloth::SwFactory::createFabric(uint32_t numParticles, Range<const uint32_t> phases,
+ Range<const uint32_t> sets, Range<const float> restvalues,
+ Range<const uint32_t> indices, Range<const uint32_t> anchors,
+ Range<const float> tetherLengths, Range<const uint32_t> triangles)
+{
+ return new SwFabric(*this, numParticles, phases, sets, restvalues, indices, anchors, tetherLengths, triangles,
+ getNextFabricId());
+}
+
+// Creates a CPU cloth instance from the given particles and fabric.
+cloth::Cloth* cloth::SwFactory::createCloth(Range<const PxVec4> particles, Fabric& fabric)
+{
+ return new SwClothImpl(*this, fabric, particles);
+}
+
+// Creates the CPU solver; returns null in the GPU-exports build, where
+// SwSolver is not compiled in.
+cloth::Solver* cloth::SwFactory::createSolver(physx::PxTaskManager* taskMgr)
+{
+#ifdef PX_PHYSX_GPU_EXPORTS
+ // SwSolver not defined in PhysXGpu project
+ PX_UNUSED(taskMgr);
+ return 0;
+#else
+ return new SwSolver(taskMgr);
+#endif
+}
+
+// Clones a cloth into this CPU factory. Cloths owned by a non-CPU factory
+// delegate to their own cross-platform clone path.
+cloth::Cloth* cloth::SwFactory::clone(const Cloth& cloth)
+{
+ if(cloth.getFactory().getPlatform() != Factory::CPU)
+ return cloth.clone(*this); // forward to CuCloth
+
+ // copy construct
+ return new SwClothImpl(*this, static_cast<const SwClothImpl&>(cloth));
+}
+
+// Copies fabric data back into the user-provided ranges, stripping the SIMD
+// padding constraints (dummy index >= mNumParticles, restvalue -FLT_MAX) that
+// the SwFabric constructor inserted. Empty destination ranges are skipped.
+void cloth::SwFactory::extractFabricData(const Fabric& fabric, Range<uint32_t> phases, Range<uint32_t> sets,
+ Range<float> restvalues, Range<uint32_t> indices, Range<uint32_t> anchors,
+ Range<float> tetherLengths, Range<uint32_t> triangles) const
+{
+ const SwFabric& swFabric = static_cast<const SwFabric&>(fabric);
+
+ PX_ASSERT(phases.empty() || phases.size() == swFabric.getNumPhases());
+ PX_ASSERT(restvalues.empty() || restvalues.size() == swFabric.getNumRestvalues());
+ PX_ASSERT(sets.empty() || sets.size() == swFabric.getNumSets());
+ PX_ASSERT(indices.empty() || indices.size() == swFabric.getNumIndices());
+ PX_ASSERT(anchors.empty() || anchors.size() == swFabric.getNumTethers());
+ PX_ASSERT(tetherLengths.empty() || tetherLengths.size() == swFabric.getNumTethers());
+
+ for(uint32_t i = 0; !phases.empty(); ++i, phases.popFront())
+ phases.front() = swFabric.mPhases[i];
+
+ const uint32_t* sEnd = swFabric.mSets.end(), *sIt;
+ const float* rBegin = swFabric.mRestvalues.begin(), *rIt = rBegin;
+ const uint16_t* iIt = swFabric.mIndices.begin();
+
+ uint32_t* sDst = sets.begin();
+ float* rDst = restvalues.begin();
+ uint32_t* iDst = indices.begin();
+
+ uint32_t numConstraints = 0;
+ // mSets is prefixed with 0, so start from the second entry
+ for(sIt = swFabric.mSets.begin(); ++sIt != sEnd;)
+ {
+ const float* rEnd = rBegin + *sIt;
+ for(; rIt != rEnd; ++rIt)
+ {
+ uint16_t i0 = *iIt++;
+ uint16_t i1 = *iIt++;
+
+ // dummy SIMD padding constraint — skip
+ if(PxMax(i0, i1) >= swFabric.mNumParticles)
+ continue;
+
+ if(!restvalues.empty())
+ *rDst++ = *rIt;
+
+ if(!indices.empty())
+ {
+ *iDst++ = i0;
+ *iDst++ = i1;
+ }
+
+ ++numConstraints;
+ }
+
+ // sets output is the running (unpadded) constraint count
+ if(!sets.empty())
+ *sDst++ = numConstraints;
+ }
+
+ for(uint32_t i = 0; !anchors.empty(); ++i, anchors.popFront())
+ anchors.front() = swFabric.mTethers[i].mAnchor;
+
+ // apply the lazily accumulated tether length scale on the way out
+ for(uint32_t i = 0; !tetherLengths.empty(); ++i, tetherLengths.popFront())
+ tetherLengths.front() = swFabric.mTethers[i].mLength * swFabric.mTetherLengthScale;
+
+ for(uint32_t i = 0; !triangles.empty(); ++i, triangles.popFront())
+ triangles.front() = swFabric.mTriangles[i];
+}
+
+// Copies the start-of-frame collision shape data (spheres, capsule index
+// pairs, planes, convex masks, triangles) into the user-provided ranges.
+// Empty destination ranges are skipped.
+void cloth::SwFactory::extractCollisionData(const Cloth& cloth, Range<PxVec4> spheres, Range<uint32_t> capsules,
+ Range<PxVec4> planes, Range<uint32_t> convexes, Range<PxVec3> triangles) const
+{
+ PX_ASSERT(&cloth.getFactory() == this);
+
+ const SwCloth& swCloth = static_cast<const SwClothImpl&>(cloth).mCloth;
+
+ PX_ASSERT(spheres.empty() || spheres.size() == swCloth.mStartCollisionSpheres.size());
+ PX_ASSERT(capsules.empty() || capsules.size() == swCloth.mCapsuleIndices.size() * 2);
+ PX_ASSERT(planes.empty() || planes.size() == swCloth.mStartCollisionPlanes.size());
+ PX_ASSERT(convexes.empty() || convexes.size() == swCloth.mConvexMasks.size());
+ PX_ASSERT(triangles.empty() || triangles.size() == swCloth.mStartCollisionTriangles.size());
+
+ if(!swCloth.mStartCollisionSpheres.empty() && !spheres.empty())
+ memcpy(spheres.begin(), &swCloth.mStartCollisionSpheres.front(),
+ swCloth.mStartCollisionSpheres.size() * sizeof(PxVec4));
+
+ // each IndexPair provides the two uint32_t capsule endpoints
+ if(!swCloth.mCapsuleIndices.empty() && !capsules.empty())
+ memcpy(capsules.begin(), &swCloth.mCapsuleIndices.front(), swCloth.mCapsuleIndices.size() * sizeof(IndexPair));
+
+ if(!swCloth.mStartCollisionPlanes.empty() && !planes.empty())
+ memcpy(planes.begin(), &swCloth.mStartCollisionPlanes.front(),
+ swCloth.mStartCollisionPlanes.size() * sizeof(PxVec4));
+
+ if(!swCloth.mConvexMasks.empty() && !convexes.empty())
+ memcpy(convexes.begin(), &swCloth.mConvexMasks.front(), swCloth.mConvexMasks.size() * sizeof(uint32_t));
+
+ if(!swCloth.mStartCollisionTriangles.empty() && !triangles.empty())
+ memcpy(triangles.begin(), &swCloth.mStartCollisionTriangles.front(),
+ swCloth.mStartCollisionTriangles.size() * sizeof(PxVec3));
+}
+
+// Copies the current motion constraints: the target buffer if one is set,
+// otherwise the start buffer.
+void cloth::SwFactory::extractMotionConstraints(const Cloth& cloth, Range<PxVec4> destConstraints) const
+{
+ PX_ASSERT(&cloth.getFactory() == this);
+
+ const SwCloth& swCloth = static_cast<const SwClothImpl&>(cloth).mCloth;
+
+ Vec4fAlignedVector const& srcConstraints = !swCloth.mMotionConstraints.mTarget.empty()
+ ? swCloth.mMotionConstraints.mTarget
+ : swCloth.mMotionConstraints.mStart;
+
+ if(!srcConstraints.empty())
+ {
+ // make sure dest array is big enough
+ PX_ASSERT(destConstraints.size() == srcConstraints.size());
+
+ memcpy(destConstraints.begin(), &srcConstraints.front(), srcConstraints.size() * sizeof(PxVec4));
+ }
+}
+
+// Copies the current separation constraints: the target buffer if one is set,
+// otherwise the start buffer (mirrors extractMotionConstraints).
+void cloth::SwFactory::extractSeparationConstraints(const Cloth& cloth, Range<PxVec4> destConstraints) const
+{
+ PX_ASSERT(&cloth.getFactory() == this);
+
+ const SwCloth& swCloth = static_cast<const SwClothImpl&>(cloth).mCloth;
+
+ Vec4fAlignedVector const& srcConstraints = !swCloth.mSeparationConstraints.mTarget.empty()
+ ? swCloth.mSeparationConstraints.mTarget
+ : swCloth.mSeparationConstraints.mStart;
+
+ if(!srcConstraints.empty())
+ {
+ // make sure dest array is big enough
+ PX_ASSERT(destConstraints.size() == srcConstraints.size());
+
+ memcpy(destConstraints.begin(), &srcConstraints.front(), srcConstraints.size() * sizeof(PxVec4));
+ }
+}
+
+// Copies the per-particle acceleration buffer, if any, into the user range.
+void cloth::SwFactory::extractParticleAccelerations(const Cloth& cloth, Range<PxVec4> destAccelerations) const
+{
+ PX_ASSERT(&cloth.getFactory() == this);
+
+ const SwCloth& swCloth = static_cast<const SwClothImpl&>(cloth).mCloth;
+
+ if(!swCloth.mParticleAccelerations.empty())
+ {
+ // make sure dest array is big enough
+ PX_ASSERT(destAccelerations.size() == swCloth.mParticleAccelerations.size());
+
+ memcpy(destAccelerations.begin(), &swCloth.mParticleAccelerations.front(),
+ swCloth.mParticleAccelerations.size() * sizeof(PxVec4));
+ }
+}
+
+// Copies virtual-particle weights (vec4 -> vec3) and index quadruples back to
+// the user, skipping dummy index entries whose first index is out of range.
+void cloth::SwFactory::extractVirtualParticles(const Cloth& cloth, Range<uint32_t[4]> indices, Range<PxVec3> weights) const
+{
+ PX_ASSERT(this == &cloth.getFactory());
+
+ const SwCloth& swCloth = static_cast<const SwClothImpl&>(cloth).mCloth;
+
+ uint32_t numIndices = cloth.getNumVirtualParticles();
+ uint32_t numWeights = cloth.getNumVirtualParticleWeights();
+
+ PX_ASSERT(indices.size() == numIndices || indices.empty());
+ PX_ASSERT(weights.size() == numWeights || weights.empty());
+
+ if(weights.size() == numWeights)
+ {
+ PxVec3* wDestIt = reinterpret_cast<PxVec3*>(weights.begin());
+
+ // convert weights from vec4 to vec3
+ cloth::Vec4fAlignedVector::ConstIterator wIt = swCloth.mVirtualParticleWeights.begin();
+ cloth::Vec4fAlignedVector::ConstIterator wEnd = wIt + numWeights;
+
+ for(; wIt != wEnd; ++wIt, ++wDestIt)
+ *wDestIt = PxVec3(wIt->x, wIt->y, wIt->z);
+
+ PX_ASSERT(wDestIt == weights.end());
+ }
+ if(indices.size() == numIndices)
+ {
+ // convert indices
+ Vec4u* iDestIt = reinterpret_cast<Vec4u*>(indices.begin());
+ Vector<Vec4us>::Type::ConstIterator iIt = swCloth.mVirtualParticleIndices.begin();
+ Vector<Vec4us>::Type::ConstIterator iEnd = swCloth.mVirtualParticleIndices.end();
+
+ uint32_t numParticles = uint32_t(swCloth.mCurParticles.size());
+
+ for(; iIt != iEnd; ++iIt)
+ {
+ // skip dummy indices
+ if(iIt->x < numParticles)
+ // widen 16-bit indices to 32-bit on the way out
+ *iDestIt++ = Vec4u(*iIt);
+ }
+
+ PX_ASSERT(&array(*iDestIt) == indices.end());
+ }
+}
+
+// Copies the self-collision particle index list into the user range.
+void cloth::SwFactory::extractSelfCollisionIndices(const Cloth& cloth, Range<uint32_t> destIndices) const
+{
+ const SwCloth& swCloth = static_cast<const SwClothImpl&>(cloth).mCloth;
+ PX_ASSERT(destIndices.size() == swCloth.mSelfCollisionIndices.size());
+ PxMemCopy(destIndices.begin(), swCloth.mSelfCollisionIndices.begin(), destIndices.size() * sizeof(uint32_t));
+}
+
+// Copies the rest-position buffer into the user range.
+void cloth::SwFactory::extractRestPositions(const Cloth& cloth, Range<PxVec4> destRestPositions) const
+{
+ const SwCloth& swCloth = static_cast<const SwClothImpl&>(cloth).mCloth;
+ PX_ASSERT(destRestPositions.size() == swCloth.mRestPositions.size());
+ PxMemCopy(destRestPositions.begin(), swCloth.mRestPositions.begin(), destRestPositions.size() * sizeof(PxVec4));
+}
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/SwFactory.h b/PhysX_3.4/Source/LowLevelCloth/src/SwFactory.h
new file mode 100644
index 00000000..154fb965
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/SwFactory.h
@@ -0,0 +1,90 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#include "Factory.h"
+#include "Allocator.h"
+
+namespace physx
+{
+
+namespace cloth
+{
+
+class SwFabric;
+class SwCloth;
+template <typename>
+class ClothImpl;
+
+// Software (CPU) implementation of the cloth Factory interface: creates
+// SwFabric/SwCloth instances and copies simulation data back out into
+// caller-provided ranges (the extract* methods; destination sizes must match
+// the source data exactly — see the asserts in SwFactory.cpp).
+class SwFactory : public UserAllocated, public Factory
+{
+  public:
+	typedef SwFabric FabricType;
+	typedef ClothImpl<SwCloth> ImplType;
+
+	SwFactory();
+	virtual ~SwFactory();
+
+	// builds a fabric (constraint topology + rest values) from raw phase/set/index data
+	virtual Fabric* createFabric(uint32_t numParticles, Range<const uint32_t> phases, Range<const uint32_t> sets,
+	                             Range<const float> restvalues, Range<const uint32_t> indices,
+	                             Range<const uint32_t> anchors, Range<const float> tetherLengths,
+	                             Range<const uint32_t> triangles);
+
+	// creates a cloth instance over the given fabric with the given initial particles
+	virtual Cloth* createCloth(Range<const PxVec4> particles, Fabric& fabric);
+
+	virtual Solver* createSolver(physx::PxTaskManager*);
+
+	// creates a copy of an existing cloth instance
+	virtual Cloth* clone(const Cloth& cloth);
+
+	virtual void extractFabricData(const Fabric& fabric, Range<uint32_t> phases, Range<uint32_t> sets,
+	                               Range<float> restvalues, Range<uint32_t> indices, Range<uint32_t> anchors,
+	                               Range<float> tetherLengths, Range<uint32_t> triangles) const;
+
+	virtual void extractCollisionData(const Cloth& cloth, Range<PxVec4> spheres, Range<uint32_t> capsules,
+	                                  Range<PxVec4> planes, Range<uint32_t> convexes, Range<PxVec3> triangles) const;
+
+	virtual void extractMotionConstraints(const Cloth& cloth, Range<PxVec4> destConstraints) const;
+
+	virtual void extractSeparationConstraints(const Cloth& cloth, Range<PxVec4> destConstraints) const;
+
+	virtual void extractParticleAccelerations(const Cloth& cloth, Range<PxVec4> destAccelerations) const;
+
+	virtual void extractVirtualParticles(const Cloth& cloth, Range<uint32_t[4]> destIndices,
+	                                     Range<PxVec3> destWeights) const;
+
+	virtual void extractSelfCollisionIndices(const Cloth& cloth, Range<uint32_t> destIndices) const;
+
+	virtual void extractRestPositions(const Cloth& cloth, Range<PxVec4> destRestPositions) const;
+
+  public:
+	// all SwFabric instances created by this factory
+	// NOTE(review): registration/removal sites are not visible here — confirm in SwFabric.cpp
+	Vector<SwFabric*>::Type mFabrics;
+};
+}
+}
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/SwInterCollision.cpp b/PhysX_3.4/Source/LowLevelCloth/src/SwInterCollision.cpp
new file mode 100644
index 00000000..d0c8691a
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/SwInterCollision.cpp
@@ -0,0 +1,714 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#include "foundation/PxProfiler.h"
+#include "foundation/PxMemory.h"
+#include "SwInterCollision.h"
+#include "SwCollisionHelpers.h"
+#include "BoundingBox.h"
+#include "PsIntrinsics.h"
+#include "PsSort.h"
+
+using namespace physx;
+
+namespace
+{
+
+// lane masks / constants used by the SIMD collision kernels below:
+// sMaskXYZ selects the xyz lanes, sMaskW selects only the w lane
+const Simd4fTupleFactory sMaskXYZ = simd4f(simd4i(~0, ~0, ~0, 0));
+const Simd4fTupleFactory sMaskW = simd4f(simd4i(0, 0, 0, ~0));
+const Simd4fScalarFactory sEpsilon = simd4f(FLT_EPSILON);
+// -FLT_MAX in xyz with w forced to zero; used to clamp w non-negative via max()
+const Simd4fTupleFactory sZeroW = simd4f(-FLT_MAX, -FLT_MAX, -FLT_MAX, 0.0f);
+
+// LSD radix sort over 32-bit keys, 8 bits per pass (4 passes).
+// Returns sorted *indices* into [first, last); 'out' needs room for at least
+// 2*(last-first)+1024 entries: the first 2n entries serve as ping-pong index
+// buffers (final result in out[0..n)), the trailing 1024 entries hold the
+// four 256-bin histograms / offset tables.
+void radixSort(const uint32_t* first, const uint32_t* last, uint32_t* out)
+{
+	uint32_t n = uint32_t(last - first);
+
+	uint32_t* buffer = out + 2 * n;
+	uint32_t* __restrict histograms[] = { buffer, buffer + 256, buffer + 512, buffer + 768 };
+
+	PxMemZero(buffer, 1024 * sizeof(uint32_t));
+
+	// build 4 histograms (one per key byte) in one pass
+	for(const uint32_t* __restrict it = first; it != last; ++it)
+	{
+		uint32_t key = *it;
+		++histograms[0][0xff & key];
+		++histograms[1][0xff & (key >> 8)];
+		++histograms[2][0xff & (key >> 16)];
+		++histograms[3][key >> 24];
+	}
+
+	// convert histograms to offset tables in-place (exclusive prefix sums)
+	uint32_t sums[4] = {};
+	for(uint32_t i = 0; i < 256; ++i)
+	{
+		uint32_t temp0 = histograms[0][i] + sums[0];
+		histograms[0][i] = sums[0];
+		sums[0] = temp0;
+
+		uint32_t temp1 = histograms[1][i] + sums[1];
+		histograms[1][i] = sums[1];
+		sums[1] = temp1;
+
+		uint32_t temp2 = histograms[2][i] + sums[2];
+		histograms[2][i] = sums[2];
+		sums[2] = temp2;
+
+		uint32_t temp3 = histograms[3][i] + sums[3];
+		histograms[3][i] = sums[3];
+		sums[3] = temp3;
+	}
+
+	PX_ASSERT(sums[0] == n && sums[1] == n && sums[2] == n && sums[3] == n);
+
+#if PX_DEBUG
+	memset(out, 0xff, 2 * n * sizeof(uint32_t));
+#endif
+
+	// sort 8 bits per pass, ping-ponging between the two index buffers;
+	// after the 4th (most significant byte) pass the result lands in indices[0] == out
+
+	uint32_t* __restrict indices[] = { out, out + n };
+
+	for(uint32_t i = 0; i != n; ++i)
+		indices[1][histograms[0][0xff & first[i]]++] = i;
+
+	for(uint32_t i = 0, index; i != n; ++i)
+	{
+		index = indices[1][i];
+		indices[0][histograms[1][0xff & (first[index] >> 8)]++] = index;
+	}
+
+	for(uint32_t i = 0, index; i != n; ++i)
+	{
+		index = indices[0][i];
+		indices[1][histograms[2][0xff & (first[index] >> 16)]++] = index;
+	}
+
+	for(uint32_t i = 0, index; i != n; ++i)
+	{
+		index = indices[1][i];
+		indices[0][histograms[3][first[index] >> 24]++] = index;
+	}
+}
+
+// Index (0, 1 or 2) of the largest of the first three lanes of edgeLength.
+template <typename Simd4f>
+uint32_t longestAxis(const Simd4f& edgeLength)
+{
+	const float* len = array(edgeLength);
+
+	// strict '>' comparisons: on ties the higher axis index wins
+	if(len[0] > len[1])
+		return len[0] > len[2] ? 0u : 2u;
+	return len[1] > len[2] ? 1u : 2u;
+}
+}
+
+// Captures the per-cloth instance array and precomputes collision parameters.
+// colDist:   particle-particle collision radius
+// stiffness: scale applied to the positional correction in collideParticle()
+// filter:    per-cloth-pair collision filter callback (must be non-null)
+// alloc:     stack allocator providing all scratch memory used by operator()
+template <typename Simd4f>
+cloth::SwInterCollision<Simd4f>::SwInterCollision(const cloth::SwInterCollisionData* instances, uint32_t n,
+                                                  float colDist, float stiffness, uint32_t iterations,
+                                                  InterCollisionFilter filter, cloth::SwKernelAllocator& alloc)
+: mInstances(instances)
+, mNumInstances(n)
+, mClothIndices(NULL)
+, mParticleIndices(NULL)
+, mNumParticles(0)
+, mTotalParticles(0)
+, mFilter(filter)
+, mAllocator(alloc)
+{
+	PX_ASSERT(mFilter);
+
+	// replicate distance into xyz, keep w zero so masked SIMD ops stay clean
+	mCollisionDistance = simd4f(colDist, colDist, colDist, 0.0f);
+	mCollisionSquareDistance = mCollisionDistance * mCollisionDistance;
+	mStiffness = simd4f(stiffness);
+	mNumIterations = iterations;
+
+	// total particle count across all instances (sizes the scratch arrays)
+	for(uint32_t i = 0; i < n; ++i)
+		mTotalParticles += instances[i].mNumParticles;
+}
+
+template <typename Simd4f>
+cloth::SwInterCollision<Simd4f>::~SwInterCollision()
+{
+	// nothing to release: all scratch arrays are allocated and freed inside operator()
+}
+
+namespace
+{
+// multiply x by the affine matrix m (columns m[0..3]), leaving the w component of x intact
+template <typename Simd4f>
+PX_INLINE Simd4f transform(const Simd4f m[4], const Simd4f& x)
+{
+	const Simd4f a = m[3] + splat<0>(x) * m[0] + splat<1>(x) * m[1] + splat<2>(x) * m[2];
+	return select(sMaskXYZ, a, x);
+}
+
+// rotate x by m (no translation column applied), leaving the w component of x intact
+template <typename Simd4f>
+PX_INLINE Simd4f rotate(const Simd4f m[4], const Simd4f& x)
+{
+	const Simd4f a = splat<0>(x) * m[0] + splat<1>(x) * m[1] + splat<2>(x) * m[2];
+	return select(sMaskXYZ, a, x);
+}
+
+// Comparator ordering cloth indices by the minimum extent of their world
+// bounds along a fixed axis; used for the broad-phase sweep in
+// calculatePotentialColliders().
+template <typename Simd4f>
+struct ClothSorter
+{
+	typedef cloth::BoundingBox<Simd4f> BoundingBox;
+
+	ClothSorter(BoundingBox* bounds, uint32_t n, uint32_t axis) : mBounds(bounds), mNumBounds(n), mAxis(axis)
+	{
+	}
+
+	// strict weak ordering on the lower bound along mAxis
+	bool operator()(uint32_t i, uint32_t j) const
+	{
+		PX_ASSERT(i < mNumBounds);
+		PX_ASSERT(j < mNumBounds);
+
+		return array(mBounds[i].mLower)[mAxis] < array(mBounds[j].mLower)[mAxis];
+	}
+
+	BoundingBox* mBounds;   // per-cloth world-space bounds (not owned)
+	uint32_t mNumBounds;    // only used for the asserts above
+	uint32_t mAxis;         // 0, 1 or 2: the sweep axis
+};
+
+// for the given cloth array this function calculates the set of particles
+// which potentially interact, the potential colliders are returned with their
+// cloth index and particle index in clothIndices and particleIndices, the
+// function returns the number of potential colliders.
+// Side effects: selected particles are transformed to world space in-place
+// (restored by the caller after collision), prev-particles are overwritten
+// with the world-space impulse, overlapMasks[i] receives a bit mask of the
+// cloths overlapping cloth i, and 'bounds' is expanded to cover all selected
+// particles in world space.
+template <typename Simd4f>
+uint32_t calculatePotentialColliders(const cloth::SwInterCollisionData* cBegin, const cloth::SwInterCollisionData* cEnd,
+                                     const Simd4f& colDist, uint16_t* clothIndices, uint32_t* particleIndices,
+                                     cloth::BoundingBox<Simd4f>& bounds, uint32_t* overlapMasks,
+                                     cloth::InterCollisionFilter filter, cloth::SwKernelAllocator& allocator)
+{
+	using namespace cloth;
+
+	typedef BoundingBox<Simd4f> BoundingBox;
+
+	uint32_t numParticles = 0;
+	const uint32_t numCloths = uint32_t(cEnd - cBegin);
+
+	// bounds of each cloth object in world space
+	BoundingBox* const clothBounds = static_cast<BoundingBox*>(allocator.allocate(numCloths * sizeof(BoundingBox)));
+	BoundingBox* const overlapBounds = static_cast<BoundingBox*>(allocator.allocate(numCloths * sizeof(BoundingBox)));
+
+	// union of all cloth world bounds
+	BoundingBox totalClothBounds = emptyBounds<Simd4f>();
+
+	uint32_t* sortedIndices = static_cast<uint32_t*>(allocator.allocate(numCloths * sizeof(uint32_t)));
+
+	for(uint32_t i = 0; i < numCloths; ++i)
+	{
+		const SwInterCollisionData& c = cBegin[i];
+
+		// compute world-space bounds of cloth c, inflated by the collision distance
+		PxBounds3 lcBounds = PxBounds3::centerExtents(c.mBoundsCenter, c.mBoundsHalfExtent + PxVec3(array(colDist)[0]));
+		PX_ASSERT(!lcBounds.isEmpty());
+		PxBounds3 cWorld = PxBounds3::transformFast(c.mGlobalPose, lcBounds);
+
+		BoundingBox cBounds = { simd4f(cWorld.minimum.x, cWorld.minimum.y, cWorld.minimum.z, 0.0f),
+			                    simd4f(cWorld.maximum.x, cWorld.maximum.y, cWorld.maximum.z, 0.0f) };
+
+		sortedIndices[i] = i;
+		clothBounds[i] = cBounds;
+
+		totalClothBounds = expandBounds(totalClothBounds, cBounds);
+	}
+
+	// sort indices by their minimum extent on the longest axis (sweep and prune)
+	const uint32_t sweepAxis = longestAxis(totalClothBounds.mUpper - totalClothBounds.mLower);
+
+	ClothSorter<Simd4f> predicate(clothBounds, numCloths, sweepAxis);
+	shdfnd::sort(sortedIndices, numCloths, predicate);
+
+	for(uint32_t i = 0; i < numCloths; ++i)
+	{
+		PX_ASSERT(sortedIndices[i] < numCloths);
+
+		const SwInterCollisionData& a = cBegin[sortedIndices[i]];
+
+		// local bounds of cloth a, inflated by the collision distance
+		const Simd4f aCenter = load(reinterpret_cast<const float*>(&a.mBoundsCenter));
+		const Simd4f aHalfExtent = load(reinterpret_cast<const float*>(&a.mBoundsHalfExtent)) + colDist;
+		const BoundingBox aBounds = { aCenter - aHalfExtent, aCenter + aHalfExtent };
+
+		const PxMat44 aToWorld(a.mGlobalPose);
+		const PxTransform aToLocal(a.mGlobalPose.getInverse());
+
+		const float axisMin = array(clothBounds[sortedIndices[i]].mLower)[sweepAxis];
+		const float axisMax = array(clothBounds[sortedIndices[i]].mUpper)[sweepAxis];
+
+		uint32_t overlapMask = 0;
+		uint32_t numOverlaps = 0;
+
+		// scan back to find first intersecting bounding box
+		uint32_t startIndex = i;
+		while(startIndex > 0 && array(clothBounds[sortedIndices[startIndex]].mUpper)[sweepAxis] > axisMin)
+			--startIndex;
+
+		// compute all overlapping bounds
+		for(uint32_t j = startIndex; j < numCloths; ++j)
+		{
+			// ignore self-collision (handled elsewhere)
+			if(i == j)
+				continue;
+
+			// early out if no more cloths along axis intersect us
+			if(array(clothBounds[sortedIndices[j]].mLower)[sweepAxis] > axisMax)
+				break;
+
+			const SwInterCollisionData& b = cBegin[sortedIndices[j]];
+
+			// check if collision between these shapes is filtered
+			if(!filter(a.mUserData, b.mUserData))
+				continue;
+
+			// set mask bit for this cloth
+			// NOTE(review): 32-bit mask limits distinguishable cloths to 32;
+			// shift is unverified for sortedIndices[j] >= 32 — confirm callers cap instance count
+			overlapMask |= 1 << sortedIndices[j];
+
+			// transform bounds from b local space to local space of a
+			PxBounds3 lcBounds =
+			    PxBounds3::centerExtents(b.mBoundsCenter, b.mBoundsHalfExtent + PxVec3(array(colDist)[0]));
+			PX_ASSERT(!lcBounds.isEmpty());
+			PxBounds3 bLocal = PxBounds3::transformFast(aToLocal * b.mGlobalPose, lcBounds);
+
+			BoundingBox bBounds = { simd4f(bLocal.minimum.x, bLocal.minimum.y, bLocal.minimum.z, 0.0f),
+				                    simd4f(bLocal.maximum.x, bLocal.maximum.y, bLocal.maximum.z, 0.0f) };
+
+			BoundingBox iBounds = intersectBounds(aBounds, bBounds);
+
+			// setup bounding box w to make point containment test cheaper:
+			// w spans [-FLT_MAX, FLT_MAX] so the w lane never rejects a point
+			Simd4f floatMax = gSimd4fFloatMax & static_cast<Simd4f>(sMaskW);
+			iBounds.mLower = (iBounds.mLower & sMaskXYZ) | -floatMax;
+			iBounds.mUpper = (iBounds.mUpper & sMaskXYZ) | floatMax;
+
+			if(!isEmptyBounds(iBounds))
+				overlapBounds[numOverlaps++] = iBounds;
+		}
+
+		//----------------------------------------------------------------
+		// cull all particles to overlapping bounds and transform particles to world space
+
+		const uint32_t clothIndex = sortedIndices[i];
+		overlapMasks[clothIndex] = overlapMask;
+
+		Simd4f* pBegin = reinterpret_cast<Simd4f*>(a.mParticles);
+		Simd4f* qBegin = reinterpret_cast<Simd4f*>(a.mPrevParticles);
+
+		const Simd4f xform[4] = { load(reinterpret_cast<const float*>(&aToWorld.column0)),
+			                      load(reinterpret_cast<const float*>(&aToWorld.column1)),
+			                      load(reinterpret_cast<const float*>(&aToWorld.column2)),
+			                      load(reinterpret_cast<const float*>(&aToWorld.column3)) };
+
+		Simd4f impulseInvScale = recip(Simd4f(simd4f(cBegin[clothIndex].mImpulseScale)));
+
+		for(uint32_t k = 0; k < a.mNumParticles; ++k)
+		{
+			// mIndices, when present, restricts collision to a subset of particles
+			Simd4f* pIt = a.mIndices ? pBegin + a.mIndices[k] : pBegin + k;
+			Simd4f* qIt = a.mIndices ? qBegin + a.mIndices[k] : qBegin + k;
+
+			const Simd4f p = *pIt;
+
+			for(const BoundingBox* oIt = overlapBounds, *oEnd = overlapBounds + numOverlaps; oIt != oEnd; ++oIt)
+			{
+				// point in box test
+				if(anyGreater(oIt->mLower, p) != 0)
+					continue;
+				if(anyGreater(p, oIt->mUpper) != 0)
+					continue;
+
+				// transform particle to world space in-place
+				// (will be transformed back after collision)
+				*pIt = transform(xform, p);
+
+				// prev-particle slot temporarily holds the world-space impulse
+				Simd4f impulse = (p - *qIt) * impulseInvScale;
+				*qIt = rotate(xform, impulse);
+
+				// update world bounds
+				bounds = expandBounds(bounds, pIt, pIt + 1);
+
+				// add particle to output arrays
+				clothIndices[numParticles] = uint16_t(clothIndex);
+				particleIndices[numParticles] = uint32_t(pIt - pBegin);
+
+				// output each particle only once
+				++numParticles;
+				break;
+			}
+		}
+	}
+
+	// stack allocator: deallocate in reverse order of allocation
+	allocator.deallocate(sortedIndices);
+	allocator.deallocate(overlapBounds);
+	allocator.deallocate(clothBounds);
+
+	return numParticles;
+}
+}
+
+// Resolves the index-th entry of the potential-collider set to a reference
+// into the owning cloth's particle storage.
+template <typename Simd4f>
+PX_INLINE Simd4f& cloth::SwInterCollision<Simd4f>::getParticle(uint32_t index)
+{
+	PX_ASSERT(index < mNumParticles);
+
+	const uint16_t cloth = mClothIndices[index];
+	const uint32_t particle = mParticleIndices[index];
+	PX_ASSERT(cloth < mNumInstances);
+
+	return reinterpret_cast<Simd4f&>(mInstances[cloth].mParticles[particle]);
+}
+
+// Runs mNumIterations passes of cloth-cloth collision:
+//  1) broad phase: find potentially colliding particles across instances and
+//     transform them to world space (calculatePotentialColliders),
+//  2) narrow phase: hash particles into a grid along the longest bounds axis,
+//     radix-sort the cell keys and resolve pairwise repulsions (collideParticles),
+//  3) transform the touched particles and impulses back to cloth-local space.
+// All scratch memory is taken from (and returned to) mAllocator.
+template <typename Simd4f>
+void cloth::SwInterCollision<Simd4f>::operator()()
+{
+	mNumTests = mNumCollisions = 0;
+
+	mClothIndices = static_cast<uint16_t*>(mAllocator.allocate(sizeof(uint16_t) * mTotalParticles));
+	mParticleIndices = static_cast<uint32_t*>(mAllocator.allocate(sizeof(uint32_t) * mTotalParticles));
+	// element type is uint32_t (was sizeof(uint32_t*), which over-allocated on
+	// 64-bit and disagreed with estimateTemporaryMemory's budget)
+	mOverlapMasks = static_cast<uint32_t*>(mAllocator.allocate(sizeof(uint32_t) * mNumInstances));
+
+	for(uint32_t k = 0; k < mNumIterations; ++k)
+	{
+		// world bounds of particles
+		BoundingBox<Simd4f> bounds = emptyBounds<Simd4f>();
+
+		// calculate potentially colliding set
+		{
+			PX_PROFILE_ZONE("cloth::SwInterCollision::BroadPhase", 0);
+
+			mNumParticles =
+			    calculatePotentialColliders(mInstances, mInstances + mNumInstances, mCollisionDistance, mClothIndices,
+			                                mParticleIndices, bounds, mOverlapMasks, mFilter, mAllocator);
+		}
+
+		// collide
+		if(mNumParticles)
+		{
+			PX_PROFILE_ZONE("cloth::SwInterCollision::Collide", 0);
+
+			Simd4f lowerBound = bounds.mLower;
+			Simd4f edgeLength = max(bounds.mUpper - lowerBound, sEpsilon);
+
+			// sweep along longest axis
+			uint32_t sweepAxis = longestAxis(edgeLength);
+			uint32_t hashAxis0 = (sweepAxis + 1) % 3;
+			uint32_t hashAxis1 = (sweepAxis + 2) % 3;
+
+			// reserve 0, 127, and 65535 for sentinel
+			Simd4f cellSize = max(mCollisionDistance, simd4f(1.0f / 253) * edgeLength);
+			array(cellSize)[sweepAxis] = array(edgeLength)[sweepAxis] / 65533;
+
+			Simd4f one = gSimd4fOne;
+			// 254 cells on the hash axes, 65534 along the sweep axis
+			Simd4f gridSize = simd4f(254.0f);
+			array(gridSize)[sweepAxis] = 65534.0f;
+
+			Simd4f gridScale = recip<1>(cellSize);
+			Simd4f gridBias = -lowerBound * gridScale + one;
+
+			void* buffer = mAllocator.allocate(getBufferSize(mNumParticles));
+
+			// layout: [sortedIndices n][sortedKeys n][keys n], where the keys
+			// region also leaves radixSort its 2n+1024 scratch requirement
+			uint32_t* __restrict sortedIndices = reinterpret_cast<uint32_t*>(buffer);
+			uint32_t* __restrict sortedKeys = sortedIndices + mNumParticles;
+			uint32_t* __restrict keys = PxMax(sortedKeys + mNumParticles, sortedIndices + 2 * mNumParticles + 1024);
+
+			typedef typename Simd4fToSimd4i<Simd4f>::Type Simd4i;
+
+			// create keys: sweep-axis cell in the low 16 bits, hash axes in bits 16-23 and 24-31
+			for(uint32_t i = 0; i < mNumParticles; ++i)
+			{
+				// grid coordinate
+				Simd4f indexf = getParticle(i) * gridScale + gridBias;
+
+				// need to clamp index because shape collision potentially
+				// pushes particles outside of their original bounds
+				Simd4i indexi = intFloor(max(one, min(indexf, gridSize)));
+
+				const int32_t* ptr = array(indexi);
+				keys[i] = uint32_t(ptr[sweepAxis] | (ptr[hashAxis0] << 16) | (ptr[hashAxis1] << 24));
+			}
+
+			// compute sorted keys indices
+			radixSort(keys, keys + mNumParticles, sortedIndices);
+
+			// snoop histogram: offset of first index with 8 msb > 1 (0 is sentinel)
+			uint32_t firstColumnSize = sortedIndices[2 * mNumParticles + 769];
+
+			// sort keys
+			for(uint32_t i = 0; i < mNumParticles; ++i)
+				sortedKeys[i] = keys[sortedIndices[i]];
+			sortedKeys[mNumParticles] = uint32_t(-1); // sentinel
+
+			// calculate the number of buckets we need to search forward
+			const Simd4i data = intFloor(gridScale * mCollisionDistance);
+			uint32_t collisionDistance = uint32_t(2 + array(data)[sweepAxis]);
+
+			// collide particles
+			collideParticles(sortedKeys, firstColumnSize, sortedIndices, mNumParticles, collisionDistance);
+
+			mAllocator.deallocate(buffer);
+		}
+
+		/*
+		// verify against brute force (disable collision response when testing)
+		uint32_t numCollisions = mNumCollisions;
+		mNumCollisions = 0;
+
+		for(uint32_t i = 0; i < mNumParticles; ++i)
+		    for(uint32_t j = i+1; j < mNumParticles; ++j)
+		        if (mOverlapMasks[mClothIndices[i]] & (1 << mClothIndices[j]))
+		            collideParticles(getParticle(i), getParticle(j));
+
+		static uint32_t iter = 0; ++iter;
+		if(numCollisions != mNumCollisions)
+		    printf("%u: %u != %u\n", iter, numCollisions, mNumCollisions);
+		*/
+
+		// transform back to local space
+		{
+			PX_PROFILE_ZONE("cloth::SwInterCollision::PostTransform", 0);
+
+			Simd4f toLocal[4], impulseScale;
+			// sentinel forces the transform to be (re)computed on the first particle
+			uint16_t lastCloth = uint16_t(0xffff);
+
+			for(uint32_t i = 0; i < mNumParticles; ++i)
+			{
+				uint16_t clothIndex = mClothIndices[i];
+				const SwInterCollisionData* instance = mInstances + clothIndex;
+
+				// todo: could pre-compute these inverses
+				if(clothIndex != lastCloth)
+				{
+					const PxMat44 xform(instance->mGlobalPose.getInverse());
+
+					toLocal[0] = load(reinterpret_cast<const float*>(&xform.column0));
+					toLocal[1] = load(reinterpret_cast<const float*>(&xform.column1));
+					toLocal[2] = load(reinterpret_cast<const float*>(&xform.column2));
+					toLocal[3] = load(reinterpret_cast<const float*>(&xform.column3));
+
+					impulseScale = simd4f(instance->mImpulseScale);
+
+					lastCloth = mClothIndices[i];
+				}
+
+				uint32_t particleIndex = mParticleIndices[i];
+				Simd4f& particle = reinterpret_cast<Simd4f&>(instance->mParticles[particleIndex]);
+				Simd4f& impulse = reinterpret_cast<Simd4f&>(instance->mPrevParticles[particleIndex]);
+
+				particle = transform(toLocal, particle);
+				// avoid w becoming negative due to numerical inaccuracies
+				impulse = max(sZeroW, particle - rotate(toLocal, Simd4f(impulse * impulseScale)));
+			}
+		}
+	}
+
+	// stack allocator: deallocate in reverse order of allocation
+	mAllocator.deallocate(mOverlapMasks);
+	mAllocator.deallocate(mParticleIndices);
+	mAllocator.deallocate(mClothIndices);
+}
+
+// Upper bound (bytes) on the stack-allocator memory operator() will request:
+// per-cloth broad-phase bounds and sort indices, per-particle cloth/particle
+// index arrays, per-cloth overlap masks, and the narrow-phase sort buffer.
+// NOTE(review): must be kept in sync with the allocations in operator() —
+// the original allocates the mask array with sizeof(uint32_t*) per cloth
+// while masksSize below budgets sizeof(uint32_t); verify on 64-bit targets.
+template <typename Simd4f>
+size_t cloth::SwInterCollision<Simd4f>::estimateTemporaryMemory(SwInterCollisionData* cloths, uint32_t n)
+{
+	// count total particles
+	uint32_t numParticles = 0;
+	for(uint32_t i = 0; i < n; ++i)
+		numParticles += cloths[i].mNumParticles;
+
+	// 2*n bounding boxes (cloth bounds + overlap bounds) plus sorted cloth indices
+	uint32_t boundsSize = 2 * n * sizeof(BoundingBox<Simd4f>) + n * sizeof(uint32_t);
+	uint32_t clothIndicesSize = numParticles * sizeof(uint16_t);
+	uint32_t particleIndicesSize = numParticles * sizeof(uint32_t);
+	uint32_t masksSize = n * sizeof(uint32_t);
+
+	return boundsSize + clothIndicesSize + particleIndicesSize + masksSize + getBufferSize(numParticles);
+}
+
+// Size (bytes) of the scratch buffer operator()'s narrow phase needs for
+// 'numParticles' particles: sorted indices + sorted keys + unsorted keys,
+// where the middle term also reserves radixSort's 2n+1024-entry requirement.
+// (Qualifier changed from physx::cloth:: to cloth:: for consistency with
+// every other member definition in this file, which relies on the
+// file-level 'using namespace physx'.)
+template <typename Simd4f>
+size_t cloth::SwInterCollision<Simd4f>::getBufferSize(uint32_t numParticles)
+{
+	uint32_t keysSize = numParticles * sizeof(uint32_t);
+	uint32_t indicesSize = numParticles * sizeof(uint32_t);
+	uint32_t histogramSize = 1024 * sizeof(uint32_t);
+
+	return keysSize + indicesSize + PxMax(indicesSize + histogramSize, keysSize);
+}
+
+// Applies a pairwise repulsion between the cached current particle
+// (mParticle/mImpulse, set up by collideParticles) and the particle at
+// 'index', pushing both out to mCollisionDistance scaled by mStiffness.
+// Positions and impulses are in world space here.
+template <typename Simd4f>
+void cloth::SwInterCollision<Simd4f>::collideParticle(uint32_t index)
+{
+	uint16_t clothIndex = mClothIndices[index];
+
+	// skip if the current cloth's overlap mask filters out this cloth pair
+	if((1 << clothIndex) & ~mClothMask)
+		return;
+
+	const SwInterCollisionData* instance = mInstances + clothIndex;
+
+	uint32_t particleIndex = mParticleIndices[index];
+	Simd4f& particle = reinterpret_cast<Simd4f&>(instance->mParticles[particleIndex]);
+
+	Simd4f diff = particle - mParticle;
+	Simd4f distSqr = dot3(diff, diff);
+
+#if PX_DEBUG
+	++mNumTests;
+#endif
+
+	// early out if further apart than the collision distance
+	if(allGreater(distSqr, mCollisionSquareDistance))
+		return;
+
+	// w lanes hold the per-particle collision weights
+	Simd4f w0 = splat<3>(mParticle);
+	Simd4f w1 = splat<3>(particle);
+
+	// separation delta, distributed by weight and scaled by stiffness;
+	// masked to xyz so the w lanes stay untouched
+	Simd4f ratio = mCollisionDistance * rsqrt<1>(distSqr);
+	Simd4f scale = mStiffness * recip<1>(sEpsilon + w0 + w1);
+	Simd4f delta = (scale * (diff - diff * ratio)) & sMaskXYZ;
+
+	mParticle = mParticle + delta * w0;
+	particle = particle - delta * w1;
+
+	// mirror the positional change into the impulse accumulators
+	Simd4f& impulse = reinterpret_cast<Simd4f&>(instance->mPrevParticles[particleIndex]);
+
+	mImpulse = mImpulse + delta * w0;
+	impulse = impulse - delta * w1;
+
+#if PX_DEBUG || PX_PROFILE
+	++mNumCollisions;
+#endif
+}
+
+// Narrow phase over the sorted cell keys. Keys encode the sweep-axis cell in
+// the low 16 bits and the two hash-axis cells in bits 16-23 and 24-31 (see
+// key construction in operator()). For each particle we test forward against
+// particles in the same hash column (keyOffsets[0]) and in four neighboring
+// columns (keyOffsets[1..4]) — presumably the forward half of the 3x3 cell
+// neighborhood, so each pair is visited at most once; verify against the key
+// layout if changing keyOffsets. 'collisionDistance' is the number of
+// sweep-axis cells to scan on either side.
+template <typename Simd4f>
+void cloth::SwInterCollision<Simd4f>::collideParticles(const uint32_t* keys, uint32_t firstColumnSize,
+                                                       const uint32_t* indices, uint32_t numParticles,
+                                                       uint32_t collisionDistance)
+{
+	// masks the 16-bit sweep-axis portion of a key
+	const uint32_t bucketMask = uint16_t(-1);
+
+	const uint32_t keyOffsets[] = { 0, 0x00010000, 0x00ff0000, 0x01000000, 0x01010000 };
+
+	const uint32_t* __restrict kFirst[5];
+	const uint32_t* __restrict kLast[5];
+
+	{
+		// optimization: scan forward iterator starting points once instead of 9 times
+		const uint32_t* __restrict kIt = keys;
+
+		uint32_t key = *kIt;
+		uint32_t firstKey = key - PxMin(collisionDistance, key & bucketMask);
+		uint32_t lastKey = PxMin(key + collisionDistance, key | bucketMask);
+
+		kFirst[0] = kIt;
+		while(*kIt < lastKey)
+			++kIt;
+		kLast[0] = kIt;
+
+		for(uint32_t k = 1; k < 5; ++k)
+		{
+			for(uint32_t n = firstKey + keyOffsets[k]; *kIt < n;)
+				++kIt;
+			kFirst[k] = kIt;
+
+			for(uint32_t n = lastKey + keyOffsets[k]; *kIt < n;)
+				++kIt;
+			kLast[k] = kIt;
+
+			// jump forward once to second column
+			kIt = keys + firstColumnSize;
+			firstColumnSize = 0;
+		}
+	}
+
+	const uint32_t* __restrict iIt = indices;
+	const uint32_t* __restrict iEnd = indices + numParticles;
+
+	const uint32_t* __restrict jIt;
+	const uint32_t* __restrict jEnd;
+
+	for(; iIt != iEnd; ++iIt, ++kFirst[0])
+	{
+		// load current particle once outside of inner loop
+		uint32_t index = *iIt;
+		PX_ASSERT(index < mNumParticles);
+		mClothIndex = mClothIndices[index];
+		PX_ASSERT(mClothIndex < mNumInstances);
+		mClothMask = mOverlapMasks[mClothIndex];
+
+		const SwInterCollisionData* instance = mInstances + mClothIndex;
+
+		mParticleIndex = mParticleIndices[index];
+		mParticle = reinterpret_cast<const Simd4f&>(instance->mParticles[mParticleIndex]);
+		mImpulse = reinterpret_cast<const Simd4f&>(instance->mPrevParticles[mParticleIndex]);
+
+		uint32_t key = *kFirst[0];
+
+		// range of keys we need to check against for this particle
+		// (clamped so the scan never leaves the particle's own hash column)
+		uint32_t firstKey = key - PxMin(collisionDistance, key & bucketMask);
+		uint32_t lastKey = PxMin(key + collisionDistance, key | bucketMask);
+
+		// scan forward end point
+		while(*kLast[0] < lastKey)
+			++kLast[0];
+
+		// process potential colliders of same cell; start at iIt+1 so each
+		// same-column pair is tested only once
+		jEnd = indices + (kLast[0] - keys);
+		for(jIt = iIt + 1; jIt != jEnd; ++jIt)
+			collideParticle(*jIt);
+
+		// process neighbor cells
+		for(uint32_t k = 1; k < 5; ++k)
+		{
+			// scan forward start point
+			for(uint32_t n = firstKey + keyOffsets[k]; *kFirst[k] < n;)
+				++kFirst[k];
+
+			// scan forward end point
+			for(uint32_t n = lastKey + keyOffsets[k]; *kLast[k] < n;)
+				++kLast[k];
+
+			// process potential colliders
+			jEnd = indices + (kLast[k] - keys);
+			for(jIt = indices + (kFirst[k] - keys); jIt != jEnd; ++jIt)
+				collideParticle(*jIt);
+		}
+
+		// write back particle and impulse
+		reinterpret_cast<Simd4f&>(instance->mParticles[mParticleIndex]) = mParticle;
+		reinterpret_cast<Simd4f&>(instance->mPrevParticles[mParticleIndex]) = mImpulse;
+	}
+}
+
+// explicit template instantiation for the SIMD and scalar reference code paths
+#if NV_SIMD_SIMD
+template class cloth::SwInterCollision<Simd4f>;
+#endif
+#if NV_SIMD_SCALAR
+template class cloth::SwInterCollision<Scalar4f>;
+#endif
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/SwInterCollision.h b/PhysX_3.4/Source/LowLevelCloth/src/SwInterCollision.h
new file mode 100644
index 00000000..7488f62c
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/SwInterCollision.h
@@ -0,0 +1,144 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#include "Types.h"
+#include "StackAllocator.h"
+#include "Simd.h"
+
+#include "foundation/PxMat44.h"
+#include "foundation/PxTransform.h"
+#include "foundation/PxBounds3.h"
+
+namespace physx
+{
+namespace cloth
+{
+
+class SwCloth;
+struct SwClothData;
+
+// 16-byte-aligned stack allocator supplying per-frame collision scratch memory
+typedef StackAllocator<16> SwKernelAllocator;
+
+// callback deciding whether two cloth instances (identified by their user data) may collide
+typedef bool (*InterCollisionFilter)(void* cloth0, void* cloth1);
+
+// Per-cloth-instance view handed to SwInterCollision: particle storage,
+// world transform, local-space bounds and filter user data.
+struct SwInterCollisionData
+{
+	// default constructor leaves all members uninitialized; callers must
+	// fill the struct (or use the initializing constructor) before use
+	SwInterCollisionData()
+	{
+	}
+	SwInterCollisionData(PxVec4* particles, PxVec4* prevParticles, uint32_t numParticles, uint32_t* indices,
+	                     const PxTransform& globalPose, const PxVec3& boundsCenter, const PxVec3& boundsHalfExtents,
+	                     float impulseScale, void* userData)
+	: mParticles(particles)
+	, mPrevParticles(prevParticles)
+	, mNumParticles(numParticles)
+	, mIndices(indices)
+	, mGlobalPose(globalPose)
+	, mBoundsCenter(boundsCenter)
+	, mBoundsHalfExtent(boundsHalfExtents)
+	, mImpulseScale(impulseScale)
+	, mUserData(userData)
+	{
+	}
+
+	// current positions in cloth-local space; the w lane is used as the
+	// collision weight (presumably inverse mass — see collideParticle)
+	PxVec4* mParticles;
+	// previous positions; temporarily overwritten with impulses during collision
+	PxVec4* mPrevParticles;
+	uint32_t mNumParticles;
+	// optional subset of particle indices to collide; NULL means all particles
+	uint32_t* mIndices;
+	// cloth-local to world transform
+	PxTransform mGlobalPose;
+	// local-space bounding box of the cloth
+	PxVec3 mBoundsCenter;
+	PxVec3 mBoundsHalfExtent;
+	// scale applied when converting position deltas to/from impulses
+	float mImpulseScale;
+	// opaque pointer passed to the InterCollisionFilter callback
+	void* mUserData;
+};
+
+// CPU particle-vs-particle collision between multiple cloth instances.
+// Construct with the instance array, then invoke operator() once; all scratch
+// memory is drawn from the supplied stack allocator (budget via
+// estimateTemporaryMemory). Not copyable.
+template <typename Simd4f>
+class SwInterCollision
+{
+
+  public:
+	SwInterCollision(const SwInterCollisionData* cloths, uint32_t n, float colDist, float stiffness,
+	                 uint32_t iterations, InterCollisionFilter filter, cloth::SwKernelAllocator& alloc);
+
+	~SwInterCollision();
+
+	// runs the collision iterations (broad phase, narrow phase, back-transform)
+	void operator()();
+
+	// conservative byte count of allocator memory operator() will request
+	static size_t estimateTemporaryMemory(SwInterCollisionData* cloths, uint32_t n);
+
+  private:
+	SwInterCollision& operator=(const SwInterCollision&); // not implemented
+
+	// size of the narrow-phase sort scratch buffer
+	static size_t getBufferSize(uint32_t);
+
+	void collideParticles(const uint32_t* keys, uint32_t firstColumnSize, const uint32_t* sortedIndices,
+	                      uint32_t numParticles, uint32_t collisionDistance);
+
+	// reference into the owning cloth's particle storage for collider 'index'
+	Simd4f& getParticle(uint32_t index);
+
+	// better wrap these in a struct
+	void collideParticle(uint32_t index);
+
+	Simd4f mParticle; // currently processed particle (world space), cached by collideParticles
+	Simd4f mImpulse;  // accumulated impulse of the current particle
+
+	Simd4f mCollisionDistance;       // collision radius in xyz, w = 0
+	Simd4f mCollisionSquareDistance; // squared radius for the distance test
+	Simd4f mStiffness;               // scale on the positional correction
+
+	uint16_t mClothIndex;    // cloth of the current particle
+	uint32_t mClothMask;     // overlap mask of the current cloth
+	uint32_t mParticleIndex; // particle index within the current cloth
+
+	uint32_t mNumIterations;
+
+	const SwInterCollisionData* mInstances; // not owned
+	uint32_t mNumInstances;
+
+	// scratch arrays, valid only while operator() runs
+	uint16_t* mClothIndices;
+	uint32_t* mParticleIndices;
+	uint32_t mNumParticles;  // size of the potential-collider set
+	uint32_t* mOverlapMasks; // per-cloth bit mask of overlapping cloths
+
+	uint32_t mTotalParticles; // particle count summed over all instances
+
+	InterCollisionFilter mFilter;
+
+	SwKernelAllocator& mAllocator;
+
+  public:
+	// debug/profile counters maintained by collideParticle
+	mutable uint32_t mNumTests;
+	mutable uint32_t mNumCollisions;
+};
+
+} // namespace cloth
+
+} // namespace physx
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/SwSelfCollision.cpp b/PhysX_3.4/Source/LowLevelCloth/src/SwSelfCollision.cpp
new file mode 100644
index 00000000..122de902
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/SwSelfCollision.cpp
@@ -0,0 +1,426 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#include "foundation/PxMemory.h"
+#include "SwSelfCollision.h"
+#include "SwCloth.h"
+#include "SwClothData.h"
+#include "SwCollisionHelpers.h"
+
+using namespace physx;
+
+namespace
+{
+
+// lane mask that keeps x, y, z and clears w when and-ed with a vector
+const Simd4fTupleFactory sMaskXYZ = simd4f(simd4i(~0, ~0, ~0, 0));
+
+// returns sorted indices, output needs to be at least 2*(last-first)+1024
+// LSD radix sort over 32-bit keys, 8 bits per pass (4 passes). Writes a
+// permutation of [0, n) into 'out' such that first[out[i]] is ascending.
+// Counters and indices are 16-bit, so last - first must fit in 16 bits.
+// The trailing 1024 uint16_t of 'out' hold the four 256-entry histograms.
+void radixSort(const uint32_t* first, const uint32_t* last, uint16_t* out)
+{
+	uint16_t n = uint16_t(last - first);
+
+	uint16_t* buffer = out + 2 * n;
+	uint16_t* __restrict histograms[] = { buffer, buffer + 256, buffer + 512, buffer + 768 };
+
+	PxMemZero(buffer, 1024 * sizeof(uint16_t));
+
+	// build 4 histograms in one pass (one per key byte)
+	for(const uint32_t* __restrict it = first; it != last; ++it)
+	{
+		uint32_t key = *it;
+		++histograms[0][0xff & key];
+		++histograms[1][0xff & (key >> 8)];
+		++histograms[2][0xff & (key >> 16)];
+		++histograms[3][key >> 24];
+	}
+
+	// convert histograms to offset tables in-place (exclusive prefix sums)
+	uint16_t sums[4] = {};
+	for(uint32_t i = 0; i < 256; ++i)
+	{
+		uint16_t temp0 = uint16_t(histograms[0][i] + sums[0]);
+		histograms[0][i] = sums[0];
+		sums[0] = temp0;
+
+		uint16_t temp1 = uint16_t(histograms[1][i] + sums[1]);
+		histograms[1][i] = sums[1];
+		sums[1] = temp1;
+
+		uint16_t temp2 = uint16_t(histograms[2][i] + sums[2]);
+		histograms[2][i] = sums[2];
+		sums[2] = temp2;
+
+		uint16_t temp3 = uint16_t(histograms[3][i] + sums[3]);
+		histograms[3][i] = sums[3];
+		sums[3] = temp3;
+	}
+
+	PX_ASSERT(sums[0] == n && sums[1] == n && sums[2] == n && sums[3] == n);
+
+#if PX_DEBUG
+	memset(out, 0xff, 2 * n * sizeof(uint16_t));
+#endif
+
+	// sort 8 bits per pass, ping-ponging between out (indices[0]) and
+	// out + n (indices[1]); the fourth pass leaves the result in indices[0]
+
+	uint16_t* __restrict indices[] = { out, out + n };
+
+	for(uint16_t i = 0; i != n; ++i)
+		indices[1][histograms[0][0xff & first[i]]++] = i;
+
+	for(uint16_t i = 0, index; i != n; ++i)
+	{
+		index = indices[1][i];
+		indices[0][histograms[1][0xff & (first[index] >> 8)]++] = index;
+	}
+
+	for(uint16_t i = 0, index; i != n; ++i)
+	{
+		index = indices[0][i];
+		indices[1][histograms[2][0xff & (first[index] >> 16)]++] = index;
+	}
+
+	for(uint16_t i = 0, index; i != n; ++i)
+	{
+		index = indices[1][i];
+		indices[0][histograms[3][first[index] >> 24]++] = index;
+	}
+}
+
+// returns the index (0 = x, 1 = y, 2 = z) of the largest of the first
+// three lanes of edgeLength
+template <typename Simd4f>
+uint32_t longestAxis(const Simd4f& edgeLength)
+{
+	const float* e = array(edgeLength);
+
+	if(e[0] > e[1])
+		return uint32_t(e[0] > e[2] ? 0 : 2);
+	else
+		return uint32_t(e[1] > e[2] ? 1 : 2);
+}
+
+// self collision is active only if both distance and stiffness are positive
+bool isSelfCollisionEnabled(const cloth::SwClothData& cloth)
+{
+	return PxMin(cloth.mSelfCollisionDistance, cloth.mSelfCollisionStiffness) > 0.0f;
+}
+
+// SwCloth stores stiffness in log form (see member name); the negation means
+// self collision is enabled when distance > 0 and the log-stiffness < 0
+bool isSelfCollisionEnabled(const cloth::SwCloth& cloth)
+{
+	return PxMin(cloth.mSelfCollisionDistance, -cloth.mSelfCollisionLogStiffness) > 0.0f;
+}
+
+// rounds x up to the next multiple of 2
+inline uint32_t align2(uint32_t x)
+{
+	return (x + 1) & ~1;
+}
+
+} // anonymous namespace
+
+// caches the SIMD collision constants from the cloth data; the stiffness
+// has its w lane cleared via sMaskXYZ so impulses never touch w
+template <typename Simd4f>
+cloth::SwSelfCollision<Simd4f>::SwSelfCollision(cloth::SwClothData& clothData, cloth::SwKernelAllocator& alloc)
+: mClothData(clothData), mAllocator(alloc)
+{
+	mCollisionDistance = simd4f(mClothData.mSelfCollisionDistance);
+	mCollisionSquareDistance = mCollisionDistance * mCollisionDistance;
+	mStiffness = sMaskXYZ & static_cast<Simd4f>(simd4f(mClothData.mSelfCollisionStiffness));
+}
+
+// nothing to release: scratch memory is allocated and freed inside operator()
+template <typename Simd4f>
+cloth::SwSelfCollision<Simd4f>::~SwSelfCollision()
+{
+}
+
+// runs one self-collision pass: particles are hashed to a grid whose cells
+// are at least one collision distance wide, the packed cell keys are radix
+// sorted, and only particles in the same or neighboring cells are tested
+template <typename Simd4f>
+void cloth::SwSelfCollision<Simd4f>::operator()()
+{
+	mNumTests = mNumCollisions = 0;
+
+	if(!isSelfCollisionEnabled(mClothData))
+		return;
+
+	Simd4f lowerBound = load(mClothData.mCurBounds);
+	Simd4f edgeLength = max(load(mClothData.mCurBounds + 3) - lowerBound, gSimd4fEpsilon);
+
+	// sweep along longest axis
+	uint32_t sweepAxis = longestAxis(edgeLength);
+	uint32_t hashAxis0 = (sweepAxis + 1) % 3;
+	uint32_t hashAxis1 = (sweepAxis + 2) % 3;
+
+	// reserve 0, 127, and 65535 for sentinel
+	Simd4f cellSize = max(mCollisionDistance, simd4f(1.0f / 253) * edgeLength);
+	array(cellSize)[sweepAxis] = array(edgeLength)[sweepAxis] / 65533;
+
+	Simd4f one = gSimd4fOne;
+	Simd4f gridSize = simd4f(254.0f);
+	array(gridSize)[sweepAxis] = 65534.0f;
+
+	Simd4f gridScale = recip<1>(cellSize);
+	Simd4f gridBias = -lowerBound * gridScale + one;
+
+	uint32_t numIndices = mClothData.mNumSelfCollisionIndices;
+	void* buffer = mAllocator.allocate(getBufferSize(numIndices));
+
+	// null indices means every particle participates (index == i below)
+	const uint32_t* __restrict indices = mClothData.mSelfCollisionIndices;
+	uint32_t* __restrict keys = reinterpret_cast<uint32_t*>(buffer);
+	uint16_t* __restrict sortedIndices = reinterpret_cast<uint16_t*>(keys + numIndices);
+	uint32_t* __restrict sortedKeys = reinterpret_cast<uint32_t*>(sortedIndices + align2(numIndices));
+
+	const Simd4f* particles = reinterpret_cast<const Simd4f*>(mClothData.mCurParticles);
+
+	// create keys
+	for(uint32_t i = 0; i < numIndices; ++i)
+	{
+		uint32_t index = indices ? indices[i] : i;
+
+		// grid coordinate
+		Simd4f keyf = particles[index] * gridScale + gridBias;
+
+		// need to clamp index because shape collision potentially
+		// pushes particles outside of their original bounds
+		Simd4i keyi = intFloor(max(one, min(keyf, gridSize)));
+
+		// pack: sweep-axis cell in low 16 bits, hash axes in bits 16-23/24-31
+		const int32_t* ptr = array(keyi);
+		keys[i] = uint32_t(ptr[sweepAxis] | (ptr[hashAxis0] << 16) | (ptr[hashAxis1] << 24));
+	}
+
+	// compute sorted keys indices
+	radixSort(keys, keys + numIndices, sortedIndices);
+
+	// snoop histogram: offset of first index with 8 msb > 1 (0 is sentinel)
+	uint16_t firstColumnSize = sortedIndices[2 * numIndices + 769];
+
+	// sort keys
+	for(uint32_t i = 0; i < numIndices; ++i)
+		sortedKeys[i] = keys[sortedIndices[i]];
+	sortedKeys[numIndices] = uint32_t(-1); // sentinel
+
+	if(indices)
+	{
+		// sort indices (into no-longer-needed keys array)
+		const uint16_t* __restrict permutation = sortedIndices;
+		sortedIndices = reinterpret_cast<uint16_t*>(keys);
+		for(uint32_t i = 0; i < numIndices; ++i)
+			sortedIndices[i] = uint16_t(indices[permutation[i]]);
+	}
+
+	// calculate the number of buckets we need to search forward
+	const Simd4i data = intFloor(gridScale * mCollisionDistance);
+	uint32_t collisionDistance = 2 + static_cast<uint32_t>(array(data)[sweepAxis]);
+
+	// collide particles
+	if(mClothData.mRestPositions)
+		collideParticles<true>(sortedKeys, firstColumnSize, sortedIndices, collisionDistance);
+	else
+		collideParticles<false>(sortedKeys, firstColumnSize, sortedIndices, collisionDistance);
+
+	mAllocator.deallocate(buffer);
+
+	// verify against brute force (disable collision response when testing)
+	/*
+	uint32_t numCollisions = mNumCollisions;
+	mNumCollisions = 0;
+
+	Simd4f* qarticles = reinterpret_cast<
+	Simd4f*>(mClothData.mCurParticles);
+	for(uint32_t i = 0; i < numIndices; ++i)
+	{
+	    uint32_t indexI = indices ? indices[i] : i;
+	    for(uint32_t j = i+1; j < numIndices; ++j)
+	    {
+	        uint32_t indexJ = indices ? indices[j] : j;
+	        collideParticles(qarticles[indexI], qarticles[indexJ]);
+	    }
+	}
+
+	static uint32_t iter = 0; ++iter;
+	if(numCollisions != mNumCollisions)
+	    printf("%u: %u != %u\n", iter, numCollisions, mNumCollisions);
+	*/
+}
+
+// scratch requirement of operator(); 0 when self collision is disabled.
+// without explicit self-collision indices every particle participates
+template <typename Simd4f>
+size_t cloth::SwSelfCollision<Simd4f>::estimateTemporaryMemory(const SwCloth& cloth)
+{
+	uint32_t numIndices =
+	    cloth.mSelfCollisionIndices.empty() ? cloth.mCurParticles.size() : cloth.mSelfCollisionIndices.size();
+	return isSelfCollisionEnabled(cloth) ? getBufferSize(numIndices) : 0;
+}
+
+// scratch layout used by operator(): [keys: n uint32] [sorted indices:
+// align2(n) uint16] [third region: the larger of the radix-sort overflow
+// (radixSort needs 2n+1024 uint16 total from the sorted-indices pointer)
+// and the n+1 sorted keys incl. sentinel, which reuse this space]
+// NOTE(review): qualified physx::cloth:: here vs. cloth:: elsewhere in this
+// file — cosmetic inconsistency only, both resolve to the same entity
+template <typename Simd4f>
+size_t physx::cloth::SwSelfCollision<Simd4f>::getBufferSize(uint32_t numIndices)
+{
+	uint32_t keysSize = numIndices * sizeof(uint32_t);
+	uint32_t indicesSize = align2(numIndices) * sizeof(uint16_t);
+	uint32_t radixSize = (numIndices + 1024) * sizeof(uint16_t);
+	return keysSize + indicesSize + PxMax(radixSize, keysSize + uint32_t(sizeof(uint32_t)));
+}
+
+// narrow phase: tests one particle pair and, if they are closer than the
+// collision distance (and not already close in the rest configuration,
+// when enabled), pushes them apart along the separation vector; the w
+// lanes of pos0/pos1 weight how far each particle moves
+template <typename Simd4f>
+template <bool useRestParticles>
+void cloth::SwSelfCollision<Simd4f>::collideParticles(Simd4f& pos0, Simd4f& pos1, const Simd4f& pos0rest,
+                                                      const Simd4f& pos1rest)
+{
+	Simd4f diff = pos1 - pos0;
+	Simd4f distSqr = dot3(diff, diff);
+
+#if PX_DEBUG
+	++mNumTests;
+#endif
+
+	if(allGreater(distSqr, mCollisionSquareDistance))
+		return;
+
+	if(useRestParticles)
+	{
+		// calculate distance in rest configuration, if less than collision
+		// distance then ignore collision between particles in deformed config
+		Simd4f restDiff = pos1rest - pos0rest;
+		Simd4f restDistSqr = dot3(restDiff, restDiff);
+
+		if(allGreater(mCollisionSquareDistance, restDistSqr))
+			return;
+	}
+
+	// per-particle weights taken from the w components
+	Simd4f w0 = splat<3>(pos0);
+	Simd4f w1 = splat<3>(pos1);
+
+	// delta = stiffness * (1 - collisionDistance/|diff|) * diff / (w0 + w1),
+	// with the w lane masked off so only x, y, z are displaced
+	Simd4f ratio = mCollisionDistance * rsqrt(distSqr);
+	Simd4f scale = mStiffness * recip(gSimd4fEpsilon + w0 + w1);
+	Simd4f delta = (scale * (diff - diff * ratio)) & sMaskXYZ;
+
+	pos0 = pos0 + delta * w0;
+	pos1 = pos1 - delta * w1;
+
+#if PX_DEBUG || PX_PROFILE
+	++mNumCollisions;
+#endif
+}
+
+// broad phase: walks particles in sorted key order; for each one, tests
+// against subsequent particles in its own cell column and in the four
+// forward neighbor columns, advancing all five [kFirst, kLast) windows
+// monotonically so every key is scanned at most once per window
+template <typename Simd4f>
+template <bool useRestParticles>
+void cloth::SwSelfCollision<Simd4f>::collideParticles(const uint32_t* keys, uint16_t firstColumnSize,
+                                                      const uint16_t* indices, uint32_t collisionDistance)
+{
+	Simd4f* __restrict particles = reinterpret_cast<Simd4f*>(mClothData.mCurParticles);
+	Simd4f* __restrict restParticles =
+	    useRestParticles ? reinterpret_cast<Simd4f*>(mClothData.mRestPositions) : particles;
+
+	// low 16 bits of a key hold the sweep-axis cell
+	const uint32_t bucketMask = uint16_t(-1);
+
+	// key deltas of the neighbor columns: (+1,0), (-1,+1), (0,+1), (+1,+1)
+	// in (hashAxis0, hashAxis1) space; entry 0 is the particle's own column
+	const uint32_t keyOffsets[] = { 0, 0x00010000, 0x00ff0000, 0x01000000, 0x01010000 };
+
+	const uint32_t* __restrict kFirst[5];
+	const uint32_t* __restrict kLast[5];
+
+	{
+		// optimization: scan forward iterator starting points once instead of 9 times
+		const uint32_t* __restrict kIt = keys;
+
+		uint32_t key = *kIt;
+		uint32_t firstKey = key - PxMin(collisionDistance, key & bucketMask);
+		uint32_t lastKey = PxMin(key + collisionDistance, key | bucketMask);
+
+		// the scans below rely on the uint32_t(-1) sentinel after the last key
+		kFirst[0] = kIt;
+		while(*kIt < lastKey)
+			++kIt;
+		kLast[0] = kIt;
+
+		for(uint32_t k = 1; k < 5; ++k)
+		{
+			for(uint32_t n = firstKey + keyOffsets[k]; *kIt < n;)
+				++kIt;
+			kFirst[k] = kIt;
+
+			for(uint32_t n = lastKey + keyOffsets[k]; *kIt < n;)
+				++kIt;
+			kLast[k] = kIt;
+
+			// jump forward once to second column
+			kIt = keys + firstColumnSize;
+			firstColumnSize = 0;
+		}
+	}
+
+	const uint16_t* __restrict iIt = indices;
+	const uint16_t* __restrict iEnd = indices + mClothData.mNumSelfCollisionIndices;
+
+	const uint16_t* __restrict jIt;
+	const uint16_t* __restrict jEnd;
+
+	for(; iIt != iEnd; ++iIt, ++kFirst[0])
+	{
+		PX_ASSERT(*iIt < mClothData.mNumParticles);
+
+		// load current particle once outside of inner loop
+		Simd4f particle = particles[*iIt];
+		Simd4f restParticle = restParticles[*iIt];
+
+		uint32_t key = *kFirst[0];
+
+		// range of keys we need to check against for this particle
+		uint32_t firstKey = key - PxMin(collisionDistance, key & bucketMask);
+		uint32_t lastKey = PxMin(key + collisionDistance, key | bucketMask);
+
+		// scan forward end point
+		while(*kLast[0] < lastKey)
+			++kLast[0];
+
+		// process potential colliders of same cell; starting at iIt + 1
+		// ensures each pair is tested only once
+		jEnd = indices + (kLast[0] - keys);
+		for(jIt = iIt + 1; jIt != jEnd; ++jIt)
+			collideParticles<useRestParticles>(particle, particles[*jIt], restParticle, restParticles[*jIt]);
+
+		// process neighbor cells
+		for(uint32_t k = 1; k < 5; ++k)
+		{
+			// scan forward start point
+			for(uint32_t n = firstKey + keyOffsets[k]; *kFirst[k] < n;)
+				++kFirst[k];
+
+			// scan forward end point
+			for(uint32_t n = lastKey + keyOffsets[k]; *kLast[k] < n;)
+				++kLast[k];
+
+			// process potential colliders
+			jEnd = indices + (kLast[k] - keys);
+			for(jIt = indices + (kFirst[k] - keys); jIt != jEnd; ++jIt)
+				collideParticles<useRestParticles>(particle, particles[*jIt], restParticle, restParticles[*jIt]);
+		}
+
+		// store current particle (the pair routine updated the local copy)
+		particles[*iIt] = particle;
+	}
+}
+
+// explicit template instantiation for the enabled particle vector types
+#if NV_SIMD_SIMD
+template class cloth::SwSelfCollision<Simd4f>;
+#endif
+#if NV_SIMD_SCALAR
+template class cloth::SwSelfCollision<Scalar4f>;
+#endif
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/SwSelfCollision.h b/PhysX_3.4/Source/LowLevelCloth/src/SwSelfCollision.h
new file mode 100644
index 00000000..eabeb1ee
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/SwSelfCollision.h
@@ -0,0 +1,83 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#include "Types.h"
+#include "StackAllocator.h"
+#include "Simd.h"
+
+namespace physx
+{
+namespace cloth
+{
+
+class SwCloth;
+struct SwClothData;
+
+typedef StackAllocator<16> SwKernelAllocator;
+
+// Self-collision (particle vs. particle within one cloth) kernel, templated
+// on the SIMD vector type; implemented in SwSelfCollision.cpp
+template <typename Simd4f>
+class SwSelfCollision
+{
+	typedef typename Simd4fToSimd4i<Simd4f>::Type Simd4i;
+
+  public:
+	SwSelfCollision(SwClothData& clothData, SwKernelAllocator& alloc);
+	~SwSelfCollision();
+
+	// runs one self-collision pass over the cloth's particles
+	void operator()();
+
+	// upper bound on the scratch memory operator() needs for this cloth
+	static size_t estimateTemporaryMemory(const SwCloth&);
+
+  private:
+	SwSelfCollision& operator=(const SwSelfCollision&); // not implemented
+	static size_t getBufferSize(uint32_t);
+
+	// narrow phase: collide one particle pair (optionally filtered by
+	// rest-configuration distance)
+	template <bool useRestParticles>
+	void collideParticles(Simd4f&, Simd4f&, const Simd4f&, const Simd4f&);
+
+	// broad phase: sweep the sorted cell keys and call the pair version
+	template <bool useRestParticles>
+	void collideParticles(const uint32_t*, uint16_t, const uint16_t*, uint32_t);
+
+	Simd4f mCollisionDistance;
+	Simd4f mCollisionSquareDistance;
+	Simd4f mStiffness;
+
+	SwClothData& mClothData;
+	SwKernelAllocator& mAllocator;
+
+  public:
+	// test/collision counters for debugging and profiling
+	mutable uint32_t mNumTests;
+	mutable uint32_t mNumCollisions;
+};
+
+} // namespace cloth
+
+} // namespace physx
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/SwSolver.cpp b/PhysX_3.4/Source/LowLevelCloth/src/SwSolver.cpp
new file mode 100644
index 00000000..65a4b6c6
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/SwSolver.cpp
@@ -0,0 +1,294 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#include "foundation/PxProfiler.h"
+#include "SwSolver.h"
+#include "SwCloth.h"
+#include "ClothImpl.h"
+#include "SwFabric.h"
+#include "SwFactory.h"
+#include "SwClothData.h"
+#include "SwSolverKernel.h"
+#include "SwInterCollision.h"
+#include "PsFPU.h"
+#include "PsFoundation.h"
+#include "PsSort.h"
+
+namespace physx
+{
+namespace cloth
+{
+bool neonSolverKernel(SwCloth const&, SwClothData&, SwKernelAllocator&, IterationStateFactory&);
+}
+}
+
+using namespace physx;
+
+#if NV_SIMD_SIMD
+typedef Simd4f Simd4fType;
+#else
+typedef Scalar4f Simd4fType;
+#endif
+
+// the task manager argument is unused by the CPU solver.
+// NOTE(review): mInterCollisionFilter is not initialized here; it must be
+// set via setInterCollisionFilter() before inter-collision is enabled
+cloth::SwSolver::SwSolver(physx::PxTaskManager* taskMgr)
+: mInterCollisionDistance(0.0f)
+, mInterCollisionStiffness(1.0f)
+, mInterCollisionIterations(1)
+, mInterCollisionScratchMem(NULL)
+, mInterCollisionScratchMemSize(0)
+{
+	// the frame tasks are long-lived and always dispatch back to this solver
+	mStartSimulationTask.mSolver = this;
+	mEndSimulationTask.mSolver = this;
+
+	PX_UNUSED(taskMgr);
+}
+
+// all cloths must have been removed (removeCloth) before destruction
+cloth::SwSolver::~SwSolver()
+{
+	if(mInterCollisionScratchMem)
+		PX_FREE(mInterCollisionScratchMem);
+
+	PX_ASSERT(mCpuClothSimulationTasks.empty());
+}
+
+namespace
+{
+// orders tasks by particle count, largest first (presumably so the most
+// expensive cloths are scheduled earliest)
+template <typename T>
+bool clothSizeGreater(const T& t0, const T& t1)
+{
+	return t0.mCloth->mCurParticles.size() > t1.mCloth->mCurParticles.size();
+}
+
+// sorts the task array by descending cloth size
+template <typename T>
+void sortTasks(shdfnd::Array<T, physx::shdfnd::NonTrackingAllocator>& tasks)
+{
+	shdfnd::sort(tasks.begin(), tasks.size(), &clothSizeGreater<T>);
+}
+}
+
+// registers a cloth and keeps the task list sorted by descending size
+void cloth::SwSolver::addCloth(Cloth* cloth)
+{
+	SwCloth& swCloth = static_cast<SwClothImpl&>(*cloth).mCloth;
+
+	mCpuClothSimulationTasks.pushBack(CpuClothSimulationTask(swCloth, mEndSimulationTask));
+
+	sortTasks(mCpuClothSimulationTasks);
+}
+
+// unregisters a cloth: frees its task's scratch buffer, removes the task
+// and re-sorts; a cloth that was never added is silently ignored
+void cloth::SwSolver::removeCloth(Cloth* cloth)
+{
+	SwCloth& swCloth = static_cast<SwClothImpl&>(*cloth).mCloth;
+
+	CpuClothSimulationTaskVector::Iterator tIt = mCpuClothSimulationTasks.begin();
+	CpuClothSimulationTaskVector::Iterator tEnd = mCpuClothSimulationTasks.end();
+	while(tIt != tEnd && tIt->mCloth != &swCloth)
+		++tIt;
+
+	if(tIt != tEnd)
+	{
+		deallocate(tIt->mScratchMemory);
+		mCpuClothSimulationTasks.replaceWithLast(tIt);
+		sortTasks(mCpuClothSimulationTasks);
+	}
+}
+
+// schedules one frame: the start task fans out per-cloth tasks and the end
+// task runs inter-collision before 'continuation'. With no cloths, the
+// continuation itself is returned so the caller still has a task to run.
+physx::PxBaseTask& cloth::SwSolver::simulate(float dt, physx::PxBaseTask& continuation)
+{
+	if(mCpuClothSimulationTasks.empty())
+	{
+		continuation.addReference();
+		return continuation;
+	}
+
+	mEndSimulationTask.setContinuation(&continuation);
+	mEndSimulationTask.mDt = dt;
+
+	mStartSimulationTask.setContinuation(&mEndSimulationTask);
+
+	// balance the reference taken when the start task registered the end
+	// task as its continuation; the cloth tasks keep it alive until done
+	mEndSimulationTask.removeReference();
+
+	return mStartSimulationTask;
+}
+
+// runs cloth-vs-cloth collision after all per-cloth simulation has finished;
+// no-op unless both an iteration count and a collision distance are set
+void cloth::SwSolver::interCollision()
+{
+	if(!mInterCollisionIterations || mInterCollisionDistance == 0.0f)
+		return;
+
+	float elasticity = 1.0f;
+
+	// rebuild cloth instance array
+	mInterCollisionInstances.resize(0);
+	for(uint32_t i = 0; i < mCpuClothSimulationTasks.size(); ++i)
+	{
+		SwCloth* c = mCpuClothSimulationTasks[i].mCloth;
+		float invNumIterations = mCpuClothSimulationTasks[i].mInvNumIterations;
+
+		// cloths without explicit self-collision indices expose all of
+		// their particles to inter-collision
+		mInterCollisionInstances.pushBack(SwInterCollisionData(
+		    c->mCurParticles.begin(), c->mPrevParticles.begin(),
+		    c->mSelfCollisionIndices.empty() ? c->mCurParticles.size() : c->mSelfCollisionIndices.size(),
+		    c->mSelfCollisionIndices.empty() ? NULL : &c->mSelfCollisionIndices[0], c->mTargetMotion,
+		    c->mParticleBoundsCenter, c->mParticleBoundsHalfExtent, elasticity * invNumIterations, c->mUserData));
+	}
+
+	const uint32_t requiredTempMemorySize = uint32_t(SwInterCollision<Simd4fType>::estimateTemporaryMemory(
+	    &mInterCollisionInstances[0], mInterCollisionInstances.size()));
+
+	// realloc temp memory if necessary (the buffer only ever grows)
+	if(mInterCollisionScratchMemSize < requiredTempMemorySize)
+	{
+		if(mInterCollisionScratchMem)
+			PX_FREE(mInterCollisionScratchMem);
+
+		mInterCollisionScratchMem = PX_ALLOC(requiredTempMemorySize, "cloth::SwSolver::mInterCollisionScratchMem");
+		mInterCollisionScratchMemSize = requiredTempMemorySize;
+	}
+
+	SwKernelAllocator allocator(mInterCollisionScratchMem, mInterCollisionScratchMemSize);
+
+	// run inter-collision
+	SwInterCollision<Simd4fType> collider(mInterCollisionInstances.begin(), mInterCollisionInstances.size(),
+	                                      mInterCollisionDistance, mInterCollisionStiffness, mInterCollisionIterations,
+	                                      mInterCollisionFilter, allocator);
+
+	collider();
+}
+
+// opens the cross-thread profiler zone spanning the simulation frame
+void cloth::SwSolver::beginFrame() const
+{
+	PX_PROFILE_START_CROSSTHREAD("cloth::SwSolver::simulate", 0);
+}
+
+// closes the profiler zone opened in beginFrame()
+void cloth::SwSolver::endFrame() const
+{
+	PX_PROFILE_STOP_CROSSTHREAD("cloth::SwSolver::simulate", 0);
+}
+
+// frame start: begins the profile frame and kicks off one simulation task
+// per cloth that is awake; sleeping cloths are skipped for this frame
+void cloth::SwSolver::StartSimulationTask::runInternal()
+{
+	mSolver->beginFrame();
+
+	CpuClothSimulationTaskVector::Iterator tIt = mSolver->mCpuClothSimulationTasks.begin();
+	CpuClothSimulationTaskVector::Iterator tEnd = mSolver->mCpuClothSimulationTasks.end();
+
+	for(; tIt != tEnd; ++tIt)
+	{
+		if(!tIt->mCloth->isSleeping())
+		{
+			tIt->setContinuation(mCont);
+			tIt->removeReference();
+		}
+	}
+}
+
+const char* cloth::SwSolver::StartSimulationTask::getName() const
+{
+	return "cloth.SwSolver.startSimulation";
+}
+
+// frame end: all cloth tasks have completed; resolve cloth-vs-cloth
+// collision and close the profile frame
+void cloth::SwSolver::EndSimulationTask::runInternal()
+{
+	mSolver->interCollision();
+	mSolver->endFrame();
+}
+
+const char* cloth::SwSolver::EndSimulationTask::getName() const
+{
+	return "cloth.SwSolver.endSimulation";
+}
+
+// binds the task to its cloth and to the shared end-of-frame continuation;
+// scratch memory starts empty and is grown on first run
+cloth::SwSolver::CpuClothSimulationTask::CpuClothSimulationTask(SwCloth& cloth, EndSimulationTask& continuation)
+: mCloth(&cloth), mContinuation(&continuation), mScratchMemorySize(0), mScratchMemory(0), mInvNumIterations(0.0f)
+{
+}
+
+// simulates one cloth: grows the scratch buffer if the required size
+// increased, then runs the SIMD solver kernel; a zero dt skips simulation
+void cloth::SwSolver::CpuClothSimulationTask::runInternal()
+{
+	// check if we need to reallocate the temp memory buffer
+	// (number of shapes may have changed)
+	uint32_t requiredTempMemorySize = uint32_t(SwSolverKernel<Simd4fType>::estimateTemporaryMemory(*mCloth));
+
+	if(mScratchMemorySize < requiredTempMemorySize)
+	{
+		deallocate(mScratchMemory);
+
+		mScratchMemory = allocate(requiredTempMemorySize);
+		mScratchMemorySize = requiredTempMemorySize;
+	}
+
+	if(mContinuation->mDt == 0.0f)
+		return;
+
+	IterationStateFactory factory(*mCloth, mContinuation->mDt);
+	mInvNumIterations = factory.mInvNumIterations; // saved for inter-collision impulse scaling
+
+	// scoped guard for the FPU/SIMD control state (see PsFPU.h)
+	shdfnd::SIMDGuard simdGuard;
+
+	SwClothData data(*mCloth, mCloth->mFabric);
+	SwKernelAllocator allocator(mScratchMemory, uint32_t(mScratchMemorySize));
+
+// construct kernel functor and execute
+// NOTE(review): the NEON path is disabled; Android falls through to the
+// generic kernel below
+#if PX_ANDROID
+// if(!neonSolverKernel(cloth, data, allocator, factory))
+#endif
+	SwSolverKernel<Simd4fType>(*mCloth, data, allocator, factory)();
+
+	data.reconcile(*mCloth); // update cloth
+}
+
+const char* cloth::SwSolver::CpuClothSimulationTask::getName() const
+{
+	return "cloth.SwSolver.cpuClothSimulation";
+}
+
+// per-frame cleanup when the task is released: pops the consumed motion/
+// separation constraints, promotes 'target' collision shapes to 'start'
+// shapes for the next frame, then releases the end-of-frame task
+void cloth::SwSolver::CpuClothSimulationTask::release()
+{
+	mCloth->mMotionConstraints.pop();
+	mCloth->mSeparationConstraints.pop();
+
+	if(!mCloth->mTargetCollisionSpheres.empty())
+	{
+		swap(mCloth->mStartCollisionSpheres, mCloth->mTargetCollisionSpheres);
+		mCloth->mTargetCollisionSpheres.resize(0);
+	}
+
+	if(!mCloth->mTargetCollisionPlanes.empty())
+	{
+		swap(mCloth->mStartCollisionPlanes, mCloth->mTargetCollisionPlanes);
+		mCloth->mTargetCollisionPlanes.resize(0);
+	}
+
+	if(!mCloth->mTargetCollisionTriangles.empty())
+	{
+		swap(mCloth->mStartCollisionTriangles, mCloth->mTargetCollisionTriangles);
+		mCloth->mTargetCollisionTriangles.resize(0);
+	}
+
+	mContinuation->removeReference();
+}
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/SwSolver.h b/PhysX_3.4/Source/LowLevelCloth/src/SwSolver.h
new file mode 100644
index 00000000..5e1fe975
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/SwSolver.h
@@ -0,0 +1,153 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#include "Solver.h"
+#include "Allocator.h"
+#include "SwInterCollision.h"
+#include "CmTask.h"
+
+namespace physx
+{
+
+namespace cloth
+{
+
+class SwCloth;
+class SwFactory;
+
+/// CPU/SSE based cloth solver
+class SwSolver : public UserAllocated, public Solver
+{
+	// frame-start task: fans out one CpuClothSimulationTask per awake cloth
+	struct StartSimulationTask : public Cm::Task
+	{
+		using physx::PxLightCpuTask::mRefCount;
+		using physx::PxLightCpuTask::mTm;
+
+		virtual void runInternal();
+		virtual const char* getName() const;
+		SwSolver* mSolver;
+	};
+
+	// frame-end task: runs inter-collision, then ends the profile frame
+	struct EndSimulationTask : public Cm::Task
+	{
+		using physx::PxLightCpuTask::mRefCount;
+
+		virtual void runInternal();
+		virtual const char* getName() const;
+		SwSolver* mSolver;
+		float mDt; // frame delta time, set by simulate()
+	};
+
+	// simulates a single cloth on a worker thread
+	struct CpuClothSimulationTask : public Cm::Task
+	{
+		CpuClothSimulationTask(SwCloth&, EndSimulationTask&);
+		virtual void runInternal();
+		virtual const char* getName() const;
+		virtual void release();
+
+		SwCloth* mCloth;
+		EndSimulationTask* mContinuation;
+		uint32_t mScratchMemorySize;
+		void* mScratchMemory;
+		float mInvNumIterations; // reciprocal of the last frame's iteration count
+	};
+
+  public:
+	SwSolver(physx::PxTaskManager*);
+	virtual ~SwSolver();
+
+	virtual void addCloth(Cloth*);
+	virtual void removeCloth(Cloth*);
+
+	// schedules one frame of simulation; returns the task to submit/wait on
+	virtual physx::PxBaseTask& simulate(float dt, physx::PxBaseTask&);
+
+	virtual void setInterCollisionDistance(float distance)
+	{
+		mInterCollisionDistance = distance;
+	}
+	virtual float getInterCollisionDistance() const
+	{
+		return mInterCollisionDistance;
+	}
+
+	virtual void setInterCollisionStiffness(float stiffness)
+	{
+		mInterCollisionStiffness = stiffness;
+	}
+	virtual float getInterCollisionStiffness() const
+	{
+		return mInterCollisionStiffness;
+	}
+
+	virtual void setInterCollisionNbIterations(uint32_t nbIterations)
+	{
+		mInterCollisionIterations = nbIterations;
+	}
+	virtual uint32_t getInterCollisionNbIterations() const
+	{
+		return mInterCollisionIterations;
+	}
+
+	virtual void setInterCollisionFilter(InterCollisionFilter filter)
+	{
+		mInterCollisionFilter = filter;
+	}
+
+	// the CPU solver has no deferred failure modes to report
+	virtual bool hasError() const
+	{
+		return false;
+	}
+
+  private:
+	void beginFrame() const;
+	void endFrame() const;
+
+	// cloth-vs-cloth collision, run from EndSimulationTask
+	void interCollision();
+
+  private:
+	StartSimulationTask mStartSimulationTask;
+
+	// one task per registered cloth, sorted by descending particle count
+	typedef Vector<CpuClothSimulationTask>::Type CpuClothSimulationTaskVector;
+	CpuClothSimulationTaskVector mCpuClothSimulationTasks;
+
+	EndSimulationTask mEndSimulationTask;
+
+	float mInterCollisionDistance;
+	float mInterCollisionStiffness;
+	uint32_t mInterCollisionIterations;
+	InterCollisionFilter mInterCollisionFilter;
+
+	// scratch buffer for inter-collision; grown on demand, freed in dtor
+	void* mInterCollisionScratchMem;
+	uint32_t mInterCollisionScratchMemSize;
+	shdfnd::Array<SwInterCollisionData> mInterCollisionInstances;
+};
+}
+}
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/SwSolverKernel.cpp b/PhysX_3.4/Source/LowLevelCloth/src/SwSolverKernel.cpp
new file mode 100644
index 00000000..bf5d86a1
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/SwSolverKernel.cpp
@@ -0,0 +1,781 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#include "foundation/PxProfiler.h"
+#include "SwSolverKernel.h"
+#include "SwCloth.h"
+#include "SwClothData.h"
+#include "SwFabric.h"
+#include "SwFactory.h"
+#include "PointInterpolator.h"
+#include "BoundingBox.h"
+
+#define PX_AVX (NV_SIMD_SIMD&&(PX_WIN32 || PX_WIN64) && PX_VC >= 10)
+
+#if PX_AVX
+namespace avx
+{
+// defined in SwSolveConstraints.cpp
+
+void initialize();
+
+template <bool, uint32_t>
+void solveConstraints(float* __restrict, const float* __restrict, const float* __restrict, const uint16_t* __restrict,
+ const __m128&);
+}
+
+namespace
+{
+// Runtime CPU/OS capability probe used to pick the constraint-solver kernel.
+// Returns 0 = no usable AVX, 1 = AVX, 2 = AVX + FMA3. Called once to
+// initialize sAvxSupport during static initialization (below).
+uint32_t getAvxSupport()
+{
+// Checking for AVX requires 3 things:
+// 1) CPUID indicates that the OS uses XSAVE and XRSTORE
+// 2) CPUID indicates support for AVX
+// 3) XGETBV indicates registers are saved and restored on context switch
+
+#if _MSC_FULL_VER < 160040219 || !defined(_XCR_XFEATURE_ENABLED_MASK)
+ // need at least VC10 SP1 and compile on at least Win7 SP1
+ return 0;
+#else
+ int cpuInfo[4];
+ __cpuid(cpuInfo, 1);
+ // bits 27 (OSXSAVE) and 28 (AVX) of CPUID.1:ECX must both be set
+ int avxFlags = 3 << 27; // checking 1) and 2) above
+ if((cpuInfo[2] & avxFlags) != avxFlags)
+ return 0; // xgetbv not enabled or no AVX support
+
+ // XCR0 bits 1 (SSE state) and 2 (AVX/YMM state) must both be OS-enabled
+ if((_xgetbv(_XCR_XFEATURE_ENABLED_MASK) & 0x6) != 0x6)
+ return 0; // OS does not save YMM registers
+
+ avx::initialize();
+
+#if _MSC_VER < 1700
+ return 1;
+#else
+ // CPUID.1:ECX bit 12 = FMA3
+ int fmaFlags = 1 << 12;
+ if((cpuInfo[2] & fmaFlags) != fmaFlags)
+ return 1; // no FMA3 support
+
+ /* only using fma at the moment, don't lock out AMD's piledriver by requiring avx2
+ __cpuid(cpuInfo, 7);
+ int avx2Flags = 1 << 5;
+ if((cpuInfo[1] & avx2Flags) != avx2Flags)
+ return 1; // no AVX2 support
+ */
+
+ return 2;
+#endif // _MSC_VER
+#endif // _MSC_FULL_VER
+}
+
+const uint32_t sAvxSupport = getAvxSupport(); // 0: no AVX, 1: AVX, 2: AVX+FMA
+}
+#endif
+
+using namespace physx;
+
+namespace
+{
+/* simd constants */
+// Lane-select masks (all-ones in the named lanes, zero elsewhere) and xyzw
+// literals shared by the solver kernels below. 'sMaskXYZ & v' zeroes v.w, etc.
+
+const Simd4fTupleFactory sMaskW = simd4f(simd4i(0, 0, 0, ~0));
+const Simd4fTupleFactory sMaskXY = simd4f(simd4i(~0, ~0, 0, 0));
+const Simd4fTupleFactory sMaskXYZ = simd4f(simd4i(~0, ~0, ~0, 0));
+const Simd4fTupleFactory sMaskYZW = simd4f(simd4i(0, ~0, ~0, ~0));
+const Simd4fTupleFactory sMinusOneXYZOneW = simd4f(-1.0f, -1.0f, -1.0f, 1.0f);
+const Simd4fTupleFactory sFloatMaxW = simd4f(0.0f, 0.0f, 0.0f, FLT_MAX);
+const Simd4fTupleFactory sMinusFloatMaxXYZ = simd4f(-FLT_MAX, -FLT_MAX, -FLT_MAX, 0.0f);
+
+/* static worker functions */
+
+/**
+ This function performs explicit Euler integration based on position, where
+ x_next = x_cur + (x_cur - x_prev) * dt_cur/dt_prev * damping + g * dt * dt
+ The g * dt * dt term is folded into accelIt.
+ */
+
+// Non-turning variant: uniform damping 'scale' applied to the position delta.
+// Particles with invMass == 0 (previous.w <= FLT_MAX test fails) are left fixed.
+template <typename Simd4f, typename AccelerationIterator>
+void integrateParticles(Simd4f* __restrict curIt, Simd4f* __restrict curEnd, Simd4f* __restrict prevIt,
+ const Simd4f& scale, const AccelerationIterator& aIt, const Simd4f& prevBias)
+{
+ // local copy to avoid LHS
+ AccelerationIterator accelIt(aIt);
+
+ for(; curIt != curEnd; ++curIt, ++prevIt, ++accelIt)
+ {
+ Simd4f current = *curIt;
+ Simd4f previous = *prevIt;
+ // if(current.w == 0) current.w = previous.w
+ current = select(current > sMinusFloatMaxXYZ, current, previous);
+ Simd4f finiteMass = splat<3>(previous) > sFloatMaxW;
+ Simd4f delta = (current - previous) * scale + *accelIt;
+ *curIt = current + (delta & finiteMass);
+ // prev.xyz <- current.xyz (w kept from previous), plus per-iteration bias
+ *prevIt = select(sMaskW, previous, current) + (prevBias & finiteMass);
+ }
+}
+
+// Turning variant: the frame rotates during the step, so previous and current
+// positions are each transformed by a 3x3 matrix before integration.
+template <typename Simd4f, typename AccelerationIterator>
+void integrateParticles(Simd4f* __restrict curIt, Simd4f* __restrict curEnd, Simd4f* __restrict prevIt,
+ const Simd4f (&prevMatrix)[3], const Simd4f (&curMatrix)[3], const AccelerationIterator& aIt,
+ const Simd4f& prevBias)
+{
+ // local copy to avoid LHS
+ AccelerationIterator accelIt(aIt);
+
+ for(; curIt != curEnd; ++curIt, ++prevIt, ++accelIt)
+ {
+ Simd4f current = *curIt;
+ Simd4f previous = *prevIt;
+ // if(current.w == 0) current.w = previous.w
+ current = select(current > sMinusFloatMaxXYZ, current, previous);
+ Simd4f finiteMass = splat<3>(previous) > sFloatMaxW;
+ // curMatrix*current + prevMatrix*previous + accel
+ Simd4f delta = cloth::transform(curMatrix, cloth::transform(prevMatrix, *accelIt, previous), current);
+ *curIt = current + (delta & finiteMass);
+ *prevIt = select(sMaskW, previous, current) + (prevBias & finiteMass);
+ }
+}
+
+// Pulls particles back inside their motion-constraint spheres, 4 particles per
+// loop iteration. spheres yields (center.xyz, radius.w); radius is remapped by
+// scale/bias from scaleBiasStiffness (x=scale, y=bias, w=stiffness).
+template <typename Simd4f, typename ConstraintIterator>
+void constrainMotion(Simd4f* __restrict curIt, const Simd4f* __restrict curEnd, const ConstraintIterator& spheres,
+ const Simd4f& scaleBiasStiffness)
+{
+ Simd4f scale = splat<0>(scaleBiasStiffness);
+ Simd4f bias = splat<1>(scaleBiasStiffness);
+ Simd4f stiffness = splat<3>(scaleBiasStiffness);
+
+ // local copy of iterator to maintain alignment
+ ConstraintIterator sphIt = spheres;
+
+ for(; curIt < curEnd; curIt += 4)
+ {
+ // todo: use msub where available
+ Simd4f curPos0 = curIt[0];
+ Simd4f curPos1 = curIt[1];
+ Simd4f curPos2 = curIt[2];
+ Simd4f curPos3 = curIt[3];
+
+ // delta.xyz = center - position, delta.w = constraint radius (untouched)
+ Simd4f delta0 = *sphIt - (sMaskXYZ & curPos0);
+ ++sphIt;
+ Simd4f delta1 = *sphIt - (sMaskXYZ & curPos1);
+ ++sphIt;
+ Simd4f delta2 = *sphIt - (sMaskXYZ & curPos2);
+ ++sphIt;
+ Simd4f delta3 = *sphIt - (sMaskXYZ & curPos3);
+ ++sphIt;
+
+ // transpose to SoA: deltaX/Y/Z hold components of all 4, deltaW the radii
+ Simd4f deltaX = delta0, deltaY = delta1, deltaZ = delta2, deltaW = delta3;
+ transpose(deltaX, deltaY, deltaZ, deltaW);
+
+ Simd4f sqrLength = gSimd4fEpsilon + deltaX * deltaX + deltaY * deltaY + deltaZ * deltaZ;
+ Simd4f radius = max(gSimd4fZero, deltaW * scale + bias);
+
+ Simd4f slack = gSimd4fOne - radius * rsqrt(sqrLength);
+
+ // if slack <= 0.0f then we don't want to affect particle
+ // and can skip if all particles are unaffected
+ Simd4f isPositive;
+ if(anyGreater(slack, gSimd4fZero, isPositive))
+ {
+ // set invMass to zero if radius is zero
+ curPos0 = curPos0 & (splat<0>(radius) > sMinusFloatMaxXYZ);
+ curPos1 = curPos1 & (splat<1>(radius) > sMinusFloatMaxXYZ);
+ curPos2 = curPos2 & (splat<2>(radius) > sMinusFloatMaxXYZ);
+ // no splat<3> needed here: x/y/z lanes compare nonnegative radii against
+ // -FLT_MAX (always true), so only the w lane (radius3 > 0) matters
+ curPos3 = curPos3 & ((radius) > sMinusFloatMaxXYZ);
+
+ slack = slack * stiffness & isPositive;
+
+ curIt[0] = curPos0 + (delta0 & sMaskXYZ) * splat<0>(slack);
+ curIt[1] = curPos1 + (delta1 & sMaskXYZ) * splat<1>(slack);
+ curIt[2] = curPos2 + (delta2 & sMaskXYZ) * splat<2>(slack);
+ curIt[3] = curPos3 + (delta3 & sMaskXYZ) * splat<3>(slack);
+ }
+ }
+}
+
+// Pushes particles back OUT of their separation spheres (the mirror image of
+// constrainMotion): only negative slack (inside the sphere) moves a particle,
+// and no stiffness scaling is applied.
+template <typename Simd4f, typename ConstraintIterator>
+void constrainSeparation(Simd4f* __restrict curIt, const Simd4f* __restrict curEnd, const ConstraintIterator& spheres)
+{
+ // local copy of iterator to maintain alignment
+ ConstraintIterator sphIt = spheres;
+
+ for(; curIt < curEnd; curIt += 4)
+ {
+ // todo: use msub where available
+ Simd4f curPos0 = curIt[0];
+ Simd4f curPos1 = curIt[1];
+ Simd4f curPos2 = curIt[2];
+ Simd4f curPos3 = curIt[3];
+
+ Simd4f delta0 = *sphIt - (sMaskXYZ & curPos0);
+ ++sphIt;
+ Simd4f delta1 = *sphIt - (sMaskXYZ & curPos1);
+ ++sphIt;
+ Simd4f delta2 = *sphIt - (sMaskXYZ & curPos2);
+ ++sphIt;
+ Simd4f delta3 = *sphIt - (sMaskXYZ & curPos3);
+ ++sphIt;
+
+ Simd4f deltaX = delta0, deltaY = delta1, deltaZ = delta2, deltaW = delta3;
+ transpose(deltaX, deltaY, deltaZ, deltaW);
+
+ Simd4f sqrLength = gSimd4fEpsilon + deltaX * deltaX + deltaY * deltaY + deltaZ * deltaZ;
+
+ Simd4f slack = gSimd4fOne - deltaW * rsqrt<1>(sqrLength);
+
+ // if slack >= 0.0f then we don't want to affect particle
+ // and can skip if all particles are unaffected
+ Simd4f isNegative;
+ if(anyGreater(gSimd4fZero, slack, isNegative))
+ {
+ slack = slack & isNegative;
+
+ curIt[0] = curPos0 + (delta0 & sMaskXYZ) * splat<0>(slack);
+ curIt[1] = curPos1 + (delta1 & sMaskXYZ) * splat<1>(slack);
+ curIt[2] = curPos2 + (delta2 & sMaskXYZ) * splat<2>(slack);
+ curIt[3] = curPos3 + (delta3 & sMaskXYZ) * splat<3>(slack);
+ }
+ }
+}
+
+/**
+ traditional gauss-seidel internal constraint solver
+ */
+/**
+	traditional gauss-seidel internal constraint solver
+
+	Processes 4 edge constraints per loop step: rIt supplies 4 rest lengths,
+	iIt supplies 4 (i,j) particle-index pairs. Indices are pre-scaled to byte
+	offsets into the PxVec4 particle array. Positions carry invMass in .w, so
+	the correction is weighted per endpoint via splat<3>(v). stiffnessEtc packs
+	(stiffness, multiplier, compressionLimit, stretchLimit); the last three are
+	only read when useMultiplier is true.
+ */
+template <bool useMultiplier, typename Simd4f>
+void solveConstraints(float* __restrict posIt, const float* __restrict rIt, const float* __restrict rEnd,
+ const uint16_t* __restrict iIt, const Simd4f& stiffnessEtc)
+{
+ Simd4f stretchLimit, compressionLimit, multiplier;
+ if(useMultiplier)
+ {
+ stretchLimit = splat<3>(stiffnessEtc);
+ compressionLimit = splat<2>(stiffnessEtc);
+ multiplier = splat<1>(stiffnessEtc);
+ }
+ Simd4f stiffness = splat<0>(stiffnessEtc);
+
+ for(; rIt != rEnd; rIt += 4, iIt += 8)
+ {
+ uint32_t p0i = iIt[0] * sizeof(PxVec4);
+ uint32_t p0j = iIt[1] * sizeof(PxVec4);
+ uint32_t p1i = iIt[2] * sizeof(PxVec4);
+ uint32_t p1j = iIt[3] * sizeof(PxVec4);
+ uint32_t p2i = iIt[4] * sizeof(PxVec4);
+ uint32_t p2j = iIt[5] * sizeof(PxVec4);
+ uint32_t p3i = iIt[6] * sizeof(PxVec4);
+ uint32_t p3j = iIt[7] * sizeof(PxVec4);
+
+ Simd4f v0i = loadAligned(posIt, p0i);
+ Simd4f v0j = loadAligned(posIt, p0j);
+ Simd4f v1i = loadAligned(posIt, p1i);
+ Simd4f v1j = loadAligned(posIt, p1j);
+ Simd4f v2i = loadAligned(posIt, p2i);
+ Simd4f v2j = loadAligned(posIt, p2j);
+ Simd4f v3i = loadAligned(posIt, p3i);
+ Simd4f v3j = loadAligned(posIt, p3j);
+
+ // hij.xyz = vj - vi (edge vector), hij.w = vj.w + vi.w (sum of invMasses)
+ Simd4f h0ij = v0j + v0i * sMinusOneXYZOneW;
+ Simd4f h1ij = v1j + v1i * sMinusOneXYZOneW;
+ Simd4f h2ij = v2j + v2i * sMinusOneXYZOneW;
+ Simd4f h3ij = v3j + v3i * sMinusOneXYZOneW;
+
+ // transpose to SoA; vwij then holds the 4 invMass sums
+ Simd4f hxij = h0ij, hyij = h1ij, hzij = h2ij, vwij = h3ij;
+ transpose(hxij, hyij, hzij, vwij);
+
+ Simd4f rij = loadAligned(rIt);
+ Simd4f e2ij = gSimd4fEpsilon + hxij * hxij + hyij * hyij + hzij * hzij;
+ // relative edge error 1 - rest/len; masked to zero for degenerate rest lengths
+ Simd4f erij = (gSimd4fOne - rij * rsqrt(e2ij)) & (rij > gSimd4fEpsilon);
+
+ if(useMultiplier)
+ {
+ erij = erij - multiplier * max(compressionLimit, min(erij, stretchLimit));
+ }
+ Simd4f exij = erij * stiffness * recip(gSimd4fEpsilon + vwij);
+
+ h0ij = h0ij * splat<0>(exij) & sMaskXYZ;
+ h1ij = h1ij * splat<1>(exij) & sMaskXYZ;
+ h2ij = h2ij * splat<2>(exij) & sMaskXYZ;
+ h3ij = h3ij * splat<3>(exij) & sMaskXYZ;
+
+ // move each endpoint along the edge, weighted by its own invMass (.w)
+ storeAligned(posIt, p0i, v0i + h0ij * splat<3>(v0i));
+ storeAligned(posIt, p0j, v0j - h0ij * splat<3>(v0j));
+ storeAligned(posIt, p1i, v1i + h1ij * splat<3>(v1i));
+ storeAligned(posIt, p1j, v1j - h1ij * splat<3>(v1j));
+ storeAligned(posIt, p2i, v2i + h2ij * splat<3>(v2i));
+ storeAligned(posIt, p2j, v2j - h2ij * splat<3>(v2j));
+ storeAligned(posIt, p3i, v3i + h3ij * splat<3>(v3i));
+ storeAligned(posIt, p3j, v3j - h3ij * splat<3>(v3j));
+ }
+}
+
+#if PX_WINDOWS
+#include "sse2/SwSolveConstraints.h"
+#endif
+
+// calculates upper bound of all position deltas
+template <typename Simd4f>
+Simd4f calculateMaxDelta(const Simd4f* prevIt, const Simd4f* curIt, const Simd4f* curEnd)
+{
+ Simd4f maxDelta = gSimd4fZero;
+ for(; curIt < curEnd; ++curIt, ++prevIt)
+ maxDelta = max(maxDelta, abs(*curIt - *prevIt));
+
+ return maxDelta & sMaskXYZ;
+}
+
+// Per-triangle aerodynamic drag/lift. For each triangle the relative air
+// velocity is approximated from the centroid's frame delta plus 'wind', then a
+// drag impulse (along velocity) and lift impulse (perpendicular, in the
+// velocity-normal plane) are distributed to the 3 vertices scaled by invMass.
+template <bool IsTurning, typename Simd4f>
+void applyWind(Simd4f* __restrict curIt, const Simd4f* __restrict prevIt, const uint16_t* __restrict tIt,
+ const uint16_t* __restrict tEnd, Simd4f dragCoefficient, Simd4f liftCoefficient, Simd4f wind,
+ const Simd4f (&rotation)[3])
+{
+ const Simd4f oneThird = simd4f(1 / 3.0f);
+
+ for(; tIt < tEnd; tIt += 3)
+ {
+ uint16_t i0 = tIt[0];
+ uint16_t i1 = tIt[1];
+ uint16_t i2 = tIt[2];
+
+ Simd4f c0 = curIt[i0];
+ Simd4f c1 = curIt[i1];
+ Simd4f c2 = curIt[i2];
+
+ Simd4f p0 = prevIt[i0];
+ Simd4f p1 = prevIt[i1];
+ Simd4f p2 = prevIt[i2];
+
+ // use particle weights instead?
+ Simd4f current = (c0 + c1 + c2) * oneThird;
+ Simd4f previous = (p0 + p1 + p2) * oneThird;
+
+ Simd4f delta = current - previous + wind;
+
+ if(IsTurning)
+ {
+ // add rotation of frame
+ delta = cloth::transform(rotation, delta - current, current);
+ }
+
+ Simd4f normal = cross3(c2 - c0, c1 - c0);
+
+ // combined scale so that 'scale' normalizes both delta and normal at once
+ Simd4f invSqrScale = dot3(delta, delta) * dot3(normal, normal);
+ Simd4f isZero = invSqrScale < gSimd4fEpsilon;
+ Simd4f scale = rsqrt(invSqrScale);
+
+ Simd4f cosTheta = dot3(normal, delta) * scale;
+ Simd4f sinTheta = sqrt(max(gSimd4fZero, gSimd4fOne - cosTheta * cosTheta));
+
+ // orthogonal to delta, in delta-normal plane, same length as delta
+ Simd4f liftDir = cross3(cross3(delta, normal), delta * scale);
+
+ // sin(theta) * cos(theta) = 0.5 * sin(2 * theta)
+ Simd4f lift = liftCoefficient * cosTheta * sinTheta * liftDir;
+ Simd4f drag = dragCoefficient * abs(cosTheta) * delta;
+
+ // zero the impulse entirely for degenerate triangles / zero velocity
+ Simd4f impulse = (lift + drag) & ~isZero;
+
+ curIt[i0] = c0 - impulse * splat<3>(c0);
+ curIt[i1] = c1 - impulse * splat<3>(c1);
+ curIt[i2] = c2 - impulse * splat<3>(c2);
+ }
+}
+
+} // anonymous namespace
+
+// Binds the kernel to one cloth instance: collision/self-collision helpers
+// share the per-solve allocator, and the iteration state (dt, damping, frame
+// motion) is produced by the factory for this Simd4f width.
+template <typename Simd4f>
+cloth::SwSolverKernel<Simd4f>::SwSolverKernel(SwCloth const& cloth, SwClothData& clothData,
+ SwKernelAllocator& allocator, IterationStateFactory& factory)
+: mCloth(cloth)
+, mClothData(clothData)
+, mAllocator(allocator)
+, mCollision(clothData, allocator)
+, mSelfCollision(clothData, allocator)
+, mState(factory.create<Simd4f>(cloth))
+{
+	mClothData.verify();
+}
+
+// Function-call entry point: runs the full multi-iteration solve.
+template <typename Simd4f>
+void cloth::SwSolverKernel<Simd4f>::operator()()
+{
+	simulateCloth();
+}
+
+// Conservative upper bound on scratch memory used during one solve:
+// persistent collision data plus the larger of the two transient peaks
+// (collision vs. self-collision never run concurrently), plus allocator
+// bookkeeping headroom.
+template <typename Simd4f>
+size_t cloth::SwSolverKernel<Simd4f>::estimateTemporaryMemory(const SwCloth& cloth)
+{
+	size_t collisionTempMemory = SwCollision<Simd4f>::estimateTemporaryMemory(cloth);
+	size_t selfCollisionTempMemory = SwSelfCollision<Simd4f>::estimateTemporaryMemory(cloth);
+
+	size_t tempMemory = PxMax(collisionTempMemory, selfCollisionTempMemory);
+	size_t persistentMemory = SwCollision<Simd4f>::estimatePersistentMemory(cloth);
+
+	// account for any allocator overhead (this could be exposed in the allocator)
+	size_t maxAllocs = 32;
+	size_t maxPerAllocationOverhead = 32;
+	size_t maxAllocatorOverhead = maxAllocs * maxPerAllocationOverhead;
+
+	return maxAllocatorOverhead + persistentMemory + tempMemory;
+}
+
+// Dispatches to the free-function integrator: the cheaper single-matrix form
+// when the frame is not rotating this step, the two-matrix form otherwise.
+template <typename Simd4f>
+template <typename AccelerationIterator>
+void cloth::SwSolverKernel<Simd4f>::integrateParticles(AccelerationIterator& accelIt, const Simd4f& prevBias)
+{
+	Simd4f* curIt = reinterpret_cast<Simd4f*>(mClothData.mCurParticles);
+	Simd4f* curEnd = curIt + mClothData.mNumParticles;
+	Simd4f* prevIt = reinterpret_cast<Simd4f*>(mClothData.mPrevParticles);
+
+	if(!mState.mIsTurning)
+		::integrateParticles(curIt, curEnd, prevIt, mState.mPrevMatrix[0], accelIt, prevBias);
+	else
+		::integrateParticles(curIt, curEnd, prevIt, mState.mPrevMatrix, mState.mCurMatrix, accelIt, prevBias);
+}
+
+// Public integration step: chooses a constant-acceleration iterator (gravity
+// only) or a per-particle-acceleration iterator depending on whether the cloth
+// supplied an acceleration array.
+template <typename Simd4f>
+void cloth::SwSolverKernel<Simd4f>::integrateParticles()
+{
+	PX_PROFILE_ZONE("cloth::SwSolverKernel::integrateParticles", 0);
+
+	const Simd4f* startAccelIt = reinterpret_cast<const Simd4f*>(mClothData.mParticleAccelerations);
+
+	// dt^2 (todo: should this be the smoothed dt used for gravity?)
+	const Simd4f sqrIterDt = simd4f(sqr(mState.mIterDt)) & static_cast<Simd4f>(sMaskXYZ);
+
+	if(!startAccelIt)
+	{
+		// no per-particle accelerations, use a constant
+		ConstantIterator<Simd4f> accelIt(mState.mCurBias);
+		integrateParticles(accelIt, mState.mPrevBias);
+	}
+	else
+	{
+		// iterator implicitly scales by dt^2 and adds gravity
+		ScaleBiasIterator<Simd4f, const Simd4f*> accelIt(startAccelIt, sqrIterDt, mState.mCurBias);
+		integrateParticles(accelIt, mState.mPrevBias);
+	}
+}
+
+// Long-range attachment ("tether") constraints: each particle has
+// numTethers/numParticles anchors; when the particle exceeds a tether's scaled
+// rest length it is pulled back toward the anchor, with the stiffness divided
+// evenly across the anchors.
+template <typename Simd4f>
+void cloth::SwSolverKernel<Simd4f>::constrainTether()
+{
+	if(0.0f == mClothData.mTetherConstraintStiffness || !mClothData.mNumTethers)
+		return;
+
+	PX_PROFILE_ZONE("cloth::SwSolverKernel::solveTethers", 0);
+
+	uint32_t numParticles = mClothData.mNumParticles;
+	uint32_t numTethers = mClothData.mNumTethers;
+	PX_ASSERT(0 == numTethers % numParticles);
+
+	float* __restrict curIt = mClothData.mCurParticles;
+	const float* __restrict curFirst = curIt;
+	const float* __restrict curEnd = curIt + 4 * numParticles;
+
+	typedef const SwTether* __restrict TetherIter;
+	TetherIter tFirst = mClothData.mTethers;
+	TetherIter tEnd = tFirst + numTethers;
+
+	// stiffness normalized by tethers-per-particle; w lane masked off so the
+	// correction never touches invMass
+	Simd4f stiffness =
+	    static_cast<Simd4f>(sMaskXYZ) & simd4f(numParticles * mClothData.mTetherConstraintStiffness / numTethers);
+	Simd4f scale = simd4f(mClothData.mTetherConstraintScale);
+
+	for(; curIt != curEnd; curIt += 4, ++tFirst)
+	{
+		Simd4f position = loadAligned(curIt);
+		Simd4f offset = gSimd4fZero;
+
+		// tethers for one particle are strided by numParticles
+		for(TetherIter tIt = tFirst; tIt < tEnd; tIt += numParticles)
+		{
+			PX_ASSERT(tIt->mAnchor < numParticles);
+			Simd4f anchor = loadAligned(curFirst, tIt->mAnchor * sizeof(PxVec4));
+			Simd4f delta = anchor - position;
+			Simd4f sqrLength = gSimd4fEpsilon + dot3(delta, delta);
+
+			Simd4f tetherLength = load(&tIt->mLength);
+			tetherLength = splat<0>(tetherLength);
+
+			Simd4f radius = tetherLength * scale;
+			Simd4f slack = gSimd4fOne - radius * rsqrt(sqrLength);
+
+			// only pull when stretched beyond the tether radius (slack > 0)
+			offset = offset + delta * max(slack, gSimd4fZero);
+		}
+
+		storeAligned(curIt, position + offset * stiffness);
+	}
+}
+
+// Solves all edge (distance) constraints, one phase at a time. Per phase the
+// stiffness is converted from a per-millisecond error fraction to a
+// per-iteration factor, then the work is dispatched to the best available
+// kernel (AVX+FMA / AVX / SSE2 / scalar) chosen by sAvxSupport at runtime.
+template <typename Simd4f>
+void cloth::SwSolverKernel<Simd4f>::solveFabric()
+{
+	PX_PROFILE_ZONE("cloth::SwSolverKernel::solveFabric", 0);
+
+	float* pIt = mClothData.mCurParticles;
+
+	const PhaseConfig* cIt = mClothData.mConfigBegin;
+	const PhaseConfig* cEnd = mClothData.mConfigEnd;
+
+	const uint32_t* pBegin = mClothData.mPhases;
+	const float* rBegin = mClothData.mRestvalues;
+
+	const uint32_t* sBegin = mClothData.mSets;
+	const uint16_t* iBegin = mClothData.mIndices;
+
+	uint32_t totalConstraints = 0;
+
+	Simd4f stiffnessExponent = simd4f(mCloth.mStiffnessFrequency * mState.mIterDt);
+
+	for(; cIt != cEnd; ++cIt)
+	{
+		// [sIt[0], sIt[1]) is this phase's range of rest values; 2 indices per constraint
+		const uint32_t* sIt = sBegin + pBegin[cIt->mPhaseIndex];
+		const float* rIt = rBegin + sIt[0];
+		const float* rEnd = rBegin + sIt[1];
+		const uint16_t* iIt = iBegin + sIt[0] * 2;
+
+		totalConstraints += uint32_t(rEnd - rIt);
+
+		// (stiffness, multiplier, compressionLimit, stretchLimit)
+		Simd4f config = load(&cIt->mStiffness);
+		// stiffness specified as fraction of constraint error per-millisecond
+		Simd4f scaledConfig = gSimd4fOne - exp2(config * stiffnessExponent);
+		Simd4f stiffness = select(sMaskXY, scaledConfig, config);
+
+		// when multiplier/limits are all zero, the cheaper non-multiplier kernel suffices
+		int neutralMultiplier = allEqual(sMaskYZW & stiffness, gSimd4fZero);
+
+#if PX_AVX
+		switch(sAvxSupport)
+		{
+		case 2:
+#if _MSC_VER >= 1700
+			neutralMultiplier ? avx::solveConstraints<false, 2>(pIt, rIt, rEnd, iIt, stiffness)
+			                  : avx::solveConstraints<true, 2>(pIt, rIt, rEnd, iIt, stiffness);
+			break;
+#endif
+		// note: without VC11+, case 2 deliberately falls through to the AVX kernel
+		case 1:
+			neutralMultiplier ? avx::solveConstraints<false, 1>(pIt, rIt, rEnd, iIt, stiffness)
+			                  : avx::solveConstraints<true, 1>(pIt, rIt, rEnd, iIt, stiffness);
+			break;
+		default:
+#endif
+			neutralMultiplier ? solveConstraints<false>(pIt, rIt, rEnd, iIt, stiffness)
+			                  : solveConstraints<true>(pIt, rIt, rEnd, iIt, stiffness);
+#if PX_AVX
+			break;
+		}
+#endif
+	}
+}
+
+// Member wrapper around the free applyWind kernel; IsTurning is resolved to a
+// compile-time bool so the non-turning path pays nothing for the rotation.
+template <typename Simd4f>
+void cloth::SwSolverKernel<Simd4f>::applyWind()
+{
+	// both coefficients zero => no aerodynamic forces, skip the triangle pass
+	if(mClothData.mDragCoefficient == 0.0f && mClothData.mLiftCoefficient == 0.0f)
+		return;
+
+	PX_PROFILE_ZONE("cloth::SwSolverKernel::applyWind", 0);
+
+	Simd4f* curIt = reinterpret_cast<Simd4f*>(mClothData.mCurParticles);
+	Simd4f* prevIt = reinterpret_cast<Simd4f*>(mClothData.mPrevParticles);
+
+	const uint16_t* tIt = mClothData.mTriangles;
+	const uint16_t* tEnd = tIt + 3 * mClothData.mNumTriangles;
+
+	Simd4f dragCoefficient = simd4f(mClothData.mDragCoefficient);
+	Simd4f liftCoefficient = simd4f(mClothData.mLiftCoefficient);
+
+	if(mState.mIsTurning)
+	{
+		::applyWind<true>(curIt, prevIt, tIt, tEnd, dragCoefficient, liftCoefficient, mState.mWind,
+		                  mState.mRotationMatrix);
+	}
+	else
+	{
+		::applyWind<false>(curIt, prevIt, tIt, tEnd, dragCoefficient, liftCoefficient, mState.mWind,
+		                   mState.mRotationMatrix);
+	}
+}
+
+// Applies motion-constraint spheres, interpolating between start and target
+// sphere sets over the remaining iterations (start only / target on the last
+// iteration / lerped in between).
+template <typename Simd4f>
+void cloth::SwSolverKernel<Simd4f>::constrainMotion()
+{
+	if(!mClothData.mStartMotionConstraints)
+		return;
+
+	PX_PROFILE_ZONE("cloth::SwSolverKernel::constrainMotion", 0);
+
+	Simd4f* curIt = reinterpret_cast<Simd4f*>(mClothData.mCurParticles);
+	Simd4f* curEnd = curIt + mClothData.mNumParticles;
+
+	const Simd4f* startIt = reinterpret_cast<const Simd4f*>(mClothData.mStartMotionConstraints);
+	const Simd4f* targetIt = reinterpret_cast<const Simd4f*>(mClothData.mTargetMotionConstraints);
+
+	// pack (scale, bias, -, stiffness) for the kernel
+	Simd4f scaleBias = load(&mCloth.mMotionConstraintScale);
+	Simd4f stiffness = simd4f(mClothData.mMotionConstraintStiffness);
+	Simd4f scaleBiasStiffness = select(sMaskXYZ, scaleBias, stiffness);
+
+	if(!mClothData.mTargetMotionConstraints)
+		// no interpolation, use the start positions
+		return ::constrainMotion(curIt, curEnd, startIt, scaleBiasStiffness);
+
+	if(mState.mRemainingIterations == 1)
+		// use the target positions on last iteration
+		return ::constrainMotion(curIt, curEnd, targetIt, scaleBiasStiffness);
+
+	// otherwise use an interpolating iterator
+	LerpIterator<Simd4f, const Simd4f*> interpolator(startIt, targetIt, mState.getCurrentAlpha());
+	::constrainMotion(curIt, curEnd, interpolator, scaleBiasStiffness);
+}
+
+// Applies separation-constraint spheres with the same start/target/lerp
+// scheme as constrainMotion, but pushing particles out instead of in.
+template <typename Simd4f>
+void cloth::SwSolverKernel<Simd4f>::constrainSeparation()
+{
+	if(!mClothData.mStartSeparationConstraints)
+		return;
+
+	PX_PROFILE_ZONE("cloth::SwSolverKernel::constrainSeparation", 0);
+
+	Simd4f* curIt = reinterpret_cast<Simd4f*>(mClothData.mCurParticles);
+	Simd4f* curEnd = curIt + mClothData.mNumParticles;
+
+	const Simd4f* startIt = reinterpret_cast<const Simd4f*>(mClothData.mStartSeparationConstraints);
+	const Simd4f* targetIt = reinterpret_cast<const Simd4f*>(mClothData.mTargetSeparationConstraints);
+
+	if(!mClothData.mTargetSeparationConstraints)
+		// no interpolation, use the start positions
+		return ::constrainSeparation(curIt, curEnd, startIt);
+
+	if(mState.mRemainingIterations == 1)
+		// use the target positions on last iteration
+		return ::constrainSeparation(curIt, curEnd, targetIt);
+
+	// otherwise use an interpolating iterator
+	LerpIterator<Simd4f, const Simd4f*> interpolator(startIt, targetIt, mState.getCurrentAlpha());
+	::constrainSeparation(curIt, curEnd, interpolator);
+}
+
+// Character collision: delegates to the SwCollision functor built in the ctor.
+template <typename Simd4f>
+void cloth::SwSolverKernel<Simd4f>::collideParticles()
+{
+	PX_PROFILE_ZONE("cloth::SwSolverKernel::collideParticles", 0);
+
+	mCollision(mState);
+}
+
+// Cloth-vs-itself collision: delegates to the SwSelfCollision functor.
+template <typename Simd4f>
+void cloth::SwSolverKernel<Simd4f>::selfCollideParticles()
+{
+	PX_PROFILE_ZONE("cloth::SwSolverKernel::selfCollideParticles", 0);
+
+	mSelfCollision();
+}
+
+// Sleep heuristic: roughly every mSleepTestInterval milliseconds of simulated
+// time, compare the max particle displacement this iteration against the
+// sleep threshold; any motion above it resets the consecutive-pass counter.
+template <typename Simd4f>
+void cloth::SwSolverKernel<Simd4f>::updateSleepState()
+{
+	PX_PROFILE_ZONE("cloth::SwSolverKernel::updateSleepState", 0);
+
+	// accumulate iteration dt in (at least 1) milliseconds
+	mClothData.mSleepTestCounter += PxMax(1u, uint32_t(mState.mIterDt * 1000));
+	if(mClothData.mSleepTestCounter >= mCloth.mSleepTestInterval)
+	{
+		const Simd4f* prevIt = reinterpret_cast<Simd4f*>(mClothData.mPrevParticles);
+		const Simd4f* curIt = reinterpret_cast<Simd4f*>(mClothData.mCurParticles);
+		const Simd4f* curEnd = curIt + mClothData.mNumParticles;
+
+		// calculate max particle delta since last iteration
+		Simd4f maxDelta = calculateMaxDelta(prevIt, curIt, curEnd);
+
+		++mClothData.mSleepPassCounter;
+		Simd4f threshold = simd4f(mCloth.mSleepThreshold * mState.mIterDt);
+		if(anyGreaterEqual(maxDelta, threshold))
+			mClothData.mSleepPassCounter = 0;
+
+		mClothData.mSleepTestCounter -= mCloth.mSleepTestInterval;
+	}
+}
+
+// One solver iteration; the sub-step order below is significant (constraints
+// are solved after integration/wind, collision last before the sleep test).
+template <typename Simd4f>
+void cloth::SwSolverKernel<Simd4f>::iterateCloth()
+{
+	// note on invMass (stored in current/previous positions.w):
+	// integrateParticles()
+	//   - if(current.w == 0) current.w = previous.w
+	// constraintMotion()
+	//   - if(constraint.radius <= 0) current.w = 0
+	// computeBounds()
+	//   - if(current.w > 0) current.w = previous.w
+	// collideParticles()
+	//   - if(collides) current.w *= 1/massScale
+	// after simulate()
+	//   - previous.w: original invMass as set by user
+	//   - current.w: zeroed by motion constraints and mass-scaled by collision
+
+	// integrate positions
+	integrateParticles();
+
+	// apply drag and lift
+	applyWind();
+
+	// motion constraints
+	constrainMotion();
+
+	// solve tether constraints
+	constrainTether();
+
+	// solve edge constraints
+	solveFabric();
+
+	// separation constraints
+	constrainSeparation();
+
+	// perform character collision
+	collideParticles();
+
+	// perform self collision
+	selfCollideParticles();
+
+	// test wake / sleep conditions
+	updateSleepState();
+}
+
+// Top-level loop: runs iterateCloth() until the iteration state says the
+// frame's iteration budget is exhausted; mState.update() advances the counter
+// and per-iteration interpolation state.
+template <typename Simd4f>
+void cloth::SwSolverKernel<Simd4f>::simulateCloth()
+{
+	while(mState.mRemainingIterations)
+	{
+		iterateCloth();
+		mState.update();
+	}
+}
+
+// explicit template instantiation
+#if NV_SIMD_SIMD
+template class cloth::SwSolverKernel<Simd4f>;
+#endif
+#if NV_SIMD_SCALAR
+template class cloth::SwSolverKernel<Scalar4f>;
+#endif
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/SwSolverKernel.h b/PhysX_3.4/Source/LowLevelCloth/src/SwSolverKernel.h
new file mode 100644
index 00000000..9ad546c0
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/SwSolverKernel.h
@@ -0,0 +1,84 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#include "IterationState.h"
+#include "SwCollision.h"
+#include "SwSelfCollision.h"
+
+namespace physx
+{
+namespace cloth
+{
+
+class SwCloth;
+struct SwClothData;
+
+// CPU cloth solver kernel for one cloth instance, templated on the SIMD
+// vector type (Simd4f or Scalar4f; both instantiated in SwSolverKernel.cpp).
+// Invoke via operator() to run the full iteration loop for one frame.
+template <typename Simd4f>
+class SwSolverKernel
+{
+  public:
+	SwSolverKernel(SwCloth const&, SwClothData&, SwKernelAllocator&, IterationStateFactory&);
+
+	// runs the complete simulation (simulateCloth) for this cloth
+	void operator()();
+
+	// returns a conservative estimate of the
+	// total memory requirements during a solve
+	static size_t estimateTemporaryMemory(const SwCloth& c);
+
+  private:
+	// per-iteration sub-steps, in the order iterateCloth() calls them
+	void integrateParticles();
+	void constrainTether();
+	void solveFabric();
+	void applyWind();
+	void constrainMotion();
+	void constrainSeparation();
+	void collideParticles();
+	void selfCollideParticles();
+	void updateSleepState();
+
+	void iterateCloth();
+	void simulateCloth();
+
+	SwCloth const& mCloth;
+	SwClothData& mClothData;
+	SwKernelAllocator& mAllocator;
+
+	SwCollision<Simd4f> mCollision;
+	SwSelfCollision<Simd4f> mSelfCollision;
+	IterationState<Simd4f> mState;
+
+  private:
+	// non-assignable (reference members); copy-assignment intentionally undefined
+	SwSolverKernel<Simd4f>& operator=(const SwSolverKernel<Simd4f>&);
+	// shared integration body parameterized over the acceleration source
+	template <typename AccelerationIterator>
+	void integrateParticles(AccelerationIterator& accelIt, const Simd4f&);
+};
+}
+}
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/TripletScheduler.cpp b/PhysX_3.4/Source/LowLevelCloth/src/TripletScheduler.cpp
new file mode 100644
index 00000000..ea062136
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/TripletScheduler.cpp
@@ -0,0 +1,246 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#include "foundation/PxMath.h"
+#include "TripletScheduler.h"
+#include "PsUtilities.h"
+#include "PsFPU.h"
+
+using namespace physx;
+using namespace shdfnd;
+
+// Copies the incoming uint32_t[4] triplet list into mTriplets (as Vec4u)
+// so the schedulers below can reorder it in place.
+cloth::TripletScheduler::TripletScheduler(Range<const uint32_t[4]> triplets)
+: mTriplets(reinterpret_cast<const Vec4u*>(triplets.begin()), reinterpret_cast<const Vec4u*>(triplets.end()))
+{
+}
+
+// SSE version: greedily reorders mTriplets so that within each simdWidth-sized
+// chunk no two triplets reference the same particle (x, y or z index), letting
+// the SIMD solver process a chunk without write conflicts. Chunk-group sizes
+// are appended to mSetSizes; dependent triplets are deferred by swapping them
+// towards the tail of the range.
+void cloth::TripletScheduler::simd(uint32_t numParticles, uint32_t simdWidth)
+{
+	if(mTriplets.empty())
+		return;
+
+	// mark[p] == setIndex means particle p is already used in the current set
+	Vector<uint32_t>::Type mark(numParticles, uint32_t(-1));
+
+	uint32_t setIndex = 0, setSize = 0;
+	for(TripletIter tIt = mTriplets.begin(), tEnd = mTriplets.end(); tIt != tEnd; ++setIndex)
+	{
+		TripletIter tLast = tIt + PxMin(simdWidth, uint32_t(tEnd - tIt));
+		TripletIter tSwap = tEnd; // tail cursor used to swap in independent triplets
+
+		for(; tIt != tLast && tIt != tSwap; ++tIt, ++setSize)
+		{
+			// swap from tail until independent triplet found
+			while((mark[tIt->x] == setIndex || mark[tIt->y] == setIndex || mark[tIt->z] == setIndex) && tIt != --tSwap)
+				swap(*tIt, *tSwap);
+
+			if(tIt == tSwap)
+				break; // no independent triplet found
+
+			// mark vertices to be used in simdIndex
+			mark[tIt->x] = setIndex;
+			mark[tIt->y] = setIndex;
+			mark[tIt->z] = setIndex;
+		}
+
+		if(tIt == tSwap) // remaining triplets depend on current set
+		{
+			if(setSize > simdWidth) // trim set to multiple of simdWidth
+			{
+				uint32_t overflow = setSize % simdWidth;
+				setSize -= overflow;
+				tIt -= overflow; // trimmed triplets are re-processed in the next set
+			}
+			mSetSizes.pushBack(setSize);
+			setSize = 0;
+		}
+	}
+}
+
+namespace
+{
+// Per-set bookkeeping used while coloring triplets in warp() below.
+struct TripletSet
+{
+	TripletSet() : mMark(0xFFFFFFFF)
+	{
+		mNumReplays[0] = mNumReplays[1] = mNumReplays[2] = 1;
+		memset(mNumConflicts[0], 0, 32);
+		memset(mNumConflicts[1], 0, 32);
+		memset(mNumConflicts[2], 0, 32);
+	}
+
+	uint32_t mMark; // triplet index
+	uint8_t mNumReplays[3];       // current worst-case conflict count per triplet slot (x/y/z)
+	uint8_t mNumConflicts[3][32]; // per-slot histogram over particle index % warp size (bank usage)
+};
+
+/*
+struct GreaterSum
+{
+ typedef cloth::Vector<uint32_t>::Type Container;
+
+ GreaterSum(const Container& cont)
+ : mContainer(cont)
+ {}
+
+ bool operator()(const cloth::Vec4u& a, const cloth::Vec4u& b) const
+ {
+ return mContainer[a.x] + mContainer[a.y] + mContainer[a.z]
+ > mContainer[b.x] + mContainer[b.y] + mContainer[b.z];
+ }
+
+ const Container& mContainer;
+};
+*/
+
+// calculate the inclusive prefix sum, equivalent of std::partial_sum;
+// dest may alias first (warp() below uses it in place)
+template <typename T>
+void prefixSum(const T* first, const T* last, T* dest)
+{
+	if(first == last)
+		return;
+	else
+	{
+		*(dest++) = *(first++);
+
+		for(; first != last; ++first, ++dest)
+			*dest = *(dest - 1) + *first;
+	}
+}
+}
+
+// CUDA version: colors triplets into warp-sized sets such that no two triplets
+// in a set share a particle, preferring the set that causes the fewest bank
+// conflicts (particle index modulo warpWidth selects the bank). Finally the
+// triplets are reordered so each set is contiguous; set sizes go to mSetSizes.
+void cloth::TripletScheduler::warp(uint32_t numParticles, uint32_t warpWidth)
+{
+	// PX_ASSERT(warpWidth == 32 || warpWidth == 16);
+
+	if(mTriplets.empty())
+		return;
+
+	TripletIter tIt, tEnd = mTriplets.end();
+	uint32_t tripletIndex;
+
+	// count number of triplets per particle
+	Vector<uint32_t>::Type adjacentCount(numParticles + 1, uint32_t(0));
+	for(tIt = mTriplets.begin(); tIt != tEnd; ++tIt)
+		for(int i = 0; i < 3; ++i)
+			++adjacentCount[(*tIt)[i]];
+
+	/* neither of those were really improving number of batches:
+	// run simd version to pre-sort particles
+	simd(numParticles, blockWidth); mSetSizes.resize(0);
+	// sort according to triplet degree (estimated by sum of adjacentCount)
+	std::sort(mTriplets.begin(), tEnd, GreaterSum(adjacentCount));
+	*/
+
+	// a particle referenced by k triplets forces at least k sets
+	uint32_t maxTripletCount = *maxElement(adjacentCount.begin(), adjacentCount.end());
+
+	// compute in place prefix sum (inclusive)
+	prefixSum(adjacentCount.begin(), adjacentCount.end(), adjacentCount.begin());
+
+	// initialize adjacencies (for each particle, collect touching triplets)
+	// also converts partial sum in adjacentCount from inclusive to exclusive
+	Vector<uint32_t>::Type adjacencies(adjacentCount.back());
+	for(tIt = mTriplets.begin(), tripletIndex = 0; tIt != tEnd; ++tIt, ++tripletIndex)
+		for(int i = 0; i < 3; ++i)
+			adjacencies[--adjacentCount[(*tIt)[i]]] = tripletIndex;
+
+	uint32_t warpMask = warpWidth - 1; // assumes warpWidth is a power of two
+
+	uint32_t numSets = maxTripletCount; // start with minimum number of sets
+	Vector<TripletSet>::Type sets(numSets);
+	Vector<uint32_t>::Type setIndices(mTriplets.size(), uint32_t(-1)); // uint32_t(-1) = not yet assigned
+	mSetSizes.resize(numSets);
+
+	// color triplets (assign to sets)
+	Vector<uint32_t>::Type::ConstIterator aBegin = adjacencies.begin(), aIt, aEnd;
+	for(tIt = mTriplets.begin(), tripletIndex = 0; tIt != tEnd; ++tIt, ++tripletIndex)
+	{
+		// mark sets of adjacent triplets (sets already using one of this triplet's particles)
+		for(int i = 0; i < 3; ++i)
+		{
+			uint32_t particleIndex = (*tIt)[i];
+			aIt = aBegin + adjacentCount[particleIndex];
+			aEnd = aBegin + adjacentCount[particleIndex + 1];
+			for(uint32_t setIndex; aIt != aEnd; ++aIt)
+				if(numSets > (setIndex = setIndices[*aIt]))
+					sets[setIndex].mMark = tripletIndex;
+		}
+
+		// find valid set with smallest number of bank conflicts
+		uint32_t bestIndex = numSets;
+		uint32_t minReplays = 4;
+		for(uint32_t setIndex = 0; setIndex < numSets && minReplays; ++setIndex)
+		{
+			const TripletSet& set = sets[setIndex];
+
+			if(set.mMark == tripletIndex)
+				continue; // triplet collision
+
+			// count slots whose target bank already hit the current replay count
+			uint32_t numReplays = 0;
+			for(uint32_t i = 0; i < 3; ++i)
+				numReplays += set.mNumReplays[i] == set.mNumConflicts[i][warpMask & (*tIt)[i]];
+
+			if(minReplays > numReplays)
+			{
+				minReplays = numReplays;
+				bestIndex = setIndex;
+			}
+		}
+
+		// add new set if none found
+		if(bestIndex == numSets)
+		{
+			sets.pushBack(TripletSet());
+			mSetSizes.pushBack(0);
+			++numSets;
+		}
+
+		// increment bank conflicts or reset if warp filled
+		TripletSet& set = sets[bestIndex];
+		if(++mSetSizes[bestIndex] & warpMask)
+			for(uint32_t i = 0; i < 3; ++i)
+				set.mNumReplays[i] = PxMax(set.mNumReplays[i], ++set.mNumConflicts[i][warpMask & (*tIt)[i]]);
+		else
+			set = TripletSet();
+
+		setIndices[tripletIndex] = bestIndex;
+	}
+
+	// reorder triplets so each set is contiguous (counting sort by set index)
+	Vector<uint32_t>::Type setOffsets(mSetSizes.size());
+	prefixSum(mSetSizes.begin(), mSetSizes.end(), setOffsets.begin());
+
+	Vector<Vec4u>::Type triplets(mTriplets.size());
+	Vector<uint32_t>::Type::ConstIterator iIt = setIndices.begin();
+	for(tIt = mTriplets.begin(), tripletIndex = 0; tIt != tEnd; ++tIt, ++iIt)
+		triplets[--setOffsets[*iIt]] = *tIt;
+
+	mTriplets.swap(triplets);
+}
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/TripletScheduler.h b/PhysX_3.4/Source/LowLevelCloth/src/TripletScheduler.h
new file mode 100644
index 00000000..db8078ab
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/TripletScheduler.h
@@ -0,0 +1,56 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#include "Types.h"
+#include "Range.h"
+#include "Allocator.h"
+#include "Vec4T.h"
+
+namespace physx
+{
+
+namespace cloth
+{
+
+// Reorders constraint triplets (three particle indices per entry; the fourth
+// component of each Vec4u is not referenced by the schedulers) into batches
+// that can be solved in parallel without particle write conflicts.
+struct TripletScheduler
+{
+	typedef Vector<Vec4u>::Type::ConstIterator ConstTripletIter;
+	typedef Vector<Vec4u>::Type::Iterator TripletIter;
+
+	TripletScheduler(Range<const uint32_t[4]>);
+	void simd(uint32_t numParticles, uint32_t simdWidth); // batch for CPU SIMD processing
+	void warp(uint32_t numParticles, uint32_t warpWidth); // batch for GPU warp processing
+
+	Vector<Vec4u>::Type mTriplets;    // triplets, reordered in place by simd()/warp()
+	Vector<uint32_t>::Type mSetSizes; // number of triplets in each batch
+};
+}
+}
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/Vec4T.h b/PhysX_3.4/Source/LowLevelCloth/src/Vec4T.h
new file mode 100644
index 00000000..50fadca3
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/Vec4T.h
@@ -0,0 +1,104 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#include "Types.h"
+
+namespace physx
+{
+
+namespace cloth
+{
+
+// Minimal 4-component vector of arbitrary scalar type T.
+template <typename T>
+struct Vec4T
+{
+	// default constructor leaves members uninitialized
+	Vec4T()
+	{
+	}
+
+	Vec4T(T a, T b, T c, T d) : x(a), y(b), z(c), w(d)
+	{
+	}
+
+	// converting constructor from a Vec4T with a different scalar type
+	template <typename S>
+	Vec4T(const Vec4T<S>& other)
+	{
+		x = T(other.x);
+		y = T(other.y);
+		z = T(other.z);
+		w = T(other.w);
+	}
+
+	// index access (0..3); relies on x,y,z,w being laid out contiguously
+	template <typename Index>
+	T& operator[](Index i)
+	{
+		return reinterpret_cast<T*>(this)[i];
+	}
+
+	template <typename Index>
+	const T& operator[](Index i) const
+	{
+		return reinterpret_cast<const T*>(this)[i];
+	}
+
+	T x, y, z, w;
+};
+
+// component-wise scale by a scalar
+template <typename T>
+Vec4T<T> operator*(const Vec4T<T>& vec, T scalar)
+{
+	return Vec4T<T>(vec.x * scalar, vec.y * scalar, vec.z * scalar, vec.w * scalar);
+}
+
+// component-wise division by a scalar (no divide-by-zero check)
+template <typename T>
+Vec4T<T> operator/(const Vec4T<T>& vec, T scalar)
+{
+	return Vec4T<T>(vec.x / scalar, vec.y / scalar, vec.z / scalar, vec.w / scalar);
+}
+
+// reinterpret a Vec4T as a reference to T[4] (mutable and const overloads);
+// relies on the contiguous x,y,z,w member layout
+template <typename T>
+T (&array(Vec4T<T>& vec))[4]
+{
+	return reinterpret_cast<T(&)[4]>(vec);
+}
+
+template <typename T>
+const T (&array(const Vec4T<T>& vec))[4]
+{
+	return reinterpret_cast<const T(&)[4]>(vec);
+}
+
+typedef Vec4T<uint32_t> Vec4u;
+typedef Vec4T<uint16_t> Vec4us;
+
+} // namespace cloth
+
+} // namespace physx
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/avx/SwSolveConstraints.cpp b/PhysX_3.4/Source/LowLevelCloth/src/avx/SwSolveConstraints.cpp
new file mode 100644
index 00000000..b242aaba
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/avx/SwSolveConstraints.cpp
@@ -0,0 +1,932 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma warning(push)
+#pragma warning(disable : 4668) //'symbol' is not defined as a preprocessor macro, replacing with '0' for 'directives'
+#pragma warning(disable : 4987) // nonstandard extension used: 'throw (...)'
+#include <intrin.h>
+#pragma warning(pop)
+
+#pragma warning(disable : 4127) // conditional expression is constant
+
+typedef unsigned __int16 uint16_t;
+typedef unsigned __int32 uint32_t;
+
+namespace avx
+{
+// shared SIMD constants used by the solveConstraints variants below
+__m128 sMaskYZW;
+__m256 sOne, sEpsilon, sMinusOneXYZOneW, sMaskXY;
+
+// must run once before solveConstraints to fill the constants above
+void initialize()
+{
+	sMaskYZW = _mm_castsi128_ps(_mm_setr_epi32(0, ~0, ~0, ~0));
+	sOne = _mm256_set1_ps(1.0f);
+	sEpsilon = _mm256_set1_ps(1.192092896e-07f); // FLT_EPSILON
+	sMinusOneXYZOneW = _mm256_setr_ps(-1.0f, -1.0f, -1.0f, 1.0f, -1.0f, -1.0f, -1.0f, 1.0f);
+	sMaskXY = _mm256_castsi256_ps(_mm256_setr_epi32(~0, ~0, 0, 0, ~0, ~0, 0, 0));
+}
+
+// fused multiply-add helpers: a*b+c and c-a*b. The non-type template parameter
+// selects the implementation; the <2> specializations below use real FMA
+// intrinsics when the compiler supports them, the generic versions emulate
+// with separate multiply and add/subtract.
+template <uint32_t>
+__m256 fmadd_ps(__m256 a, __m256 b, __m256 c)
+{
+	return _mm256_add_ps(_mm256_mul_ps(a, b), c);
+}
+template <uint32_t>
+__m256 fnmadd_ps(__m256 a, __m256 b, __m256 c)
+{
+	return _mm256_sub_ps(c, _mm256_mul_ps(a, b));
+}
+#if _MSC_VER >= 1700 // FMA intrinsics available from VS2012 on
+template <>
+__m256 fmadd_ps<2>(__m256 a, __m256 b, __m256 c)
+{
+	return _mm256_fmadd_ps(a, b, c);
+}
+template <>
+__m256 fnmadd_ps<2>(__m256 a, __m256 b, __m256 c)
+{
+	return _mm256_fnmadd_ps(a, b, c);
+}
+#endif
+
+// roughly same perf as SSE2 intrinsics, the asm version below is about 10% faster
+//
+// Solves 8 distance constraints per loop iteration with AVX: per constraint two
+// position quadruplets are gathered (w lane multiplies the correction, xyz is
+// the position), transposed to SoA, the relative stretch error against the
+// rest length in rIt is computed, and scaled corrections are scattered back.
+// iIt supplies 16 particle indices (8 index pairs) per iteration. stiffnessRef
+// packs (stiffness, multiplier, compressionLimit, stretchLimit) in lanes 0..3
+// (see the permutes below); only lane 0 is used when !useMultiplier.
+// avx::initialize() must have been called to set up the shared constants.
+template <bool useMultiplier, uint32_t avx>
+void solveConstraints(float* __restrict posIt, const float* __restrict rIt, const float* __restrict rEnd,
+                      const uint16_t* __restrict iIt, const __m128& stiffnessRef)
+{
+	__m256 stiffness, stretchLimit, compressionLimit, multiplier;
+
+	if(useMultiplier)
+	{
+		// splat the four lanes of stiffnessRef into separate registers
+		stiffness = _mm256_broadcast_ps(&stiffnessRef);
+		stretchLimit = _mm256_permute_ps(stiffness, 0xff);
+		compressionLimit = _mm256_permute_ps(stiffness, 0xaa);
+		multiplier = _mm256_permute_ps(stiffness, 0x55);
+		stiffness = _mm256_permute_ps(stiffness, 0x00);
+	}
+	else
+	{
+		stiffness = _mm256_broadcast_ss((const float*)&stiffnessRef);
+	}
+
+	for(; rIt < rEnd; rIt += 8, iIt += 16)
+	{
+		// gather pointers for constraints 0,1,4,5 (16-byte stride per particle)
+		float* p0i = posIt + iIt[0] * 4;
+		float* p4i = posIt + iIt[8] * 4;
+		float* p0j = posIt + iIt[1] * 4;
+		float* p4j = posIt + iIt[9] * 4;
+		float* p1i = posIt + iIt[2] * 4;
+		float* p5i = posIt + iIt[10] * 4;
+		float* p1j = posIt + iIt[3] * 4;
+		float* p5j = posIt + iIt[11] * 4;
+
+		__m128 v0i = _mm_load_ps(p0i);
+		__m128 v4i = _mm_load_ps(p4i);
+		__m128 v0j = _mm_load_ps(p0j);
+		__m128 v4j = _mm_load_ps(p4j);
+		__m128 v1i = _mm_load_ps(p1i);
+		__m128 v5i = _mm_load_ps(p5i);
+		__m128 v1j = _mm_load_ps(p1j);
+		__m128 v5j = _mm_load_ps(p5j);
+
+		__m256 v04i = _mm256_insertf128_ps(_mm256_castps128_ps256(v0i), v4i, 1);
+		__m256 v04j = _mm256_insertf128_ps(_mm256_castps128_ps256(v0j), v4j, 1);
+		__m256 v15i = _mm256_insertf128_ps(_mm256_castps128_ps256(v1i), v5i, 1);
+		__m256 v15j = _mm256_insertf128_ps(_mm256_castps128_ps256(v1j), v5j, 1);
+
+		// h = vj - vi in xyz, vi.w + vj.w in w (sMinusOneXYZOneW negates xyz of vi)
+		__m256 h04ij = fmadd_ps<avx>(sMinusOneXYZOneW, v04i, v04j);
+		__m256 h15ij = fmadd_ps<avx>(sMinusOneXYZOneW, v15i, v15j);
+
+		// gather pointers for constraints 2,3,6,7
+		float* p2i = posIt + iIt[4] * 4;
+		float* p6i = posIt + iIt[12] * 4;
+		float* p2j = posIt + iIt[5] * 4;
+		float* p6j = posIt + iIt[13] * 4;
+		float* p3i = posIt + iIt[6] * 4;
+		float* p7i = posIt + iIt[14] * 4;
+		float* p3j = posIt + iIt[7] * 4;
+		float* p7j = posIt + iIt[15] * 4;
+
+		__m128 v2i = _mm_load_ps(p2i);
+		__m128 v6i = _mm_load_ps(p6i);
+		__m128 v2j = _mm_load_ps(p2j);
+		__m128 v6j = _mm_load_ps(p6j);
+		__m128 v3i = _mm_load_ps(p3i);
+		__m128 v7i = _mm_load_ps(p7i);
+		__m128 v3j = _mm_load_ps(p3j);
+		__m128 v7j = _mm_load_ps(p7j);
+
+		__m256 v26i = _mm256_insertf128_ps(_mm256_castps128_ps256(v2i), v6i, 1);
+		__m256 v26j = _mm256_insertf128_ps(_mm256_castps128_ps256(v2j), v6j, 1);
+		__m256 v37i = _mm256_insertf128_ps(_mm256_castps128_ps256(v3i), v7i, 1);
+		__m256 v37j = _mm256_insertf128_ps(_mm256_castps128_ps256(v3j), v7j, 1);
+
+		__m256 h26ij = fmadd_ps<avx>(sMinusOneXYZOneW, v26i, v26j);
+		__m256 h37ij = fmadd_ps<avx>(sMinusOneXYZOneW, v37i, v37j);
+
+		// 8x4 transpose to SoA: hx/hy/hz = edge components, vwij = summed w lanes
+		__m256 a = _mm256_unpacklo_ps(h04ij, h26ij);
+		__m256 b = _mm256_unpackhi_ps(h04ij, h26ij);
+		__m256 c = _mm256_unpacklo_ps(h15ij, h37ij);
+		__m256 d = _mm256_unpackhi_ps(h15ij, h37ij);
+
+		__m256 hxij = _mm256_unpacklo_ps(a, c);
+		__m256 hyij = _mm256_unpackhi_ps(a, c);
+		__m256 hzij = _mm256_unpacklo_ps(b, d);
+		__m256 vwij = _mm256_unpackhi_ps(b, d);
+
+		// squared edge length, epsilon keeps rsqrt finite for coincident particles
+		__m256 e2ij = fmadd_ps<avx>(hxij, hxij, fmadd_ps<avx>(hyij, hyij, fmadd_ps<avx>(hzij, hzij, sEpsilon)));
+
+		__m256 rij = _mm256_load_ps(rIt); // rest lengths
+		__m256 mask = _mm256_cmp_ps(rij, sEpsilon, _CMP_GT_OQ); // zero out ~zero rest-length constraints
+		__m256 erij = _mm256_and_ps(fnmadd_ps<avx>(rij, _mm256_rsqrt_ps(e2ij), sOne), mask); // 1 - r/|e|
+
+		if(useMultiplier)
+		{
+			// reduce response by multiplier inside the [compressionLimit, stretchLimit] band
+			erij = fnmadd_ps<avx>(multiplier, _mm256_max_ps(compressionLimit, _mm256_min_ps(erij, stretchLimit)), erij);
+		}
+
+		// per-constraint correction scale: error * stiffness / (epsilon + summed w)
+		__m256 exij = _mm256_mul_ps(erij, _mm256_mul_ps(stiffness, _mm256_rcp_ps(_mm256_add_ps(sEpsilon, vwij))));
+
+		// replace these two instructions with _mm_maskstore_ps below?
+		__m256 exlo = _mm256_and_ps(sMaskXY, exij);
+		__m256 exhi = _mm256_andnot_ps(sMaskXY, exij);
+
+		// scatter: each endpoint moves along the edge, weighted by its own w lane
+		__m256 f04ij = _mm256_mul_ps(h04ij, _mm256_permute_ps(exlo, 0xc0));
+		__m256 u04i = fmadd_ps<avx>(f04ij, _mm256_permute_ps(v04i, 0xff), v04i);
+		__m256 u04j = fnmadd_ps<avx>(f04ij, _mm256_permute_ps(v04j, 0xff), v04j);
+
+		_mm_store_ps(p0i, _mm256_extractf128_ps(u04i, 0));
+		_mm_store_ps(p0j, _mm256_extractf128_ps(u04j, 0));
+		_mm_store_ps(p4i, _mm256_extractf128_ps(u04i, 1));
+		_mm_store_ps(p4j, _mm256_extractf128_ps(u04j, 1));
+
+		__m256 f15ij = _mm256_mul_ps(h15ij, _mm256_permute_ps(exlo, 0xd5));
+		__m256 u15i = fmadd_ps<avx>(f15ij, _mm256_permute_ps(v15i, 0xff), v15i);
+		__m256 u15j = fnmadd_ps<avx>(f15ij, _mm256_permute_ps(v15j, 0xff), v15j);
+
+		_mm_store_ps(p1i, _mm256_extractf128_ps(u15i, 0));
+		_mm_store_ps(p1j, _mm256_extractf128_ps(u15j, 0));
+		_mm_store_ps(p5i, _mm256_extractf128_ps(u15i, 1));
+		_mm_store_ps(p5j, _mm256_extractf128_ps(u15j, 1));
+
+		__m256 f26ij = _mm256_mul_ps(h26ij, _mm256_permute_ps(exhi, 0x2a));
+		__m256 u26i = fmadd_ps<avx>(f26ij, _mm256_permute_ps(v26i, 0xff), v26i);
+		__m256 u26j = fnmadd_ps<avx>(f26ij, _mm256_permute_ps(v26j, 0xff), v26j);
+
+		_mm_store_ps(p2i, _mm256_extractf128_ps(u26i, 0));
+		_mm_store_ps(p2j, _mm256_extractf128_ps(u26j, 0));
+		_mm_store_ps(p6i, _mm256_extractf128_ps(u26i, 1));
+		_mm_store_ps(p6j, _mm256_extractf128_ps(u26j, 1));
+
+		__m256 f37ij = _mm256_mul_ps(h37ij, _mm256_permute_ps(exhi, 0x3f));
+		__m256 u37i = fmadd_ps<avx>(f37ij, _mm256_permute_ps(v37i, 0xff), v37i);
+		__m256 u37j = fnmadd_ps<avx>(f37ij, _mm256_permute_ps(v37j, 0xff), v37j);
+
+		_mm_store_ps(p3i, _mm256_extractf128_ps(u37i, 0));
+		_mm_store_ps(p3j, _mm256_extractf128_ps(u37j, 0));
+		_mm_store_ps(p7i, _mm256_extractf128_ps(u37i, 1));
+		_mm_store_ps(p7j, _mm256_extractf128_ps(u37j, 1));
+	}
+
+	// clear upper ymm state before returning to possibly non-AVX code
+	_mm256_zeroupper();
+}
+
+#ifdef _M_IX86
+
+// clang-format off
+
+/* full template specializations of above functions in assembler */
+
+// AVX without useMultiplier: hand-written MSVC x86 inline-assembly
+// specialization of solveConstraints (same algorithm as the intrinsics loop
+// above; comments name the corresponding intrinsic-version variables).
+// vtmp/htmp/ptmp are stack spill areas for gathered vectors and store
+// pointers. NOTE(review): registers edx=rIt, esi=rEnd, eax=iIt, ecx=posIt;
+// indices are scaled by 16 (shl 4) to byte offsets.
+template <>
+void solveConstraints<false, 1>(float* __restrict posIt, const float* __restrict rIt,
+                                const float* __restrict rEnd, const uint16_t* __restrict iIt, const __m128& stiffnessRef)
+{
+	__m256 stiffness = _mm256_broadcast_ss((const float*)&stiffnessRef);
+
+	__m256 vtmp[8], htmp[4];
+	float* ptmp[16];
+
+	__asm
+	{
+		mov edx, rIt
+		mov esi, rEnd
+
+		cmp edx, esi
+		jae forEnd
+
+		mov eax, iIt
+		mov ecx, posIt
+
+forBegin:
+		movzx edi, WORD PTR [eax   ] __asm shl edi, 4 __asm mov [ptmp   ], edi __asm vmovaps xmm0, XMMWORD PTR [edi + ecx] // v0i
+		movzx edi, WORD PTR [eax+16] __asm shl edi, 4 __asm mov [ptmp+ 4], edi __asm vmovaps xmm1, XMMWORD PTR [edi + ecx] // v4i
+		movzx edi, WORD PTR [eax+ 2] __asm shl edi, 4 __asm mov [ptmp+ 8], edi __asm vmovaps xmm2, XMMWORD PTR [edi + ecx] // v0j
+		movzx edi, WORD PTR [eax+18] __asm shl edi, 4 __asm mov [ptmp+12], edi __asm vmovaps xmm3, XMMWORD PTR [edi + ecx] // v4j
+		movzx edi, WORD PTR [eax+ 4] __asm shl edi, 4 __asm mov [ptmp+16], edi __asm vmovaps xmm4, XMMWORD PTR [edi + ecx] // v1i
+		movzx edi, WORD PTR [eax+20] __asm shl edi, 4 __asm mov [ptmp+20], edi __asm vmovaps xmm5, XMMWORD PTR [edi + ecx] // v5i
+		movzx edi, WORD PTR [eax+ 6] __asm shl edi, 4 __asm mov [ptmp+24], edi __asm vmovaps xmm6, XMMWORD PTR [edi + ecx] // v1j
+		movzx edi, WORD PTR [eax+22] __asm shl edi, 4 __asm mov [ptmp+28], edi __asm vmovaps xmm7, XMMWORD PTR [edi + ecx] // v5j
+
+		vinsertf128 ymm0, ymm0, xmm1, 1 __asm vmovaps YMMWORD PTR [vtmp    ], ymm0 // v04i
+		vinsertf128 ymm2, ymm2, xmm3, 1 __asm vmovaps YMMWORD PTR [vtmp+ 32], ymm2 // v04j
+		vinsertf128 ymm4, ymm4, xmm5, 1 __asm vmovaps YMMWORD PTR [vtmp+ 64], ymm4 // v15i
+		vinsertf128 ymm6, ymm6, xmm7, 1 __asm vmovaps YMMWORD PTR [vtmp+ 96], ymm6 // v15j
+
+		vmovaps ymm7, sMinusOneXYZOneW
+		vmulps ymm2, ymm2, ymm7 __asm vaddps ymm0, ymm0, ymm2 __asm vmovaps YMMWORD PTR [htmp   ], ymm0 // h04ij
+		vmulps ymm6, ymm6, ymm7 __asm vaddps ymm4, ymm4, ymm6 __asm vmovaps YMMWORD PTR [htmp+32], ymm4 // h15ij
+
+		movzx edi, WORD PTR [eax+ 8] __asm shl edi, 4 __asm mov [ptmp+32], edi __asm vmovaps xmm0, XMMWORD PTR [edi + ecx] // v2i
+		movzx edi, WORD PTR [eax+24] __asm shl edi, 4 __asm mov [ptmp+36], edi __asm vmovaps xmm1, XMMWORD PTR [edi + ecx] // v6i
+		movzx edi, WORD PTR [eax+10] __asm shl edi, 4 __asm mov [ptmp+40], edi __asm vmovaps xmm2, XMMWORD PTR [edi + ecx] // v2j
+		movzx edi, WORD PTR [eax+26] __asm shl edi, 4 __asm mov [ptmp+44], edi __asm vmovaps xmm3, XMMWORD PTR [edi + ecx] // v6j
+		movzx edi, WORD PTR [eax+12] __asm shl edi, 4 __asm mov [ptmp+48], edi __asm vmovaps xmm4, XMMWORD PTR [edi + ecx] // v3i
+		movzx edi, WORD PTR [eax+28] __asm shl edi, 4 __asm mov [ptmp+52], edi __asm vmovaps xmm5, XMMWORD PTR [edi + ecx] // v7i
+		movzx edi, WORD PTR [eax+14] __asm shl edi, 4 __asm mov [ptmp+56], edi __asm vmovaps xmm6, XMMWORD PTR [edi + ecx] // v3j
+		movzx edi, WORD PTR [eax+30] __asm shl edi, 4 __asm mov [ptmp+60], edi __asm vmovaps xmm7, XMMWORD PTR [edi + ecx] // v7j
+
+		vinsertf128 ymm0, ymm0, xmm1, 1 __asm vmovaps YMMWORD PTR [vtmp+128], ymm0 // v26i
+		vinsertf128 ymm2, ymm2, xmm3, 1 __asm vmovaps YMMWORD PTR [vtmp+160], ymm2 // v26j
+		vinsertf128 ymm4, ymm4, xmm5, 1 __asm vmovaps YMMWORD PTR [vtmp+192], ymm4 // v37i
+		vinsertf128 ymm6, ymm6, xmm7, 1 __asm vmovaps YMMWORD PTR [vtmp+224], ymm6 // v37j
+
+		vmovaps ymm7, sMinusOneXYZOneW
+		vmulps ymm2, ymm2, ymm7 __asm vaddps ymm2, ymm0, ymm2 __asm vmovaps YMMWORD PTR [htmp+64], ymm2 // h26ij
+		vmulps ymm6, ymm6, ymm7 __asm vaddps ymm6, ymm4, ymm6 __asm vmovaps YMMWORD PTR [htmp+96], ymm6 // h37ij
+
+		vmovaps ymm0, YMMWORD PTR [htmp   ] // h04ij
+		vmovaps ymm4, YMMWORD PTR [htmp+32] // h15ij
+
+		vunpcklps ymm1, ymm0, ymm2 // a
+		vunpckhps ymm3, ymm0, ymm2 // b
+		vunpcklps ymm5, ymm4, ymm6 // c
+		vunpckhps ymm7, ymm4, ymm6 // d
+
+		vunpcklps ymm0, ymm1, ymm5 // hxij
+		vunpckhps ymm2, ymm1, ymm5 // hyij
+		vunpcklps ymm4, ymm3, ymm7 // hzij
+		vunpckhps ymm6, ymm3, ymm7 // vwij
+
+		vmovaps ymm7, sEpsilon
+		vmovaps ymm5, sOne
+		vmovaps ymm3, stiffness
+		vmovaps ymm1, YMMWORD PTR [edx] // rij
+
+		vmulps ymm0, ymm0, ymm0 __asm vaddps ymm0, ymm0, ymm7 // e2ij
+		vmulps ymm2, ymm2, ymm2 __asm vaddps ymm0, ymm0, ymm2
+		vmulps ymm4, ymm4, ymm4 __asm vaddps ymm0, ymm0, ymm4
+
+		vcmpgt_oqps ymm2, ymm1, ymm7 // mask
+		vrsqrtps ymm0, ymm0 __asm vmulps ymm0, ymm0, ymm1 // erij
+		vsubps ymm5, ymm5, ymm0 __asm vandps ymm5, ymm5, ymm2
+		vaddps ymm6, ymm6, ymm7 __asm vrcpps ymm6, ymm6
+
+		vmulps ymm6, ymm6, ymm3 __asm vmulps ymm6, ymm6, ymm5 // exij
+
+		vmovaps ymm7, sMaskXY
+		vandps ymm7, ymm7, ymm6 // exlo
+		vxorps ymm6, ymm6, ymm7 // exhi
+
+		vmovaps ymm4, YMMWORD PTR [htmp   ] // h04ij
+		vmovaps ymm0, YMMWORD PTR [vtmp   ] // v04i
+		vmovaps ymm1, YMMWORD PTR [vtmp+ 32] // v04j
+
+		vpermilps ymm5, ymm7, 0xc0 __asm vmulps ymm4, ymm4, ymm5 // f04ij
+		vpermilps ymm2, ymm0, 0xff __asm vmulps ymm2, ymm2, ymm4 __asm vsubps ymm0, ymm0, ymm2 // u04i
+		vpermilps ymm3, ymm1, 0xff __asm vmulps ymm3, ymm3, ymm4 __asm vaddps ymm1, ymm1, ymm3 // u04j
+
+		vextractf128 xmm2, ymm0, 1
+		vextractf128 xmm3, ymm1, 1
+
+		mov edi, [ptmp   ] __asm vmovaps XMMWORD PTR [edi + ecx], xmm0 // v0i
+		mov edi, [ptmp+ 8] __asm vmovaps XMMWORD PTR [edi + ecx], xmm1 // v0j
+		mov edi, [ptmp+ 4] __asm vmovaps XMMWORD PTR [edi + ecx], xmm2 // v4i
+		mov edi, [ptmp+12] __asm vmovaps XMMWORD PTR [edi + ecx], xmm3 // v4j
+
+		vmovaps ymm4, YMMWORD PTR [htmp+ 32] // h15ij
+		vmovaps ymm0, YMMWORD PTR [vtmp+ 64] // v15i
+		vmovaps ymm1, YMMWORD PTR [vtmp+ 96] // v15j
+
+		vpermilps ymm5, ymm7, 0xd5 __asm vmulps ymm4, ymm4, ymm5 // f15ij
+		vpermilps ymm2, ymm0, 0xff __asm vmulps ymm2, ymm2, ymm4 __asm vsubps ymm0, ymm0, ymm2 // u15i
+		vpermilps ymm3, ymm1, 0xff __asm vmulps ymm3, ymm3, ymm4 __asm vaddps ymm1, ymm1, ymm3 // u15j
+
+		vextractf128 xmm2, ymm0, 1
+		vextractf128 xmm3, ymm1, 1
+
+		mov edi, [ptmp+16] __asm vmovaps XMMWORD PTR [edi + ecx], xmm0 // v1i
+		mov edi, [ptmp+24] __asm vmovaps XMMWORD PTR [edi + ecx], xmm1 // v1j
+		mov edi, [ptmp+20] __asm vmovaps XMMWORD PTR [edi + ecx], xmm2 // v5i
+		mov edi, [ptmp+28] __asm vmovaps XMMWORD PTR [edi + ecx], xmm3 // v5j
+
+		vmovaps ymm4, YMMWORD PTR [htmp+ 64] // h26ij
+		vmovaps ymm0, YMMWORD PTR [vtmp+128] // v26i
+		vmovaps ymm1, YMMWORD PTR [vtmp+160] // v26j
+
+		vpermilps ymm5, ymm6, 0x2a __asm vmulps ymm4, ymm4, ymm5 // f26ij
+		vpermilps ymm2, ymm0, 0xff __asm vmulps ymm2, ymm2, ymm4 __asm vsubps ymm0, ymm0, ymm2 // u26i
+		vpermilps ymm3, ymm1, 0xff __asm vmulps ymm3, ymm3, ymm4 __asm vaddps ymm1, ymm1, ymm3 // u26j
+
+		vextractf128 xmm2, ymm0, 1
+		vextractf128 xmm3, ymm1, 1
+
+		mov edi, [ptmp+32] __asm vmovaps XMMWORD PTR [edi + ecx], xmm0 // v2i
+		mov edi, [ptmp+40] __asm vmovaps XMMWORD PTR [edi + ecx], xmm1 // v2j
+		mov edi, [ptmp+36] __asm vmovaps XMMWORD PTR [edi + ecx], xmm2 // v6i
+		mov edi, [ptmp+44] __asm vmovaps XMMWORD PTR [edi + ecx], xmm3 // v6j
+
+		vmovaps ymm4, YMMWORD PTR [htmp+ 96] // h37ij
+		vmovaps ymm0, YMMWORD PTR [vtmp+192] // v37i
+		vmovaps ymm1, YMMWORD PTR [vtmp+224] // v37j
+
+		vpermilps ymm5, ymm6, 0x3f __asm vmulps ymm4, ymm4, ymm5 // f37ij
+		vpermilps ymm2, ymm0, 0xff __asm vmulps ymm2, ymm2, ymm4 __asm vsubps ymm0, ymm0, ymm2 // u37i
+		vpermilps ymm3, ymm1, 0xff __asm vmulps ymm3, ymm3, ymm4 __asm vaddps ymm1, ymm1, ymm3 // u37j
+
+		vextractf128 xmm2, ymm0, 1
+		vextractf128 xmm3, ymm1, 1
+
+		mov edi, [ptmp+48] __asm vmovaps XMMWORD PTR [edi + ecx], xmm0 // v3i
+		mov edi, [ptmp+56] __asm vmovaps XMMWORD PTR [edi + ecx], xmm1 // v3j
+		mov edi, [ptmp+52] __asm vmovaps XMMWORD PTR [edi + ecx], xmm2 // v7i
+		mov edi, [ptmp+60] __asm vmovaps XMMWORD PTR [edi + ecx], xmm3 // v7j
+
+		add eax, 32
+		add edx, 32
+
+		cmp edx, esi
+		jb forBegin
+forEnd:
+	}
+
+	_mm256_zeroupper();
+}
+
+// AVX with useMultiplier
+template <>
+void solveConstraints<true, 1>(float* __restrict posIt, const float* __restrict rIt,
+ const float* __restrict rEnd, const uint16_t* __restrict iIt, const __m128& stiffnessRef)
+{
+ __m256 stiffness = _mm256_broadcast_ps(&stiffnessRef);
+ __m256 stretchLimit = _mm256_permute_ps(stiffness, 0xff);
+ __m256 compressionLimit = _mm256_permute_ps(stiffness, 0xaa);
+ __m256 multiplier = _mm256_permute_ps(stiffness, 0x55);
+ stiffness = _mm256_permute_ps(stiffness, 0x00);
+
+ __m256 vtmp[8], htmp[4];
+ float* ptmp[16];
+
+ __asm
+ {
+ mov edx, rIt
+ mov esi, rEnd
+
+ cmp edx, esi
+ jae forEnd
+
+ mov eax, iIt
+ mov ecx, posIt
+
+forBegin:
+ movzx edi, WORD PTR [eax ] __asm shl edi, 4 __asm mov [ptmp ], edi __asm vmovaps xmm0, XMMWORD PTR [edi + ecx] // v0i
+ movzx edi, WORD PTR [eax+16] __asm shl edi, 4 __asm mov [ptmp+ 4], edi __asm vmovaps xmm1, XMMWORD PTR [edi + ecx] // v4i
+ movzx edi, WORD PTR [eax+ 2] __asm shl edi, 4 __asm mov [ptmp+ 8], edi __asm vmovaps xmm2, XMMWORD PTR [edi + ecx] // v0j
+ movzx edi, WORD PTR [eax+18] __asm shl edi, 4 __asm mov [ptmp+12], edi __asm vmovaps xmm3, XMMWORD PTR [edi + ecx] // v4j
+ movzx edi, WORD PTR [eax+ 4] __asm shl edi, 4 __asm mov [ptmp+16], edi __asm vmovaps xmm4, XMMWORD PTR [edi + ecx] // v1i
+ movzx edi, WORD PTR [eax+20] __asm shl edi, 4 __asm mov [ptmp+20], edi __asm vmovaps xmm5, XMMWORD PTR [edi + ecx] // v5i
+ movzx edi, WORD PTR [eax+ 6] __asm shl edi, 4 __asm mov [ptmp+24], edi __asm vmovaps xmm6, XMMWORD PTR [edi + ecx] // v1j
+ movzx edi, WORD PTR [eax+22] __asm shl edi, 4 __asm mov [ptmp+28], edi __asm vmovaps xmm7, XMMWORD PTR [edi + ecx] // v5j
+
+ vinsertf128 ymm0, ymm0, xmm1, 1 __asm vmovaps YMMWORD PTR [vtmp ], ymm0 // v04i
+ vinsertf128 ymm2, ymm2, xmm3, 1 __asm vmovaps YMMWORD PTR [vtmp+ 32], ymm2 // v04j
+ vinsertf128 ymm4, ymm4, xmm5, 1 __asm vmovaps YMMWORD PTR [vtmp+ 64], ymm4 // v15i
+ vinsertf128 ymm6, ymm6, xmm7, 1 __asm vmovaps YMMWORD PTR [vtmp+ 96], ymm6 // v15j
+
+ vmovaps ymm7, sMinusOneXYZOneW
+ vmulps ymm2, ymm2, ymm7 __asm vaddps ymm0, ymm0, ymm2 __asm vmovaps YMMWORD PTR [htmp ], ymm0 // h04ij
+ vmulps ymm6, ymm6, ymm7 __asm vaddps ymm4, ymm4, ymm6 __asm vmovaps YMMWORD PTR [htmp+32], ymm4 // h15ij
+
+ movzx edi, WORD PTR [eax+ 8] __asm shl edi, 4 __asm mov [ptmp+32], edi __asm vmovaps xmm0, XMMWORD PTR [edi + ecx] // v2i
+ movzx edi, WORD PTR [eax+24] __asm shl edi, 4 __asm mov [ptmp+36], edi __asm vmovaps xmm1, XMMWORD PTR [edi + ecx] // v6i
+ movzx edi, WORD PTR [eax+10] __asm shl edi, 4 __asm mov [ptmp+40], edi __asm vmovaps xmm2, XMMWORD PTR [edi + ecx] // v2j
+ movzx edi, WORD PTR [eax+26] __asm shl edi, 4 __asm mov [ptmp+44], edi __asm vmovaps xmm3, XMMWORD PTR [edi + ecx] // v6j
+ movzx edi, WORD PTR [eax+12] __asm shl edi, 4 __asm mov [ptmp+48], edi __asm vmovaps xmm4, XMMWORD PTR [edi + ecx] // v3i
+ movzx edi, WORD PTR [eax+28] __asm shl edi, 4 __asm mov [ptmp+52], edi __asm vmovaps xmm5, XMMWORD PTR [edi + ecx] // v7i
+ movzx edi, WORD PTR [eax+14] __asm shl edi, 4 __asm mov [ptmp+56], edi __asm vmovaps xmm6, XMMWORD PTR [edi + ecx] // v3j
+ movzx edi, WORD PTR [eax+30] __asm shl edi, 4 __asm mov [ptmp+60], edi __asm vmovaps xmm7, XMMWORD PTR [edi + ecx] // v7j
+
+ vinsertf128 ymm0, ymm0, xmm1, 1 __asm vmovaps YMMWORD PTR [vtmp+128], ymm0 // v26i
+ vinsertf128 ymm2, ymm2, xmm3, 1 __asm vmovaps YMMWORD PTR [vtmp+160], ymm2 // v26j
+ vinsertf128 ymm4, ymm4, xmm5, 1 __asm vmovaps YMMWORD PTR [vtmp+192], ymm4 // v37i
+ vinsertf128 ymm6, ymm6, xmm7, 1 __asm vmovaps YMMWORD PTR [vtmp+224], ymm6 // v37j
+
+ vmovaps ymm7, sMinusOneXYZOneW
+ vmulps ymm2, ymm2, ymm7 __asm vaddps ymm2, ymm0, ymm2 __asm vmovaps YMMWORD PTR [htmp+64], ymm2 // h26ij
+ vmulps ymm6, ymm6, ymm7 __asm vaddps ymm6, ymm4, ymm6 __asm vmovaps YMMWORD PTR [htmp+96], ymm6 // h37ij
+
+ vmovaps ymm0, YMMWORD PTR [htmp ] // h04ij
+ vmovaps ymm4, YMMWORD PTR [htmp+32] // h15ij
+
+ vunpcklps ymm1, ymm0, ymm2 // a
+ vunpckhps ymm3, ymm0, ymm2 // b
+ vunpcklps ymm5, ymm4, ymm6 // c
+ vunpckhps ymm7, ymm4, ymm6 // d
+
+ vunpcklps ymm0, ymm1, ymm5 // hxij
+ vunpckhps ymm2, ymm1, ymm5 // hyij
+ vunpcklps ymm4, ymm3, ymm7 // hzij
+ vunpckhps ymm6, ymm3, ymm7 // vwij
+
+ vmovaps ymm7, sEpsilon
+ vmovaps ymm5, sOne
+ vmovaps ymm3, stiffness
+ vmovaps ymm1, YMMWORD PTR [edx] // rij
+
+ vmulps ymm0, ymm0, ymm0 __asm vaddps ymm0, ymm0, ymm7 // e2ij
+ vmulps ymm2, ymm2, ymm2 __asm vaddps ymm0, ymm0, ymm2
+ vmulps ymm4, ymm4, ymm4 __asm vaddps ymm0, ymm0, ymm4
+
+ vcmpgt_oqps ymm2, ymm1, ymm7 // mask
+ vrsqrtps ymm0, ymm0 __asm vmulps ymm0, ymm0, ymm1 // erij
+ vsubps ymm5, ymm5, ymm0 __asm vandps ymm5, ymm5, ymm2
+ vaddps ymm6, ymm6, ymm7 __asm vrcpps ymm6, ymm6
+
+ vmovaps ymm0, stretchLimit // multiplier block
+ vmovaps ymm1, compressionLimit
+ vmovaps ymm2, multiplier
+ vminps ymm0, ymm0, ymm5
+ vmaxps ymm1, ymm1, ymm0
+ vmulps ymm2, ymm2, ymm1
+ vsubps ymm5, ymm5, ymm2
+
+ vmulps ymm6, ymm6, ymm3 __asm vmulps ymm6, ymm6, ymm5 // exij
+
+ vmovaps ymm7, sMaskXY
+ vandps ymm7, ymm7, ymm6 // exlo
+ vxorps ymm6, ymm6, ymm7 // exhi
+
+ vmovaps ymm4, YMMWORD PTR [htmp ] // h04ij
+ vmovaps ymm0, YMMWORD PTR [vtmp ] // v04i
+ vmovaps ymm1, YMMWORD PTR [vtmp+ 32] // v04j
+
+ vpermilps ymm5, ymm7, 0xc0 __asm vmulps ymm4, ymm4, ymm5 // f04ij
+ vpermilps ymm2, ymm0, 0xff __asm vmulps ymm2, ymm2, ymm4 __asm vsubps ymm0, ymm0, ymm2 // u04i
+ vpermilps ymm3, ymm1, 0xff __asm vmulps ymm3, ymm3, ymm4 __asm vaddps ymm1, ymm1, ymm3 // u04j
+
+ vextractf128 xmm2, ymm0, 1
+ vextractf128 xmm3, ymm1, 1
+
+ mov edi, [ptmp ] __asm vmovaps XMMWORD PTR [edi + ecx], xmm0 // v0i
+ mov edi, [ptmp+ 8] __asm vmovaps XMMWORD PTR [edi + ecx], xmm1 // v0j
+ mov edi, [ptmp+ 4] __asm vmovaps XMMWORD PTR [edi + ecx], xmm2 // v4i
+ mov edi, [ptmp+12] __asm vmovaps XMMWORD PTR [edi + ecx], xmm3 // v4j
+
+ vmovaps ymm4, YMMWORD PTR [htmp+ 32] // h15ij
+ vmovaps ymm0, YMMWORD PTR [vtmp+ 64] // v15i
+ vmovaps ymm1, YMMWORD PTR [vtmp+ 96] // v15j
+
+ vpermilps ymm5, ymm7, 0xd5 __asm vmulps ymm4, ymm4, ymm5 // f15ij
+ vpermilps ymm2, ymm0, 0xff __asm vmulps ymm2, ymm2, ymm4 __asm vsubps ymm0, ymm0, ymm2 // u15i
+ vpermilps ymm3, ymm1, 0xff __asm vmulps ymm3, ymm3, ymm4 __asm vaddps ymm1, ymm1, ymm3 // u15j
+
+ vextractf128 xmm2, ymm0, 1
+ vextractf128 xmm3, ymm1, 1
+
+ mov edi, [ptmp+16] __asm vmovaps XMMWORD PTR [edi + ecx], xmm0 // v1i
+ mov edi, [ptmp+24] __asm vmovaps XMMWORD PTR [edi + ecx], xmm1 // v1j
+ mov edi, [ptmp+20] __asm vmovaps XMMWORD PTR [edi + ecx], xmm2 // v5i
+ mov edi, [ptmp+28] __asm vmovaps XMMWORD PTR [edi + ecx], xmm3 // v5j
+
+ vmovaps ymm4, YMMWORD PTR [htmp+ 64] // h26ij
+ vmovaps ymm0, YMMWORD PTR [vtmp+128] // v26i
+ vmovaps ymm1, YMMWORD PTR [vtmp+160] // v26j
+
+ vpermilps ymm5, ymm6, 0x2a __asm vmulps ymm4, ymm4, ymm5 // f26ij
+ vpermilps ymm2, ymm0, 0xff __asm vmulps ymm2, ymm2, ymm4 __asm vsubps ymm0, ymm0, ymm2 // u26i
+ vpermilps ymm3, ymm1, 0xff __asm vmulps ymm3, ymm3, ymm4 __asm vaddps ymm1, ymm1, ymm3 // u26j
+
+ vextractf128 xmm2, ymm0, 1
+ vextractf128 xmm3, ymm1, 1
+
+ mov edi, [ptmp+32] __asm vmovaps XMMWORD PTR [edi + ecx], xmm0 // v2i
+ mov edi, [ptmp+40] __asm vmovaps XMMWORD PTR [edi + ecx], xmm1 // v2j
+ mov edi, [ptmp+36] __asm vmovaps XMMWORD PTR [edi + ecx], xmm2 // v6i
+ mov edi, [ptmp+44] __asm vmovaps XMMWORD PTR [edi + ecx], xmm3 // v6j
+
+ vmovaps ymm4, YMMWORD PTR [htmp+ 96] // h37ij
+ vmovaps ymm0, YMMWORD PTR [vtmp+192] // v37i
+ vmovaps ymm1, YMMWORD PTR [vtmp+224] // v37j
+
+ vpermilps ymm5, ymm6, 0x3f __asm vmulps ymm4, ymm4, ymm5 // f37ij
+ vpermilps ymm2, ymm0, 0xff __asm vmulps ymm2, ymm2, ymm4 __asm vsubps ymm0, ymm0, ymm2 // u37i
+ vpermilps ymm3, ymm1, 0xff __asm vmulps ymm3, ymm3, ymm4 __asm vaddps ymm1, ymm1, ymm3 // u37j
+
+ vextractf128 xmm2, ymm0, 1
+ vextractf128 xmm3, ymm1, 1
+
+ mov edi, [ptmp+48] __asm vmovaps XMMWORD PTR [edi + ecx], xmm0 // v3i
+ mov edi, [ptmp+56] __asm vmovaps XMMWORD PTR [edi + ecx], xmm1 // v3j
+ mov edi, [ptmp+52] __asm vmovaps XMMWORD PTR [edi + ecx], xmm2 // v7i
+ mov edi, [ptmp+60] __asm vmovaps XMMWORD PTR [edi + ecx], xmm3 // v7j
+
+ add eax, 32
+ add edx, 32
+
+ cmp edx, esi
+ jb forBegin
+forEnd:
+ }
+
+ _mm256_zeroupper();
+}
+
+#if _MSC_VER >= 1700
+// AVX2 without useMultiplier
+template <>
+void solveConstraints<false, 2>(float* __restrict posIt, const float* __restrict rIt,
+ const float* __restrict rEnd, const uint16_t* __restrict iIt, const __m128& stiffnessRef)
+{
+ __m256 stiffness = _mm256_broadcast_ss((const float*)&stiffnessRef);
+
+ __m256 vtmp[8], htmp[4];
+ float* ptmp[16];
+
+ __asm
+ {
+ mov edx, rIt
+ mov esi, rEnd
+
+ cmp edx, esi
+ jae forEnd
+
+ mov eax, iIt
+ mov ecx, posIt
+
+forBegin:
+ movzx edi, WORD PTR [eax ] __asm shl edi, 4 __asm mov [ptmp ], edi __asm vmovaps xmm0, XMMWORD PTR [edi + ecx] // v0i
+ movzx edi, WORD PTR [eax+16] __asm shl edi, 4 __asm mov [ptmp+ 4], edi __asm vmovaps xmm1, XMMWORD PTR [edi + ecx] // v4i
+ movzx edi, WORD PTR [eax+ 2] __asm shl edi, 4 __asm mov [ptmp+ 8], edi __asm vmovaps xmm2, XMMWORD PTR [edi + ecx] // v0j
+ movzx edi, WORD PTR [eax+18] __asm shl edi, 4 __asm mov [ptmp+12], edi __asm vmovaps xmm3, XMMWORD PTR [edi + ecx] // v4j
+ movzx edi, WORD PTR [eax+ 4] __asm shl edi, 4 __asm mov [ptmp+16], edi __asm vmovaps xmm4, XMMWORD PTR [edi + ecx] // v1i
+ movzx edi, WORD PTR [eax+20] __asm shl edi, 4 __asm mov [ptmp+20], edi __asm vmovaps xmm5, XMMWORD PTR [edi + ecx] // v5i
+ movzx edi, WORD PTR [eax+ 6] __asm shl edi, 4 __asm mov [ptmp+24], edi __asm vmovaps xmm6, XMMWORD PTR [edi + ecx] // v1j
+ movzx edi, WORD PTR [eax+22] __asm shl edi, 4 __asm mov [ptmp+28], edi __asm vmovaps xmm7, XMMWORD PTR [edi + ecx] // v5j
+
+ vinsertf128 ymm0, ymm0, xmm1, 1 __asm vmovaps YMMWORD PTR [vtmp ], ymm0 // v04i
+ vinsertf128 ymm2, ymm2, xmm3, 1 __asm vmovaps YMMWORD PTR [vtmp+ 32], ymm2 // v04j
+ vinsertf128 ymm4, ymm4, xmm5, 1 __asm vmovaps YMMWORD PTR [vtmp+ 64], ymm4 // v15i
+ vinsertf128 ymm6, ymm6, xmm7, 1 __asm vmovaps YMMWORD PTR [vtmp+ 96], ymm6 // v15j
+
+ vmovaps ymm7, sMinusOneXYZOneW
+ vfmadd213ps ymm2, ymm7, ymm0 __asm vmovaps YMMWORD PTR [htmp ], ymm2 // h04ij
+ vfmadd213ps ymm6, ymm7, ymm4 __asm vmovaps YMMWORD PTR [htmp+32], ymm6 // h15ij
+
+ movzx edi, WORD PTR [eax+ 8] __asm shl edi, 4 __asm mov [ptmp+32], edi __asm vmovaps xmm0, XMMWORD PTR [edi + ecx] // v2i
+ movzx edi, WORD PTR [eax+24] __asm shl edi, 4 __asm mov [ptmp+36], edi __asm vmovaps xmm1, XMMWORD PTR [edi + ecx] // v6i
+ movzx edi, WORD PTR [eax+10] __asm shl edi, 4 __asm mov [ptmp+40], edi __asm vmovaps xmm2, XMMWORD PTR [edi + ecx] // v2j
+ movzx edi, WORD PTR [eax+26] __asm shl edi, 4 __asm mov [ptmp+44], edi __asm vmovaps xmm3, XMMWORD PTR [edi + ecx] // v6j
+ movzx edi, WORD PTR [eax+12] __asm shl edi, 4 __asm mov [ptmp+48], edi __asm vmovaps xmm4, XMMWORD PTR [edi + ecx] // v3i
+ movzx edi, WORD PTR [eax+28] __asm shl edi, 4 __asm mov [ptmp+52], edi __asm vmovaps xmm5, XMMWORD PTR [edi + ecx] // v7i
+ movzx edi, WORD PTR [eax+14] __asm shl edi, 4 __asm mov [ptmp+56], edi __asm vmovaps xmm6, XMMWORD PTR [edi + ecx] // v3j
+ movzx edi, WORD PTR [eax+30] __asm shl edi, 4 __asm mov [ptmp+60], edi __asm vmovaps xmm7, XMMWORD PTR [edi + ecx] // v7j
+
+ vinsertf128 ymm0, ymm0, xmm1, 1 __asm vmovaps YMMWORD PTR [vtmp+128], ymm0 // v26i
+ vinsertf128 ymm2, ymm2, xmm3, 1 __asm vmovaps YMMWORD PTR [vtmp+160], ymm2 // v26j
+ vinsertf128 ymm4, ymm4, xmm5, 1 __asm vmovaps YMMWORD PTR [vtmp+192], ymm4 // v37i
+ vinsertf128 ymm6, ymm6, xmm7, 1 __asm vmovaps YMMWORD PTR [vtmp+224], ymm6 // v37j
+
+ vmovaps ymm7, sMinusOneXYZOneW
+ vfmadd213ps ymm2, ymm7, ymm0 __asm vmovaps YMMWORD PTR [htmp+64], ymm2 // h26ij
+ vfmadd213ps ymm6, ymm7, ymm4 __asm vmovaps YMMWORD PTR [htmp+96], ymm6 // h37ij
+
+ vmovaps ymm0, YMMWORD PTR [htmp ] // h04ij
+ vmovaps ymm4, YMMWORD PTR [htmp+32] // h15ij
+
+ vunpcklps ymm1, ymm0, ymm2 // a
+ vunpckhps ymm3, ymm0, ymm2 // b
+ vunpcklps ymm5, ymm4, ymm6 // c
+ vunpckhps ymm7, ymm4, ymm6 // d
+
+ vunpcklps ymm0, ymm1, ymm5 // hxij
+ vunpckhps ymm2, ymm1, ymm5 // hyij
+ vunpcklps ymm4, ymm3, ymm7 // hzij
+ vunpckhps ymm6, ymm3, ymm7 // vwij
+
+ vmovaps ymm7, sEpsilon
+ vmovaps ymm5, sOne
+ vmovaps ymm3, stiffness
+ vmovaps ymm1, YMMWORD PTR [edx] // rij
+
+ vfmadd213ps ymm4, ymm4, ymm7 // e2ij
+ vfmadd213ps ymm2, ymm2, ymm4
+ vfmadd213ps ymm0, ymm0, ymm2
+
+ vcmpgt_oqps ymm2, ymm1, ymm7 // mask
+ vrsqrtps ymm0, ymm0 __asm vfnmadd231ps ymm5, ymm0, ymm1 // erij
+ vandps ymm5, ymm5, ymm2
+ vaddps ymm6, ymm6, ymm7 __asm vrcpps ymm6, ymm6
+
+ vmulps ymm6, ymm6, ymm3 __asm vmulps ymm6, ymm6, ymm5 // exij
+
+ vmovaps ymm7, sMaskXY
+ vandps ymm7, ymm7, ymm6 // exlo
+ vxorps ymm6, ymm6, ymm7 // exhi
+
+ vmovaps ymm4, YMMWORD PTR [htmp ] // h04ij
+ vmovaps ymm0, YMMWORD PTR [vtmp ] // v04i
+ vmovaps ymm1, YMMWORD PTR [vtmp+ 32] // v04j
+
+ vpermilps ymm5, ymm7, 0xc0 __asm vmulps ymm4, ymm4, ymm5 // f04ij
+ vpermilps ymm2, ymm0, 0xff __asm vfnmadd231ps ymm0, ymm2, ymm4 // u04i
+ vpermilps ymm3, ymm1, 0xff __asm vfmadd231ps ymm1, ymm3, ymm4 // u04j
+
+ vextractf128 xmm2, ymm0, 1
+ vextractf128 xmm3, ymm1, 1
+
+ mov edi, [ptmp ] __asm vmovaps XMMWORD PTR [edi + ecx], xmm0 // v0i
+ mov edi, [ptmp+ 8] __asm vmovaps XMMWORD PTR [edi + ecx], xmm1 // v0j
+ mov edi, [ptmp+ 4] __asm vmovaps XMMWORD PTR [edi + ecx], xmm2 // v4i
+ mov edi, [ptmp+12] __asm vmovaps XMMWORD PTR [edi + ecx], xmm3 // v4j
+
+ vmovaps ymm4, YMMWORD PTR [htmp+ 32] // h15ij
+ vmovaps ymm0, YMMWORD PTR [vtmp+ 64] // v15i
+ vmovaps ymm1, YMMWORD PTR [vtmp+ 96] // v15j
+
+ vpermilps ymm5, ymm7, 0xd5 __asm vmulps ymm4, ymm4, ymm5 // f15ij
+ vpermilps ymm2, ymm0, 0xff __asm vfnmadd231ps ymm0, ymm2, ymm4 // u15i
+ vpermilps ymm3, ymm1, 0xff __asm vfmadd231ps ymm1, ymm3, ymm4 // u15j
+
+ vextractf128 xmm2, ymm0, 1
+ vextractf128 xmm3, ymm1, 1
+
+ mov edi, [ptmp+16] __asm vmovaps XMMWORD PTR [edi + ecx], xmm0 // v1i
+ mov edi, [ptmp+24] __asm vmovaps XMMWORD PTR [edi + ecx], xmm1 // v1j
+ mov edi, [ptmp+20] __asm vmovaps XMMWORD PTR [edi + ecx], xmm2 // v5i
+ mov edi, [ptmp+28] __asm vmovaps XMMWORD PTR [edi + ecx], xmm3 // v5j
+
+ vmovaps ymm4, YMMWORD PTR [htmp+ 64] // h26ij
+ vmovaps ymm0, YMMWORD PTR [vtmp+128] // v26i
+ vmovaps ymm1, YMMWORD PTR [vtmp+160] // v26j
+
+ vpermilps ymm5, ymm6, 0x2a __asm vmulps ymm4, ymm4, ymm5 // f26ij
+ vpermilps ymm2, ymm0, 0xff __asm vfnmadd231ps ymm0, ymm2, ymm4 // u26i
+ vpermilps ymm3, ymm1, 0xff __asm vfmadd231ps ymm1, ymm3, ymm4 // u26j
+
+ vextractf128 xmm2, ymm0, 1
+ vextractf128 xmm3, ymm1, 1
+
+ mov edi, [ptmp+32] __asm vmovaps XMMWORD PTR [edi + ecx], xmm0 // v2i
+ mov edi, [ptmp+40] __asm vmovaps XMMWORD PTR [edi + ecx], xmm1 // v2j
+ mov edi, [ptmp+36] __asm vmovaps XMMWORD PTR [edi + ecx], xmm2 // v6i
+ mov edi, [ptmp+44] __asm vmovaps XMMWORD PTR [edi + ecx], xmm3 // v6j
+
+ vmovaps ymm4, YMMWORD PTR [htmp+ 96] // h37ij
+ vmovaps ymm0, YMMWORD PTR [vtmp+192] // v37i
+ vmovaps ymm1, YMMWORD PTR [vtmp+224] // v37j
+
+ vpermilps ymm5, ymm6, 0x3f __asm vmulps ymm4, ymm4, ymm5 // f37ij
+ vpermilps ymm2, ymm0, 0xff __asm vfnmadd231ps ymm0, ymm2, ymm4 // u37i
+ vpermilps ymm3, ymm1, 0xff __asm vfmadd231ps ymm1, ymm3, ymm4 // u37j
+
+ vextractf128 xmm2, ymm0, 1
+ vextractf128 xmm3, ymm1, 1
+
+ mov edi, [ptmp+48] __asm vmovaps XMMWORD PTR [edi + ecx], xmm0 // v3i
+ mov edi, [ptmp+56] __asm vmovaps XMMWORD PTR [edi + ecx], xmm1 // v3j
+ mov edi, [ptmp+52] __asm vmovaps XMMWORD PTR [edi + ecx], xmm2 // v7i
+ mov edi, [ptmp+60] __asm vmovaps XMMWORD PTR [edi + ecx], xmm3 // v7j
+
+ add eax, 32
+ add edx, 32
+
+ cmp edx, esi
+ jb forBegin
+forEnd:
+ }
+
+ _mm256_zeroupper();
+}
+
+// AVX2 with useMultiplier
+template <>
+void solveConstraints<true, 2>(float* __restrict posIt, const float* __restrict rIt,
+ const float* __restrict rEnd, const uint16_t* __restrict iIt, const __m128& stiffnessRef)
+{
+ __m256 stiffness = _mm256_broadcast_ps(&stiffnessRef);
+ __m256 stretchLimit = _mm256_permute_ps(stiffness, 0xff);
+ __m256 compressionLimit = _mm256_permute_ps(stiffness, 0xaa);
+ __m256 multiplier = _mm256_permute_ps(stiffness, 0x55);
+ stiffness = _mm256_permute_ps(stiffness, 0x00);
+
+ __m256 vtmp[8], htmp[4];
+ float* ptmp[16];
+
+ __asm
+ {
+ mov edx, rIt
+ mov esi, rEnd
+
+ cmp edx, esi
+ jae forEnd
+
+ mov eax, iIt
+ mov ecx, posIt
+
+forBegin:
+ movzx edi, WORD PTR [eax ] __asm shl edi, 4 __asm mov [ptmp ], edi __asm vmovaps xmm0, XMMWORD PTR [edi + ecx] // v0i
+ movzx edi, WORD PTR [eax+16] __asm shl edi, 4 __asm mov [ptmp+ 4], edi __asm vmovaps xmm1, XMMWORD PTR [edi + ecx] // v4i
+ movzx edi, WORD PTR [eax+ 2] __asm shl edi, 4 __asm mov [ptmp+ 8], edi __asm vmovaps xmm2, XMMWORD PTR [edi + ecx] // v0j
+ movzx edi, WORD PTR [eax+18] __asm shl edi, 4 __asm mov [ptmp+12], edi __asm vmovaps xmm3, XMMWORD PTR [edi + ecx] // v4j
+ movzx edi, WORD PTR [eax+ 4] __asm shl edi, 4 __asm mov [ptmp+16], edi __asm vmovaps xmm4, XMMWORD PTR [edi + ecx] // v1i
+ movzx edi, WORD PTR [eax+20] __asm shl edi, 4 __asm mov [ptmp+20], edi __asm vmovaps xmm5, XMMWORD PTR [edi + ecx] // v5i
+ movzx edi, WORD PTR [eax+ 6] __asm shl edi, 4 __asm mov [ptmp+24], edi __asm vmovaps xmm6, XMMWORD PTR [edi + ecx] // v1j
+ movzx edi, WORD PTR [eax+22] __asm shl edi, 4 __asm mov [ptmp+28], edi __asm vmovaps xmm7, XMMWORD PTR [edi + ecx] // v5j
+
+ vinsertf128 ymm0, ymm0, xmm1, 1 __asm vmovaps YMMWORD PTR [vtmp ], ymm0 // v04i
+ vinsertf128 ymm2, ymm2, xmm3, 1 __asm vmovaps YMMWORD PTR [vtmp+ 32], ymm2 // v04j
+ vinsertf128 ymm4, ymm4, xmm5, 1 __asm vmovaps YMMWORD PTR [vtmp+ 64], ymm4 // v15i
+ vinsertf128 ymm6, ymm6, xmm7, 1 __asm vmovaps YMMWORD PTR [vtmp+ 96], ymm6 // v15j
+
+ vmovaps ymm7, sMinusOneXYZOneW
+ vfmadd213ps ymm2, ymm7, ymm0 __asm vmovaps YMMWORD PTR [htmp ], ymm2 // h04ij
+ vfmadd213ps ymm6, ymm7, ymm4 __asm vmovaps YMMWORD PTR [htmp+32], ymm6 // h15ij
+
+ movzx edi, WORD PTR [eax+ 8] __asm shl edi, 4 __asm mov [ptmp+32], edi __asm vmovaps xmm0, XMMWORD PTR [edi + ecx] // v2i
+ movzx edi, WORD PTR [eax+24] __asm shl edi, 4 __asm mov [ptmp+36], edi __asm vmovaps xmm1, XMMWORD PTR [edi + ecx] // v6i
+ movzx edi, WORD PTR [eax+10] __asm shl edi, 4 __asm mov [ptmp+40], edi __asm vmovaps xmm2, XMMWORD PTR [edi + ecx] // v2j
+ movzx edi, WORD PTR [eax+26] __asm shl edi, 4 __asm mov [ptmp+44], edi __asm vmovaps xmm3, XMMWORD PTR [edi + ecx] // v6j
+ movzx edi, WORD PTR [eax+12] __asm shl edi, 4 __asm mov [ptmp+48], edi __asm vmovaps xmm4, XMMWORD PTR [edi + ecx] // v3i
+ movzx edi, WORD PTR [eax+28] __asm shl edi, 4 __asm mov [ptmp+52], edi __asm vmovaps xmm5, XMMWORD PTR [edi + ecx] // v7i
+ movzx edi, WORD PTR [eax+14] __asm shl edi, 4 __asm mov [ptmp+56], edi __asm vmovaps xmm6, XMMWORD PTR [edi + ecx] // v3j
+ movzx edi, WORD PTR [eax+30] __asm shl edi, 4 __asm mov [ptmp+60], edi __asm vmovaps xmm7, XMMWORD PTR [edi + ecx] // v7j
+
+ vinsertf128 ymm0, ymm0, xmm1, 1 __asm vmovaps YMMWORD PTR [vtmp+128], ymm0 // v26i
+ vinsertf128 ymm2, ymm2, xmm3, 1 __asm vmovaps YMMWORD PTR [vtmp+160], ymm2 // v26j
+ vinsertf128 ymm4, ymm4, xmm5, 1 __asm vmovaps YMMWORD PTR [vtmp+192], ymm4 // v37i
+ vinsertf128 ymm6, ymm6, xmm7, 1 __asm vmovaps YMMWORD PTR [vtmp+224], ymm6 // v37j
+
+ vmovaps ymm7, sMinusOneXYZOneW
+ vfmadd213ps ymm2, ymm7, ymm0 __asm vmovaps YMMWORD PTR [htmp+64], ymm2 // h26ij
+ vfmadd213ps ymm6, ymm7, ymm4 __asm vmovaps YMMWORD PTR [htmp+96], ymm6 // h37ij
+
+ vmovaps ymm0, YMMWORD PTR [htmp ] // h04ij
+ vmovaps ymm4, YMMWORD PTR [htmp+32] // h15ij
+
+ vunpcklps ymm1, ymm0, ymm2 // a
+ vunpckhps ymm3, ymm0, ymm2 // b
+ vunpcklps ymm5, ymm4, ymm6 // c
+ vunpckhps ymm7, ymm4, ymm6 // d
+
+ vunpcklps ymm0, ymm1, ymm5 // hxij
+ vunpckhps ymm2, ymm1, ymm5 // hyij
+ vunpcklps ymm4, ymm3, ymm7 // hzij
+ vunpckhps ymm6, ymm3, ymm7 // vwij
+
+ vmovaps ymm7, sEpsilon
+ vmovaps ymm5, sOne
+ vmovaps ymm3, stiffness
+ vmovaps ymm1, YMMWORD PTR [edx] // rij
+
+ vfmadd213ps ymm4, ymm4, ymm7 // e2ij
+ vfmadd213ps ymm2, ymm2, ymm4
+ vfmadd213ps ymm0, ymm0, ymm2
+
+ vcmpgt_oqps ymm2, ymm1, ymm7 // mask
+ vrsqrtps ymm0, ymm0 __asm vfnmadd231ps ymm5, ymm0, ymm1 // erij
+ vandps ymm5, ymm5, ymm2
+ vaddps ymm6, ymm6, ymm7 __asm vrcpps ymm6, ymm6
+
+ vmovaps ymm0, stretchLimit // multiplier block
+ vmovaps ymm1, compressionLimit
+ vmovaps ymm2, multiplier
+ vminps ymm0, ymm0, ymm5
+ vmaxps ymm1, ymm1, ymm0
+ vfnmadd231ps ymm5, ymm1, ymm2
+
+ vmulps ymm6, ymm6, ymm3 __asm vmulps ymm6, ymm6, ymm5 // exij
+
+ vmovaps ymm7, sMaskXY
+ vandps ymm7, ymm7, ymm6 // exlo
+ vxorps ymm6, ymm6, ymm7 // exhi
+
+ vmovaps ymm4, YMMWORD PTR [htmp ] // h04ij
+ vmovaps ymm0, YMMWORD PTR [vtmp ] // v04i
+ vmovaps ymm1, YMMWORD PTR [vtmp+ 32] // v04j
+
+ vpermilps ymm5, ymm7, 0xc0 __asm vmulps ymm4, ymm4, ymm5 // f04ij
+ vpermilps ymm2, ymm0, 0xff __asm vfnmadd231ps ymm0, ymm2, ymm4 // u04i
+ vpermilps ymm3, ymm1, 0xff __asm vfmadd231ps ymm1, ymm3, ymm4 // u04j
+
+ vextractf128 xmm2, ymm0, 1
+ vextractf128 xmm3, ymm1, 1
+
+ mov edi, [ptmp ] __asm vmovaps XMMWORD PTR [edi + ecx], xmm0 // v0i
+ mov edi, [ptmp+ 8] __asm vmovaps XMMWORD PTR [edi + ecx], xmm1 // v0j
+ mov edi, [ptmp+ 4] __asm vmovaps XMMWORD PTR [edi + ecx], xmm2 // v4i
+ mov edi, [ptmp+12] __asm vmovaps XMMWORD PTR [edi + ecx], xmm3 // v4j
+
+ vmovaps ymm4, YMMWORD PTR [htmp+ 32] // h15ij
+ vmovaps ymm0, YMMWORD PTR [vtmp+ 64] // v15i
+ vmovaps ymm1, YMMWORD PTR [vtmp+ 96] // v15j
+
+ vpermilps ymm5, ymm7, 0xd5 __asm vmulps ymm4, ymm4, ymm5 // f15ij
+ vpermilps ymm2, ymm0, 0xff __asm vfnmadd231ps ymm0, ymm2, ymm4 // u15i
+ vpermilps ymm3, ymm1, 0xff __asm vfmadd231ps ymm1, ymm3, ymm4 // u15j
+
+ vextractf128 xmm2, ymm0, 1
+ vextractf128 xmm3, ymm1, 1
+
+ mov edi, [ptmp+16] __asm vmovaps XMMWORD PTR [edi + ecx], xmm0 // v1i
+ mov edi, [ptmp+24] __asm vmovaps XMMWORD PTR [edi + ecx], xmm1 // v1j
+ mov edi, [ptmp+20] __asm vmovaps XMMWORD PTR [edi + ecx], xmm2 // v5i
+ mov edi, [ptmp+28] __asm vmovaps XMMWORD PTR [edi + ecx], xmm3 // v5j
+
+ vmovaps ymm4, YMMWORD PTR [htmp+ 64] // h26ij
+ vmovaps ymm0, YMMWORD PTR [vtmp+128] // v26i
+ vmovaps ymm1, YMMWORD PTR [vtmp+160] // v26j
+
+ vpermilps ymm5, ymm6, 0x2a __asm vmulps ymm4, ymm4, ymm5 // f26ij
+ vpermilps ymm2, ymm0, 0xff __asm vfnmadd231ps ymm0, ymm2, ymm4 // u26i
+ vpermilps ymm3, ymm1, 0xff __asm vfmadd231ps ymm1, ymm3, ymm4 // u26j
+
+ vextractf128 xmm2, ymm0, 1
+ vextractf128 xmm3, ymm1, 1
+
+ mov edi, [ptmp+32] __asm vmovaps XMMWORD PTR [edi + ecx], xmm0 // v2i
+ mov edi, [ptmp+40] __asm vmovaps XMMWORD PTR [edi + ecx], xmm1 // v2j
+ mov edi, [ptmp+36] __asm vmovaps XMMWORD PTR [edi + ecx], xmm2 // v6i
+ mov edi, [ptmp+44] __asm vmovaps XMMWORD PTR [edi + ecx], xmm3 // v6j
+
+ vmovaps ymm4, YMMWORD PTR [htmp+ 96] // h37ij
+ vmovaps ymm0, YMMWORD PTR [vtmp+192] // v37i
+ vmovaps ymm1, YMMWORD PTR [vtmp+224] // v37j
+
+ vpermilps ymm5, ymm6, 0x3f __asm vmulps ymm4, ymm4, ymm5 // f37ij
+ vpermilps ymm2, ymm0, 0xff __asm vfnmadd231ps ymm0, ymm2, ymm4 // u37i
+ vpermilps ymm3, ymm1, 0xff __asm vfmadd231ps ymm1, ymm3, ymm4 // u37j
+
+ vextractf128 xmm2, ymm0, 1
+ vextractf128 xmm3, ymm1, 1
+
+ mov edi, [ptmp+48] __asm vmovaps XMMWORD PTR [edi + ecx], xmm0 // v3i
+ mov edi, [ptmp+56] __asm vmovaps XMMWORD PTR [edi + ecx], xmm1 // v3j
+ mov edi, [ptmp+52] __asm vmovaps XMMWORD PTR [edi + ecx], xmm2 // v7i
+ mov edi, [ptmp+60] __asm vmovaps XMMWORD PTR [edi + ecx], xmm3 // v7j
+
+ add eax, 32
+ add edx, 32
+
+ cmp edx, esi
+ jb forBegin
+forEnd:
+ }
+
+ _mm256_zeroupper();
+}
+#endif // _MSC_VER >= 1700
+
+// clang-format:enable
+
+#else // _M_IX86
+
+template void solveConstraints<false, 1>(float* __restrict, const float* __restrict, const float* __restrict,
+ const uint16_t* __restrict, const __m128&);
+
+template void solveConstraints<true, 1>(float* __restrict, const float* __restrict, const float* __restrict,
+ const uint16_t* __restrict, const __m128&);
+
+template void solveConstraints<false, 2>(float* __restrict, const float* __restrict, const float* __restrict,
+ const uint16_t* __restrict, const __m128&);
+
+template void solveConstraints<true, 2>(float* __restrict, const float* __restrict, const float* __restrict,
+ const uint16_t* __restrict, const __m128&);
+
+#endif // _M_IX86
+
+} // namespace avx
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/neon/NeonCollision.cpp b/PhysX_3.4/Source/LowLevelCloth/src/neon/NeonCollision.cpp
new file mode 100644
index 00000000..1ecaf277
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/neon/NeonCollision.cpp
@@ -0,0 +1,34 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef __ARM_NEON__
+#error This file needs to be compiled with NEON support!
+#endif
+
+#include "SwCollision.cpp"
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/neon/NeonSelfCollision.cpp b/PhysX_3.4/Source/LowLevelCloth/src/neon/NeonSelfCollision.cpp
new file mode 100644
index 00000000..1a652711
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/neon/NeonSelfCollision.cpp
@@ -0,0 +1,34 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef __ARM_NEON__
+#error This file needs to be compiled with NEON support!
+#endif
+
+#include "SwSelfCollision.cpp"
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/neon/NeonSolverKernel.cpp b/PhysX_3.4/Source/LowLevelCloth/src/neon/NeonSolverKernel.cpp
new file mode 100644
index 00000000..fa193fc2
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/neon/NeonSolverKernel.cpp
@@ -0,0 +1,49 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef __ARM_NEON__
+#error This file needs to be compiled with NEON support!
+#endif
+
+#include "SwSolverKernel.cpp"
+
+#include <cpu-features.h>
+
+namespace physx
+{
+namespace cloth
+{
+bool neonSolverKernel(SwCloth const& cloth, SwClothData& data, SwKernelAllocator& allocator,
+ IterationStateFactory& factory, PxProfileZone* profileZone)
+{
+ return ANDROID_CPU_ARM_FEATURE_NEON & android_getCpuFeatures() &&
+ (SwSolverKernel<Simd4f>(cloth, data, allocator, factory, profileZone)(), true);
+}
+}
+}
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/neon/SwCollisionHelpers.h b/PhysX_3.4/Source/LowLevelCloth/src/neon/SwCollisionHelpers.h
new file mode 100644
index 00000000..6f1b0f58
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/neon/SwCollisionHelpers.h
@@ -0,0 +1,87 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#ifdef _M_ARM
+#include <arm_neon.h>
+#endif
+
+namespace physx
+{
+namespace cloth
+{
+
+uint32_t findBitSet(uint32_t mask)
+{
+#ifdef _M_ARM
+ __n64 t = { mask };
+ return 31 - (vclz_u32(t)).n64_u32[0];
+#else
+ return 31 - __builtin_clz(mask);
+#endif
+}
+
+Simd4i intFloor(const Simd4f& v)
+{
+ int32x4_t neg = vreinterpretq_s32_u32(vshrq_n_u32(v.u4, 31));
+ return vsubq_s32(vcvtq_s32_f32(v.f4), neg);
+}
+
+Simd4i horizontalOr(const Simd4i& mask)
+{
+ uint32x2_t hi = vget_high_u32(mask.u4);
+ uint32x2_t lo = vget_low_u32(mask.u4);
+ uint32x2_t tmp = vorr_u32(lo, hi);
+ uint32x2_t rev = vrev64_u32(tmp);
+ uint32x2_t res = vorr_u32(tmp, rev);
+ return vcombine_u32(res, res);
+}
+
+Gather<Simd4i>::Gather(const Simd4i& index)
+{
+ PX_ALIGN(16, uint8x8x2_t) byteIndex = reinterpret_cast<const uint8x8x2_t&>(sPack);
+ uint8x8x2_t lohiIndex = reinterpret_cast<const uint8x8x2_t&>(index);
+ byteIndex.val[0] = vtbl2_u8(lohiIndex, byteIndex.val[0]);
+ byteIndex.val[1] = vtbl2_u8(lohiIndex, byteIndex.val[1]);
+ mPermute = vshlq_n_u32(reinterpret_cast<const uint32x4_t&>(byteIndex), 2);
+ mPermute = mPermute | sOffset | vcgtq_u32(index.u4, sMask.u4);
+}
+
+Simd4i Gather<Simd4i>::operator()(const Simd4i* ptr) const
+{
+ PX_ALIGN(16, uint8x8x2_t) result = reinterpret_cast<const uint8x8x2_t&>(mPermute);
+ const uint8x8x4_t* table = reinterpret_cast<const uint8x8x4_t*>(ptr);
+ result.val[0] = vtbl4_u8(*table, result.val[0]);
+ result.val[1] = vtbl4_u8(*table, result.val[1]);
+ return reinterpret_cast<const Simd4i&>(result);
+}
+
+} // namespace cloth
+} // namespace physx
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/scalar/SwCollisionHelpers.h b/PhysX_3.4/Source/LowLevelCloth/src/scalar/SwCollisionHelpers.h
new file mode 100644
index 00000000..a5a0075f
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/scalar/SwCollisionHelpers.h
@@ -0,0 +1,92 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+namespace physx
+{
+namespace cloth
+{
+
+#if !NV_SIMD_SIMD
+uint32_t findBitSet(uint32_t mask)
+{
+ uint32_t result = 0;
+ while(mask >>= 1)
+ ++result;
+ return result;
+}
+#endif
+
+inline Scalar4i intFloor(const Scalar4f& v)
+{
+ return Scalar4i(int(floor(v.f4[0])), int(floor(v.f4[1])), int(floor(v.f4[2])), int(floor(v.f4[3])));
+}
+
+inline Scalar4i horizontalOr(const Scalar4i& mask)
+{
+ return simd4i(mask.i4[0] | mask.i4[1] | mask.i4[2] | mask.i4[3]);
+}
+
// Scalar (non-SIMD) specialization of Gather: looks up four 32-bit lanes in
// a table, wrapping each index into the table size and zeroing lanes whose
// original index was out of range.
template <>
struct Gather<Scalar4i>
{
	inline Gather(const Scalar4i& index);
	inline Scalar4i operator()(const Scalar4i*) const;

	// per-lane index wrapped into the table: index & (gridSize - 1)
	Scalar4i mIndex;
	// per-lane mask: all-ones (-1) when the index was in range, 0 otherwise
	Scalar4i mOutOfRange;
};
+
+Gather<Scalar4i>::Gather(const Scalar4i& index)
+{
+ uint32_t mask = /* sGridSize */ 8 - 1;
+
+ mIndex.u4[0] = index.u4[0] & mask;
+ mIndex.u4[1] = index.u4[1] & mask;
+ mIndex.u4[2] = index.u4[2] & mask;
+ mIndex.u4[3] = index.u4[3] & mask;
+
+ mOutOfRange.i4[0] = index.u4[0] & ~mask ? 0 : -1;
+ mOutOfRange.i4[1] = index.u4[1] & ~mask ? 0 : -1;
+ mOutOfRange.i4[2] = index.u4[2] & ~mask ? 0 : -1;
+ mOutOfRange.i4[3] = index.u4[3] & ~mask ? 0 : -1;
+}
+
+Scalar4i Gather<Scalar4i>::operator()(const Scalar4i* ptr) const
+{
+ const int32_t* base = ptr->i4;
+ const int32_t* index = mIndex.i4;
+ const int32_t* mask = mOutOfRange.i4;
+ return Scalar4i(base[index[0]] & mask[0], base[index[1]] & mask[1], base[index[2]] & mask[2],
+ base[index[3]] & mask[3]);
+}
+
+} // namespace cloth
+} // namespace physx
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/sse2/SwCollisionHelpers.h b/PhysX_3.4/Source/LowLevelCloth/src/sse2/SwCollisionHelpers.h
new file mode 100644
index 00000000..85e33c3c
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/sse2/SwCollisionHelpers.h
@@ -0,0 +1,92 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#if PX_GCC_FAMILY
+#include <xmmintrin.h> // _BitScanForward
+#else
+#pragma warning(push)
+#pragma warning(disable : 4668) //'symbol' is not defined as a preprocessor macro, replacing with '0' for 'directives'
+#pragma warning(disable : 4987) // nonstandard extension used: 'throw (...)'
+#include <intrin.h> // _BitScanForward
+#pragma warning(pop)
+#endif
+
+namespace physx
+{
+namespace cloth
+{
+
+uint32_t findBitSet(uint32_t mask)
+{
+#if defined(_MSC_VER)
+ unsigned long result;
+ _BitScanForward(&result, unsigned long(mask));
+ return result;
+#else
+ return __builtin_ffs(mask) - 1;
+#endif
+}
+
// Per-lane floor() to integer: cvttps truncates toward zero, then 1 is
// subtracted for lanes whose sign bit is set. NOTE(review): exactly-integral
// negative inputs therefore yield trunc - 1 (e.g. -2.0f -> -3); assumed
// acceptable for collision-grid quantization - confirm against callers.
Simd4i intFloor(const Simd4f& v)
{
	Simd4i i = _mm_cvttps_epi32(v);
	return _mm_sub_epi32(i, _mm_srli_epi32(simd4i(v), 31));
}
+
+Simd4i horizontalOr(const Simd4i& mask)
+{
+ Simd4i tmp = mask | _mm_shuffle_epi32(mask, 0xb1); // w z y x -> z w x y
+ return tmp | _mm_shuffle_epi32(tmp, 0x4e); // w z y x -> y x w z
+}
+
Gather<Simd4i>::Gather(const Simd4i& index)
{
	// Broadcast index bits 2, 1 and 0 into full lane masks by shifting each
	// bit to the sign position and arithmetic-shifting it back down.
	mSelectQ = _mm_srai_epi32(index << 29, 31);
	mSelectD = _mm_srai_epi32(index << 30, 31);
	mSelectW = _mm_srai_epi32(index << 31, 31);
	// signed compare after flipping the sign bit emulates an unsigned
	// 'index > sSignedMask' range check
	mOutOfRange = (index ^ sIntSignBit) > sSignedMask;
}
+
// Emulated 8-entry gather: binary select down the index bits (bit 0 picks
// the element within a pair, bit 1 the pair within a register, bit 2 the
// low/high register), then zero lanes whose index was out of range.
Simd4i Gather<Simd4i>::operator()(const Simd4i* ptr) const
{
	// more efficient with _mm_shuffle_epi8 (SSSE3)
	Simd4i lo = ptr[0], hi = ptr[1];
	Simd4i m01 = select(mSelectW, splat<1>(lo), splat<0>(lo));
	Simd4i m23 = select(mSelectW, splat<3>(lo), splat<2>(lo));
	Simd4i m45 = select(mSelectW, splat<1>(hi), splat<0>(hi));
	Simd4i m67 = select(mSelectW, splat<3>(hi), splat<2>(hi));
	Simd4i m0123 = select(mSelectD, m23, m01);
	Simd4i m4567 = select(mSelectD, m67, m45);
	return select(mSelectQ, m4567, m0123) & ~mOutOfRange;
}
+
+} // namespace cloth
+} // namespace physx
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/sse2/SwSolveConstraints.h b/PhysX_3.4/Source/LowLevelCloth/src/sse2/SwSolveConstraints.h
new file mode 100644
index 00000000..cb141be5
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/sse2/SwSolveConstraints.h
@@ -0,0 +1,392 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
// Distance (edge) constraint relaxation kernel: processes four constraints
// per loop iteration with SSE intrinsics.
// - posIt: particle array, 4 floats per particle (x, y, z, w); the w lane
//   scales the applied correction (presumably inverse mass - TODO confirm
//   against the solver kernel that fills it).
// - rIt/rEnd: one rest value per constraint.
// - iIt: two uint16 particle indices per constraint (8 consumed per pass).
// - stiffness lanes: x = stiffness, y = multiplier, z = compression limit,
//   w = stretch limit; y/z/w are only read when useMultiplier is true.
template <bool useMultiplier>
void solveConstraints(float* __restrict posIt, const float* __restrict rIt, const float* __restrict rEnd,
                      const uint16_t* __restrict iIt, __m128 stiffness)
{
	__m128 sOne = _mm_set1_ps(1.0f);

	__m128 stretchLimit, compressionLimit, multiplier;
	if(useMultiplier)
	{
		stretchLimit = _mm_shuffle_ps(stiffness, stiffness, 0xff);
		compressionLimit = _mm_shuffle_ps(stiffness, stiffness, 0xaa);
		multiplier = _mm_shuffle_ps(stiffness, stiffness, 0x55);
	}
	stiffness = _mm_shuffle_ps(stiffness, stiffness, 0x00);

	for(; rIt != rEnd; rIt += 4, iIt += 8)
	{
		// pointers to the two particles of each of the four constraints
		float* p0i = posIt + iIt[0] * 4;
		float* p0j = posIt + iIt[1] * 4;
		float* p1i = posIt + iIt[2] * 4;
		float* p1j = posIt + iIt[3] * 4;
		float* p2i = posIt + iIt[4] * 4;
		float* p2j = posIt + iIt[5] * 4;
		float* p3i = posIt + iIt[6] * 4;
		float* p3j = posIt + iIt[7] * 4;

		__m128 v0i = _mm_load_ps(p0i);
		__m128 v0j = _mm_load_ps(p0j);
		__m128 v1i = _mm_load_ps(p1i);
		__m128 v1j = _mm_load_ps(p1j);
		__m128 v2i = _mm_load_ps(p2i);
		__m128 v2j = _mm_load_ps(p2j);
		__m128 v3i = _mm_load_ps(p3i);
		__m128 v3j = _mm_load_ps(p3j);

		// hij = pj - pi in xyz, wi + wj in w (sMinusOneXYZOneW is presumably
		// (-1, -1, -1, 1) - confirm at its definition site)
		__m128 h0ij = _mm_add_ps(v0j, _mm_mul_ps(v0i, sMinusOneXYZOneW));
		__m128 h1ij = _mm_add_ps(v1j, _mm_mul_ps(v1i, sMinusOneXYZOneW));
		__m128 h2ij = _mm_add_ps(v2j, _mm_mul_ps(v2i, sMinusOneXYZOneW));
		__m128 h3ij = _mm_add_ps(v3j, _mm_mul_ps(v3i, sMinusOneXYZOneW));

		// 4x4 transpose: collect the x, y, z and w lanes of the four h vectors
		__m128 a = _mm_unpacklo_ps(h0ij, h2ij);
		__m128 b = _mm_unpackhi_ps(h0ij, h2ij);
		__m128 c = _mm_unpacklo_ps(h1ij, h3ij);
		__m128 d = _mm_unpackhi_ps(h1ij, h3ij);

		__m128 hxij = _mm_unpacklo_ps(a, c);
		__m128 hyij = _mm_unpackhi_ps(a, c);
		__m128 hzij = _mm_unpacklo_ps(b, d);
		__m128 vwij = _mm_unpackhi_ps(b, d);

		// squared edge lengths; epsilon keeps rsqrt below well-defined
		__m128 rij = _mm_load_ps(rIt);
		__m128 e2ij = _mm_add_ps(gSimd4fEpsilon, _mm_add_ps(_mm_mul_ps(hxij, hxij),
		                                                    _mm_add_ps(_mm_mul_ps(hyij, hyij), _mm_mul_ps(hzij, hzij))));
		// edge error 1 - r/|h|, zeroed where the rest value is degenerate
		// (rij <= epsilon fails the cmpnle mask)
		__m128 mask = _mm_cmpnle_ps(rij, gSimd4fEpsilon);
		__m128 erij = _mm_and_ps(_mm_sub_ps(sOne, _mm_mul_ps(rij, _mm_rsqrt_ps(e2ij))), mask);

		if(useMultiplier)
		{
			// subtract the clamped portion of the error scaled by multiplier
			erij = _mm_sub_ps(erij, _mm_mul_ps(multiplier, _mm_max_ps(compressionLimit, _mm_min_ps(erij, stretchLimit))));
		}
		// per-constraint correction scale: error * stiffness / (wi + wj)
		__m128 exij = _mm_mul_ps(erij, _mm_mul_ps(stiffness, _mm_rcp_ps(_mm_add_ps(gSimd4fEpsilon, vwij))));

		// split the four scales so each can be broadcast to its constraint
		__m128 exlo = _mm_and_ps(sMaskXY, exij);
		__m128 exhi = _mm_andnot_ps(sMaskXY, exij);

		__m128 f0ij = _mm_mul_ps(h0ij, _mm_shuffle_ps(exlo, exlo, 0xc0));
		__m128 f1ij = _mm_mul_ps(h1ij, _mm_shuffle_ps(exlo, exlo, 0xd5));
		__m128 f2ij = _mm_mul_ps(h2ij, _mm_shuffle_ps(exhi, exhi, 0x2a));
		__m128 f3ij = _mm_mul_ps(h3ij, _mm_shuffle_ps(exhi, exhi, 0x3f));

		// move each endpoint along f, weighted by its own w lane
		__m128 u0i = _mm_add_ps(v0i, _mm_mul_ps(f0ij, _mm_shuffle_ps(v0i, v0i, 0xff)));
		__m128 u0j = _mm_sub_ps(v0j, _mm_mul_ps(f0ij, _mm_shuffle_ps(v0j, v0j, 0xff)));
		__m128 u1i = _mm_add_ps(v1i, _mm_mul_ps(f1ij, _mm_shuffle_ps(v1i, v1i, 0xff)));
		__m128 u1j = _mm_sub_ps(v1j, _mm_mul_ps(f1ij, _mm_shuffle_ps(v1j, v1j, 0xff)));
		__m128 u2i = _mm_add_ps(v2i, _mm_mul_ps(f2ij, _mm_shuffle_ps(v2i, v2i, 0xff)));
		__m128 u2j = _mm_sub_ps(v2j, _mm_mul_ps(f2ij, _mm_shuffle_ps(v2j, v2j, 0xff)));
		__m128 u3i = _mm_add_ps(v3i, _mm_mul_ps(f3ij, _mm_shuffle_ps(v3i, v3i, 0xff)));
		__m128 u3j = _mm_sub_ps(v3j, _mm_mul_ps(f3ij, _mm_shuffle_ps(v3j, v3j, 0xff)));

		_mm_store_ps(p0i, u0i);
		_mm_store_ps(p0j, u0j);
		_mm_store_ps(p1i, u1i);
		_mm_store_ps(p1j, u1j);
		_mm_store_ps(p2i, u2i);
		_mm_store_ps(p2j, u2j);
		_mm_store_ps(p3i, u3i);
		_mm_store_ps(p3j, u3j);
	}
}
+
+#if PX_X86
+
+// clang-format:disable
+
+// asm blocks in static condition blocks don't get removed, specialize
// x86/MSVC inline-assembly specialization of solveConstraints<false> (no
// stretch/compression clamping). Same algorithm as the intrinsics template
// above; particle indices are scaled to 16-byte offsets with 'shl edi, 4',
// the four h vectors are spilled to htmp and the eight particle byte offsets
// to ptmp for the write-back pass.
// NOTE(review): the asm accumulates h = pi - pj (addps operand order after
// the mulps by sMinusOneXYZOneW is swapped vs. the intrinsics version) and
// compensates by swapping subps/addps when applying the corrections.
template <>
void solveConstraints<false>(float* __restrict posIt, const float* __restrict rIt, const float* __restrict rEnd,
                             const uint16_t* __restrict iIt, __m128 stiffness)
{
	__m128 sOne = _mm_set1_ps(1.0f);
	__m128 sEpsilon = gSimd4fEpsilon;
	stiffness = _mm_shuffle_ps(stiffness, stiffness, 0x00);

	// spill slots read back by the asm block below
	__m128 htmp[4];
	float* ptmp[8];

	__asm
	{
		mov edx, rIt
		mov esi, rEnd

		cmp edx, esi
		jae forEnd

		mov eax, iIt
		mov ecx, posIt

forBegin:
		movzx edi, WORD PTR [eax ] __asm shl edi, 4 __asm mov [ptmp ], edi __asm movaps xmm0, XMMWORD PTR [edi + ecx] /* v0i */
		movzx edi, WORD PTR [eax+ 2] __asm shl edi, 4 __asm mov [ptmp+ 4], edi __asm movaps xmm2, XMMWORD PTR [edi + ecx] /* v0j */
		movzx edi, WORD PTR [eax+ 4] __asm shl edi, 4 __asm mov [ptmp+ 8], edi __asm movaps xmm1, XMMWORD PTR [edi + ecx] /* v1i */
		movzx edi, WORD PTR [eax+ 6] __asm shl edi, 4 __asm mov [ptmp+12], edi __asm movaps xmm3, XMMWORD PTR [edi + ecx] /* v1j */

		movaps xmm7, sMinusOneXYZOneW
		mulps xmm2, xmm7 __asm addps xmm0, xmm2 __asm movaps XMMWORD PTR [htmp ], xmm0 /* h0ij */
		mulps xmm3, xmm7 __asm addps xmm1, xmm3 __asm movaps XMMWORD PTR [htmp+16], xmm1 /* h1ij */

		movzx edi, WORD PTR [eax+ 8] __asm shl edi, 4 __asm mov [ptmp+16], edi __asm movaps xmm4, XMMWORD PTR [edi + ecx] /* v2i */
		movzx edi, WORD PTR [eax+10] __asm shl edi, 4 __asm mov [ptmp+20], edi __asm movaps xmm2, XMMWORD PTR [edi + ecx] /* v2j */
		movzx edi, WORD PTR [eax+12] __asm shl edi, 4 __asm mov [ptmp+24], edi __asm movaps xmm5, XMMWORD PTR [edi + ecx] /* v3i */
		movzx edi, WORD PTR [eax+14] __asm shl edi, 4 __asm mov [ptmp+28], edi __asm movaps xmm3, XMMWORD PTR [edi + ecx] /* v3j */

		mulps xmm2, xmm7 __asm addps xmm2, xmm4 __asm movaps XMMWORD PTR [htmp+32], xmm2 /* h2ij */
		mulps xmm3, xmm7 __asm addps xmm3, xmm5 __asm movaps XMMWORD PTR [htmp+48], xmm3 /* h3ij */

		movaps xmm4, xmm0
		movaps xmm5, xmm1

		unpcklps xmm0, xmm2 /* a */
		unpckhps xmm4, xmm2 /* b */
		unpcklps xmm1, xmm3 /* c */
		unpckhps xmm5, xmm3 /* d */

		movaps xmm2, xmm0
		movaps xmm6, xmm4

		unpcklps xmm0, xmm1 /* hxij */
		unpckhps xmm2, xmm1 /* hyij */
		unpcklps xmm4, xmm5 /* hzij */
		unpckhps xmm6, xmm5 /* vwij */

		movaps xmm7, sEpsilon
		movaps xmm5, sOne
		movaps xmm3, stiffness
		movaps xmm1, XMMWORD PTR [edx] /* rij */

		mulps xmm0, xmm0 __asm addps xmm0, xmm7 /* e2ij */
		mulps xmm2, xmm2 __asm addps xmm0, xmm2
		mulps xmm4, xmm4 __asm addps xmm0, xmm4

		rsqrtps xmm0, xmm0 __asm mulps xmm0, xmm1 /* erij */
		cmpnleps xmm1, xmm7 /* mask */
		subps xmm5, xmm0 __asm andps xmm5, xmm1
		addps xmm6, xmm7 __asm rcpps xmm6, xmm6

		mulps xmm6, xmm3 __asm mulps xmm6, xmm5 /* exij */

		movaps xmm7, sMaskXY
		andps xmm7, xmm6 /* exlo */
		xorps xmm6, xmm7 /* exhi */

		movaps xmm0, XMMWORD PTR [htmp ] /* h0ij */
		movaps xmm1, XMMWORD PTR [htmp+16] /* h1ij */
		movaps xmm2, XMMWORD PTR [htmp+32] /* h2ij */
		movaps xmm3, XMMWORD PTR [htmp+48] /* h3ij */

		pshufd xmm5, xmm7, 0xc0 __asm mulps xmm0, xmm5 /* f0ij */
		pshufd xmm7, xmm7, 0xd5 __asm mulps xmm1, xmm7 /* f1ij */
		pshufd xmm4, xmm6, 0x2a __asm mulps xmm2, xmm4 /* f2ij */
		pshufd xmm6, xmm6, 0x3f __asm mulps xmm3, xmm6 /* f3ij */

		mov edi, [ptmp ] __asm movaps xmm4, XMMWORD PTR [edi + ecx] /* v0i */
		pshufd xmm5, xmm4, 0xff __asm mulps xmm5, xmm0 __asm subps xmm4, xmm5 /* u0i */
		movaps XMMWORD PTR [edi + ecx], xmm4

		mov edi, [ptmp+ 4] __asm movaps xmm6, XMMWORD PTR [edi + ecx] /* v0j */
		pshufd xmm7, xmm6, 0xff __asm mulps xmm7, xmm0 __asm addps xmm6, xmm7 /* u0j */
		movaps XMMWORD PTR [edi + ecx], xmm6

		mov edi, [ptmp+ 8] __asm movaps xmm4, XMMWORD PTR [edi + ecx] /* v1i */
		pshufd xmm5, xmm4, 0xff __asm mulps xmm5, xmm1 __asm subps xmm4, xmm5 /* u1i */
		movaps XMMWORD PTR [edi + ecx], xmm4

		mov edi, [ptmp+12] __asm movaps xmm6, XMMWORD PTR [edi + ecx] /* v1j */
		pshufd xmm7, xmm6, 0xff __asm mulps xmm7, xmm1 __asm addps xmm6, xmm7 /* u1j */
		movaps XMMWORD PTR [edi + ecx], xmm6

		mov edi, [ptmp+16] __asm movaps xmm4, XMMWORD PTR [edi + ecx] /* v2i */
		pshufd xmm5, xmm4, 0xff __asm mulps xmm5, xmm2 __asm subps xmm4, xmm5 /* u2i */
		movaps XMMWORD PTR [edi + ecx], xmm4

		mov edi, [ptmp+20] __asm movaps xmm6, XMMWORD PTR [edi + ecx] /* v2j */
		pshufd xmm7, xmm6, 0xff __asm mulps xmm7, xmm2 __asm addps xmm6, xmm7 /* u2j */
		movaps XMMWORD PTR [edi + ecx], xmm6

		mov edi, [ptmp+24] __asm movaps xmm4, XMMWORD PTR [edi + ecx] /* v3i */
		pshufd xmm5, xmm4, 0xff __asm mulps xmm5, xmm3 __asm subps xmm4, xmm5 /* u3i */
		movaps XMMWORD PTR [edi + ecx], xmm4

		mov edi, [ptmp+28] __asm movaps xmm6, XMMWORD PTR [edi + ecx] /* v3j */
		pshufd xmm7, xmm6, 0xff __asm mulps xmm7, xmm3 __asm addps xmm6, xmm7 /* u3j */
		movaps XMMWORD PTR [edi + ecx], xmm6

		add eax, 16
		add edx, 16

		cmp edx, esi
		jb forBegin
forEnd:
	}
}
+
// x86/MSVC inline-assembly specialization with the multiplier path enabled:
// identical to the solveConstraints<false> asm above except for the
// 'multiplier block', which clamps the edge error between compressionLimit
// and stretchLimit and subtracts the scaled clamped portion before the
// stiffness scaling. stiffness lanes on entry: x = stiffness, y = multiplier,
// z = compression limit, w = stretch limit.
template <>
void solveConstraints<true>(float* __restrict posIt, const float* __restrict rIt, const float* __restrict rEnd,
                            const uint16_t* __restrict iIt, __m128 stiffness)
{
	__m128 sOne = _mm_set1_ps(1.0f);
	__m128 sEpsilon = gSimd4fEpsilon;
	__m128 stretchLimit = _mm_shuffle_ps(stiffness, stiffness, 0xff);
	__m128 compressionLimit = _mm_shuffle_ps(stiffness, stiffness, 0xaa);
	__m128 multiplier = _mm_shuffle_ps(stiffness, stiffness, 0x55);
	stiffness = _mm_shuffle_ps(stiffness, stiffness, 0x00);

	// spill slots read back by the asm block below
	__m128 htmp[4];
	float* ptmp[8];

	__asm
	{
		mov edx, rIt
		mov esi, rEnd

		cmp edx, esi
		jae forEnd

		mov eax, iIt
		mov ecx, posIt

forBegin:
		movzx edi, WORD PTR [eax ] __asm shl edi, 4 __asm mov [ptmp ], edi __asm movaps xmm0, XMMWORD PTR [edi + ecx] /* v0i */
		movzx edi, WORD PTR [eax+ 2] __asm shl edi, 4 __asm mov [ptmp+ 4], edi __asm movaps xmm2, XMMWORD PTR [edi + ecx] /* v0j */
		movzx edi, WORD PTR [eax+ 4] __asm shl edi, 4 __asm mov [ptmp+ 8], edi __asm movaps xmm1, XMMWORD PTR [edi + ecx] /* v1i */
		movzx edi, WORD PTR [eax+ 6] __asm shl edi, 4 __asm mov [ptmp+12], edi __asm movaps xmm3, XMMWORD PTR [edi + ecx] /* v1j */

		movaps xmm7, sMinusOneXYZOneW
		mulps xmm2, xmm7 __asm addps xmm0, xmm2 __asm movaps XMMWORD PTR [htmp ], xmm0 /* h0ij */
		mulps xmm3, xmm7 __asm addps xmm1, xmm3 __asm movaps XMMWORD PTR [htmp+16], xmm1 /* h1ij */

		movzx edi, WORD PTR [eax+ 8] __asm shl edi, 4 __asm mov [ptmp+16], edi __asm movaps xmm4, XMMWORD PTR [edi + ecx] /* v2i */
		movzx edi, WORD PTR [eax+10] __asm shl edi, 4 __asm mov [ptmp+20], edi __asm movaps xmm2, XMMWORD PTR [edi + ecx] /* v2j */
		movzx edi, WORD PTR [eax+12] __asm shl edi, 4 __asm mov [ptmp+24], edi __asm movaps xmm5, XMMWORD PTR [edi + ecx] /* v3i */
		movzx edi, WORD PTR [eax+14] __asm shl edi, 4 __asm mov [ptmp+28], edi __asm movaps xmm3, XMMWORD PTR [edi + ecx] /* v3j */

		mulps xmm2, xmm7 __asm addps xmm2, xmm4 __asm movaps XMMWORD PTR [htmp+32], xmm2 /* h2ij */
		mulps xmm3, xmm7 __asm addps xmm3, xmm5 __asm movaps XMMWORD PTR [htmp+48], xmm3 /* h3ij */

		movaps xmm4, xmm0
		movaps xmm5, xmm1

		unpcklps xmm0, xmm2 /* a */
		unpckhps xmm4, xmm2 /* b */
		unpcklps xmm1, xmm3 /* c */
		unpckhps xmm5, xmm3 /* d */

		movaps xmm2, xmm0
		movaps xmm6, xmm4

		unpcklps xmm0, xmm1 /* hxij */
		unpckhps xmm2, xmm1 /* hyij */
		unpcklps xmm4, xmm5 /* hzij */
		unpckhps xmm6, xmm5 /* vwij */

		movaps xmm7, sEpsilon
		movaps xmm5, sOne
		movaps xmm3, stiffness
		movaps xmm1, XMMWORD PTR [edx] /* rij */

		mulps xmm0, xmm0 __asm addps xmm0, xmm7 /* e2ij */
		mulps xmm2, xmm2 __asm addps xmm0, xmm2
		mulps xmm4, xmm4 __asm addps xmm0, xmm4

		rsqrtps xmm0, xmm0 __asm mulps xmm0, xmm1 /* erij */
		cmpnleps xmm1, xmm7 /* mask */
		subps xmm5, xmm0 __asm andps xmm5, xmm1
		addps xmm6, xmm7 __asm rcpps xmm6, xmm6

		movaps xmm0, stretchLimit /* multiplier block */
		movaps xmm1, compressionLimit
		movaps xmm2, multiplier
		minps xmm0, xmm5
		maxps xmm1, xmm0
		mulps xmm2, xmm1
		subps xmm5, xmm2

		mulps xmm6, xmm3 __asm mulps xmm6, xmm5 /* exij */

		movaps xmm7, sMaskXY
		andps xmm7, xmm6 /* exlo */
		xorps xmm6, xmm7 /* exhi */

		movaps xmm0, XMMWORD PTR [htmp ] /* h0ij */
		movaps xmm1, XMMWORD PTR [htmp+16] /* h1ij */
		movaps xmm2, XMMWORD PTR [htmp+32] /* h2ij */
		movaps xmm3, XMMWORD PTR [htmp+48] /* h3ij */

		pshufd xmm5, xmm7, 0xc0 __asm mulps xmm0, xmm5 /* f0ij */
		pshufd xmm7, xmm7, 0xd5 __asm mulps xmm1, xmm7 /* f1ij */
		pshufd xmm4, xmm6, 0x2a __asm mulps xmm2, xmm4 /* f2ij */
		pshufd xmm6, xmm6, 0x3f __asm mulps xmm3, xmm6 /* f3ij */

		mov edi, [ptmp ] __asm movaps xmm4, XMMWORD PTR [edi + ecx] /* v0i */
		pshufd xmm5, xmm4, 0xff __asm mulps xmm5, xmm0 __asm subps xmm4, xmm5 /* u0i */
		movaps XMMWORD PTR [edi + ecx], xmm4

		mov edi, [ptmp+ 4] __asm movaps xmm6, XMMWORD PTR [edi + ecx] /* v0j */
		pshufd xmm7, xmm6, 0xff __asm mulps xmm7, xmm0 __asm addps xmm6, xmm7 /* u0j */
		movaps XMMWORD PTR [edi + ecx], xmm6

		mov edi, [ptmp+ 8] __asm movaps xmm4, XMMWORD PTR [edi + ecx] /* v1i */
		pshufd xmm5, xmm4, 0xff __asm mulps xmm5, xmm1 __asm subps xmm4, xmm5 /* u1i */
		movaps XMMWORD PTR [edi + ecx], xmm4

		mov edi, [ptmp+12] __asm movaps xmm6, XMMWORD PTR [edi + ecx] /* v1j */
		pshufd xmm7, xmm6, 0xff __asm mulps xmm7, xmm1 __asm addps xmm6, xmm7 /* u1j */
		movaps XMMWORD PTR [edi + ecx], xmm6

		mov edi, [ptmp+16] __asm movaps xmm4, XMMWORD PTR [edi + ecx] /* v2i */
		pshufd xmm5, xmm4, 0xff __asm mulps xmm5, xmm2 __asm subps xmm4, xmm5 /* u2i */
		movaps XMMWORD PTR [edi + ecx], xmm4

		mov edi, [ptmp+20] __asm movaps xmm6, XMMWORD PTR [edi + ecx] /* v2j */
		pshufd xmm7, xmm6, 0xff __asm mulps xmm7, xmm2 __asm addps xmm6, xmm7 /* u2j */
		movaps XMMWORD PTR [edi + ecx], xmm6

		mov edi, [ptmp+24] __asm movaps xmm4, XMMWORD PTR [edi + ecx] /* v3i */
		pshufd xmm5, xmm4, 0xff __asm mulps xmm5, xmm3 __asm subps xmm4, xmm5 /* u3i */
		movaps XMMWORD PTR [edi + ecx], xmm4

		mov edi, [ptmp+28] __asm movaps xmm6, XMMWORD PTR [edi + ecx] /* v3j */
		pshufd xmm7, xmm6, 0xff __asm mulps xmm7, xmm3 __asm addps xmm6, xmm7 /* u3j */
		movaps XMMWORD PTR [edi + ecx], xmm6

		add eax, 16
		add edx, 16

		cmp edx, esi
		jb forBegin
forEnd:
	}
}
+
+// clang-format:enable
+
+#endif
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/windows/ClothClone.h b/PhysX_3.4/Source/LowLevelCloth/src/windows/ClothClone.h
new file mode 100644
index 00000000..4f02de76
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/windows/ClothClone.h
@@ -0,0 +1,225 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#include "foundation/PxMemory.h"
+
+#include "SwFactory.h"
+#include "SwFabric.h"
+#include "SwCloth.h"
+
+#include "ClothImpl.h"
+#include "ClothBase.h"
+
+namespace physx
+{
+namespace cloth
+{
+class DxFactory;
+class CuFactory;
+
+// make range from vector
+template <typename T, typename A>
+Range<T> makeRange(shdfnd::Array<T, A>& vec)
+{
+ T* ptr = vec.empty() ? 0 : vec.begin();
+ return Range<T>(ptr, ptr + vec.size());
+}
+
+template <typename T, typename A>
+Range<const T> makeRange(const shdfnd::Array<T, A>& vec)
+{
+ const T* ptr = vec.empty() ? 0 : vec.begin();
+ return Range<const T>(ptr, ptr + vec.size());
+}
+
+// fabric conversion
// Converts srcFabric into a fabric owned by dstFactory. Fabrics are shared
// by id: if dstFactory already holds a fabric with srcFabric.mId, that
// instance is returned and no copy is made.
// NOTE(review): the first template parameter is named SrcClothType but
// receives a fabric object - consider renaming at a later cleanup.
template <typename SrcClothType, typename DstFactoryType>
typename DstFactoryType::FabricType* convertFabric(const SrcClothType& srcFabric, DstFactoryType& dstFactory)
{
	typedef typename DstFactoryType::FabricType DstFabricType;

	// see if dstFactory already has a Fabric with this id
	DstFabricType* const* fIt = dstFactory.mFabrics.begin();
	DstFabricType* const* fEnd = dstFactory.mFabrics.end();
	for(; fIt != fEnd; ++fIt)
		if((*fIt)->mId == srcFabric.mId)
			return *fIt; // found id, return existing fabric

	// fabric does not exist so create a new one
	// scratch buffers sized from the source fabric, filled by extractFabricData
	Vector<uint32_t>::Type phases(srcFabric.getNumPhases());
	Vector<uint32_t>::Type sets(srcFabric.getNumSets());
	Vector<float>::Type restvalues(srcFabric.getNumRestvalues());
	Vector<uint32_t>::Type indices(srcFabric.getNumIndices());
	Vector<uint32_t>::Type anchors(srcFabric.getNumTethers());
	Vector<float>::Type tetherLengths(srcFabric.getNumTethers());
	Vector<uint32_t>::Type triangles(srcFabric.getNumTriangles() * 3);

	Range<uint32_t> phaseRange = makeRange(phases);
	Range<float> restvalueRange = makeRange(restvalues);
	Range<uint32_t> setRange = makeRange(sets);
	Range<uint32_t> indexRange = makeRange(indices);
	Range<uint32_t> anchorRange = makeRange(anchors);
	Range<float> lengthRange = makeRange(tetherLengths);
	Range<uint32_t> triangleRange = makeRange(triangles);

	srcFabric.mFactory.extractFabricData(srcFabric, phaseRange, setRange, restvalueRange, indexRange, anchorRange,
	                                     lengthRange, triangleRange);

	DstFabricType* dstFabric =
	    static_cast<DstFabricType*>(dstFactory.createFabric(srcFabric.mNumParticles, phaseRange, setRange, restvalueRange,
	                                                        indexRange, anchorRange, lengthRange, triangleRange));

	// give new fabric the same id as the source so it can be matched
	dstFabric->mId = srcFabric.mId;

	return dstFabric;
}
+
// Direct accessors for SwCloth members that are not reachable through the
// generic Cloth interface; used by convertCloth() during cloning.
inline Range<const PhaseConfig> getPhaseConfigs(const SwCloth& cloth)
{
	return makeRange(cloth.mPhaseConfigs);
}
inline void setPhaseConfigs(SwCloth& cloth, Range<const PhaseConfig> phaseConfigs)
{
	cloth.mPhaseConfigs.assign(phaseConfigs.begin(), phaseConfigs.end());
}
inline Range<const PxVec4> getParticleAccelerations(const SwCloth& cloth)
{
	return makeRange(cloth.mParticleAccelerations);
}
inline Range<const uint32_t> getSelfCollisionIndices(const SwCloth& cloth)
{
	return makeRange(cloth.mSelfCollisionIndices);
}
+
+// cloth conversion
// Clones srcImpl into a new cloth created by dstFactory: copies particles,
// fabric (shared by id via convertFabric), phase configs, collision shapes,
// motion/separation constraints, particle accelerations, self-collision
// indices, rest positions and virtual particles. Both factory contexts are
// locked for the duration of the copy.
template <typename DstFactoryType, typename SrcImplType>
typename DstFactoryType::ImplType* convertCloth(DstFactoryType& dstFactory, const SrcImplType& srcImpl)
{
	typedef typename DstFactoryType::FabricType DstFabricType;
	typedef typename DstFactoryType::ImplType DstImplType;
	typedef typename DstImplType::ClothType DstClothType;
	typedef typename SrcImplType::ClothType SrcClothType;

	const SrcClothType& srcCloth = srcImpl.mCloth;
	const Factory& srcFactory = srcCloth.mFactory;

	typename DstClothType::ContextLockType dstLock(dstFactory);
	typename SrcClothType::ContextLockType srcLock(srcCloth.mFactory);

	// particles
	MappedRange<const PxVec4> curParticles = srcImpl.getCurrentParticles();

	// fabric
	DstFabricType& dstFabric = *convertFabric(srcCloth.mFabric, dstFactory);

	// create new cloth
	DstImplType* dstImpl = static_cast<DstImplType*>(dstFactory.createCloth(curParticles, dstFabric));
	DstClothType& dstCloth = dstImpl->mCloth;

	// copy across common parameters
	copy(dstCloth, srcCloth);

	// copy across previous particles
	MappedRange<const PxVec4> prevParticles = srcImpl.getPreviousParticles();
	PxMemCopy(dstImpl->getPreviousParticles().begin(), prevParticles.begin(), prevParticles.size() * sizeof(PxVec4));

	// copy across transformed phase configs
	setPhaseConfigs(dstCloth, getPhaseConfigs(srcCloth));

	// collision data: scratch buffers filled by extractCollisionData, then
	// pushed into the new cloth via the regular setter interface
	Vector<PxVec4>::Type spheres(srcImpl.getNumSpheres(), PxVec4(0.0f));
	PxVec4* spherePtr = spheres.empty() ? 0 : &spheres.front();
	Range<PxVec4> sphereRange(spherePtr, spherePtr + spheres.size());
	Vector<uint32_t>::Type capsules(srcImpl.getNumCapsules() * 2);
	Range<uint32_t> capsuleRange = makeRange(capsules);
	Vector<PxVec4>::Type planes(srcImpl.getNumPlanes(), PxVec4(0.0f));
	PxVec4* planePtr = planes.empty() ? 0 : &planes.front();
	Range<PxVec4> planeRange(planePtr, planePtr + planes.size());
	Vector<uint32_t>::Type convexes(srcImpl.getNumConvexes());
	Range<uint32_t> convexRange = makeRange(convexes);
	Vector<PxVec3>::Type triangles(srcImpl.getNumTriangles() * 3, PxVec3(0.0f));
	PxVec3* trianglePtr = triangles.empty() ? 0 : &triangles.front();
	Range<PxVec3> triangleRange(trianglePtr, trianglePtr + triangles.size());

	srcFactory.extractCollisionData(srcImpl, sphereRange, capsuleRange, planeRange, convexRange, triangleRange);
	dstImpl->setSpheres(sphereRange, 0, 0);
	dstImpl->setCapsules(capsuleRange, 0, 0);
	dstImpl->setPlanes(planeRange, 0, 0);
	dstImpl->setConvexes(convexRange, 0, 0);
	dstImpl->setTriangles(triangleRange, 0, 0);

	// motion constraints, copy directly into new cloth buffer
	if(srcImpl.getNumMotionConstraints())
		srcFactory.extractMotionConstraints(srcImpl, dstImpl->getMotionConstraints());

	// separation constraints, copy directly into new cloth buffer
	if(srcImpl.getNumSeparationConstraints())
		srcFactory.extractSeparationConstraints(srcImpl, dstImpl->getSeparationConstraints());

	// particle accelerations
	if(srcImpl.getNumParticleAccelerations())
	{
		Range<const PxVec4> accelerations = getParticleAccelerations(srcCloth);
		PxMemCopy(dstImpl->getParticleAccelerations().begin(), accelerations.begin(),
		          accelerations.size() * sizeof(PxVec4));
	}

	// self-collision indices
	dstImpl->setSelfCollisionIndices(getSelfCollisionIndices(srcCloth));

	// rest positions
	Vector<PxVec4>::Type restPositions(srcImpl.getNumRestPositions());
	srcFactory.extractRestPositions(srcImpl, makeRange(restPositions));
	dstImpl->setRestPositions(makeRange(restPositions));

	// virtual particles
	if(srcImpl.getNumVirtualParticles())
	{
		Vector<Vec4u>::Type indices(srcImpl.getNumVirtualParticles());
		Vector<PxVec3>::Type weights(srcImpl.getNumVirtualParticleWeights(), PxVec3(0.0f));

		uint32_t(*indicesPtr)[4] = indices.empty() ? 0 : &array(indices.front());
		Range<uint32_t[4]> indicesRange(indicesPtr, indicesPtr + indices.size());

		PxVec3* weightsPtr = weights.empty() ? 0 : &weights.front();
		Range<PxVec3> weightsRange(weightsPtr, weightsPtr + weights.size());

		srcFactory.extractVirtualParticles(srcImpl, indicesRange, weightsRange);

		dstImpl->setVirtualParticles(indicesRange, weightsRange);
	}

	return dstImpl;
}
+
+} // namespace cloth
+} // namespace physx
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/windows/CuCheckSuccess.h b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuCheckSuccess.h
new file mode 100644
index 00000000..b9ae0a53
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuCheckSuccess.h
@@ -0,0 +1,45 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#include <cuda.h>
+#include <driver_types.h>
+
+namespace physx
+{
+namespace cloth
+{
+// implemented in CuFactory.cpp
+// presumably validates the CUresult and reports failures with the call site —
+// see CuFactory.cpp for the actual behavior
+void checkSuccessImpl(CUresult, const char*, const int);
+}
+
+// safe cuda calls
+// wraps a CUDA driver API call so its result is routed through
+// checkSuccessImpl together with __FILE__/__LINE__ of the call site
+#define checkSuccess(err) cloth::checkSuccessImpl(err, __FILE__, __LINE__)
+}
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/windows/CuCloth.cpp b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuCloth.cpp
new file mode 100644
index 00000000..6ecd1aeb
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuCloth.cpp
@@ -0,0 +1,511 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#include "CuCloth.h"
+#include "CuFabric.h"
+#include "CuFactory.h"
+#include "CuContextLock.h"
+#include "CuCheckSuccess.h"
+#include "CuClothData.h"
+#include "CuSolver.h"
+#include "TripletScheduler.h"
+#include "ClothBase.h"
+#include "Array.h"
+#include "PsFoundation.h"
+
+#if PX_VC
+#pragma warning(disable : 4365) // 'action' : conversion from 'type_1' to 'type_2', signed/unsigned mismatch
+#endif
+
+namespace physx
+{
+namespace cloth
+{
+PhaseConfig transform(const PhaseConfig&); // from PhaseConfig.cpp
+}
+}
+
+using namespace physx;
+
+namespace
+{
+// Self collision is considered active only when the collision distance is
+// positive AND the log stiffness is negative (min of the pair must be > 0).
+bool isSelfCollisionEnabled(const cloth::CuCloth& cloth)
+{
+ return PxMin(cloth.mSelfCollisionDistance, -cloth.mSelfCollisionLogStiffness) > 0.0f;
+}
+}
+
+// Creates a new GPU cloth from a fabric and a non-empty range of initial
+// particle positions. Device/pinned buffers are tied to the factory's CUDA
+// context manager; the particle buffer stores two copies (current + previous).
+cloth::CuCloth::CuCloth(CuFactory& factory, CuFabric& fabric, Range<const PxVec4> particles)
+: CuContextLock(factory)
+, mFactory(factory)
+, mFabric(fabric)
+, mClothDataDirty(false)
+, mNumParticles(uint32_t(particles.size()))
+, mParticles(mFactory.mContextManager)
+, mParticlesHostCopy(CuHostAllocator(mFactory.mContextManager, cudaHostAllocMapped))
+, mDeviceParticlesDirty(false)
+, mHostParticlesDirty(true)
+, mPhaseConfigs(mFactory.mContextManager)
+, mMotionConstraints(mFactory.mContextManager)
+, mSeparationConstraints(mFactory.mContextManager)
+, mParticleAccelerations(mFactory.mContextManager)
+, mParticleAccelerationsHostCopy(CuHostAllocator(mFactory.mContextManager, cudaHostAllocMapped))
+, mCapsuleIndices(getMappedAllocator<IndexPair>(mFactory.mContextManager))
+, mStartCollisionSpheres(getMappedAllocator<PxVec4>(mFactory.mContextManager))
+, mTargetCollisionSpheres(getMappedAllocator<PxVec4>(mFactory.mContextManager))
+, mConvexMasks(getMappedAllocator<uint32_t>(mFactory.mContextManager))
+, mStartCollisionPlanes(getMappedAllocator<PxVec4>(mFactory.mContextManager))
+, mTargetCollisionPlanes(getMappedAllocator<PxVec4>(mFactory.mContextManager))
+, mStartCollisionTriangles(getMappedAllocator<PxVec3>(mFactory.mContextManager))
+, mTargetCollisionTriangles(getMappedAllocator<PxVec3>(mFactory.mContextManager))
+, mVirtualParticleSetSizes(mFactory.mContextManager)
+, mVirtualParticleIndices(mFactory.mContextManager)
+, mVirtualParticleWeights(mFactory.mContextManager)
+, mRestPositions(mFactory.mContextManager)
+, mSelfCollisionIndices(mFactory.mContextManager)
+, mSelfCollisionData(mFactory.mContextManager)
+, mSharedMemorySize(0)
+, mUserData(0)
+{
+ PX_ASSERT(!particles.empty());
+
+ // common cloth member initialization (free function, see ClothBase.h)
+ initialize(*this, particles.begin(), particles.end());
+
+ // device buffer holds current and previous particle state back to back
+ mParticles.reserve(2 * mNumParticles);
+ mParticles.push_back(particles.begin(), particles.end());
+ mParticles.push_back(particles.begin(), particles.end());
+ mParticlesHostCopy.resizeUninitialized(2 * mNumParticles);
+
+ mFabric.incRefCount();
+
+ // context was acquired by the CuContextLock base constructor; release it now
+ CuContextLock::release();
+}
+
+// Copy constructor, used to clone a cloth within the same factory.
+// NOTE(review): mClothDataDirty and mConvexMasks are absent from this
+// initializer list — mClothDataDirty may be set inside copy(), but
+// mConvexMasks is default-constructed here (neither copied nor given the
+// mapped allocator used by the other constructor); confirm this is intended.
+cloth::CuCloth::CuCloth(CuFactory& factory, const CuCloth& cloth)
+: CuContextLock(factory)
+, mFactory(factory)
+, mFabric(cloth.mFabric)
+, mNumParticles(cloth.mNumParticles)
+, mParticles(cloth.mParticles)
+, mParticlesHostCopy(cloth.mParticlesHostCopy)
+, mDeviceParticlesDirty(cloth.mDeviceParticlesDirty)
+, mHostParticlesDirty(cloth.mHostParticlesDirty)
+, mPhaseConfigs(cloth.mPhaseConfigs)
+, mHostPhaseConfigs(cloth.mHostPhaseConfigs)
+, mMotionConstraints(cloth.mMotionConstraints)
+, mSeparationConstraints(cloth.mSeparationConstraints)
+, mParticleAccelerations(cloth.mParticleAccelerations)
+, mParticleAccelerationsHostCopy(cloth.mParticleAccelerationsHostCopy)
+, mCapsuleIndices(cloth.mCapsuleIndices)
+, mStartCollisionSpheres(cloth.mStartCollisionSpheres)
+, mTargetCollisionSpheres(cloth.mTargetCollisionSpheres)
+, mStartCollisionPlanes(cloth.mStartCollisionPlanes)
+, mTargetCollisionPlanes(cloth.mTargetCollisionPlanes)
+, mStartCollisionTriangles(cloth.mStartCollisionTriangles)
+, mTargetCollisionTriangles(cloth.mTargetCollisionTriangles)
+, mVirtualParticleSetSizes(cloth.mVirtualParticleSetSizes)
+, mVirtualParticleIndices(cloth.mVirtualParticleIndices)
+, mVirtualParticleWeights(cloth.mVirtualParticleWeights)
+, mRestPositions(cloth.mRestPositions)
+, mSelfCollisionIndices(cloth.mSelfCollisionIndices)
+, mSelfCollisionData(mFactory.mContextManager)
+, mSharedMemorySize(cloth.mSharedMemorySize)
+, mUserData(cloth.mUserData)
+{
+ // presumably copies the remaining (non-member-initialized) simulation state
+ copy(*this, cloth);
+
+ mFabric.incRefCount();
+
+ // context was acquired by the CuContextLock base constructor; release it now
+ CuContextLock::release();
+}
+
+cloth::CuCloth::~CuCloth()
+{
+ // re-acquire the CUDA context — presumably so the member device vectors
+ // destruct with the context current; confirm against CuContextLock semantics
+ CuContextLock::acquire();
+
+ mFabric.decRefCount();
+}
+
+// Marks the per-cloth GPU data as stale; picked up by updateClothData().
+void cloth::CuCloth::notifyChanged()
+{
+ mClothDataDirty = true;
+}
+
+// Refreshes 'clothData' from the cloth's current state. Returns false when
+// nothing changed and the particle array was not reordered; otherwise rewrites
+// clothData and returns true. Expects the CUDA context to be acquired.
+bool cloth::CuCloth::updateClothData(CuClothData& clothData)
+{
+ // test particle pointer to detect when cloth data array has been reordered
+ if(!mClothDataDirty && clothData.mParticles == array(*mParticles.begin().get()))
+ {
+ PX_ASSERT(mSharedMemorySize == getSharedMemorySize());
+ return false;
+ }
+
+ mSharedMemorySize = getSharedMemorySize();
+
+ // lazily allocate the self-collision scratch buffer on first use
+ // NOTE(review): nothing in this function releases mSelfCollisionData if
+ // self collision is later disabled — confirm it is freed elsewhere.
+ if(mSelfCollisionData.empty() && isSelfCollisionEnabled(*this))
+ {
+ uint32_t numSelfCollisionIndices =
+ mSelfCollisionIndices.empty() ? mNumParticles : uint32_t(mSelfCollisionIndices.size());
+
+ uint32_t particleSize = 4 * mNumParticles;
+ uint32_t keySize = 2 * numSelfCollisionIndices; // 2x for radix buffer
+ uint32_t cellStartSize = (129 + 128 * 128 + 130) / 2 + 1; // half because type is int16_t
+
+ // use 16bit indices for cellStart array (128x128 grid)
+ // initialize the cellStart region to 0xffff sentinels (32-bit memset)
+ mSelfCollisionData.resize(particleSize + keySize + cellStartSize);
+ checkSuccess(cuMemsetD32((mSelfCollisionData.begin() + particleSize + keySize).dev(), 0xffffffff, cellStartSize));
+ }
+
+ clothData = CuClothData(*this);
+ mClothDataDirty = false;
+
+ return true;
+}
+
+// Computes the per-cloth CUDA shared memory requirement (in bytes), excluding
+// particle data: phase configs plus the larger of the discrete-collision and
+// self-collision scratch areas, plus continuous-collision state when needed.
+uint32_t cloth::CuCloth::getSharedMemorySize() const
+{
+ uint32_t numPhases = uint32_t(mPhaseConfigs.size());
+ uint32_t numSpheres = uint32_t(mStartCollisionSpheres.size());
+ uint32_t numCones = uint32_t(mCapsuleIndices.size());
+ uint32_t numPlanes = uint32_t(mStartCollisionPlanes.size());
+ uint32_t numConvexes = uint32_t(mConvexMasks.size());
+ uint32_t numTriangles = uint32_t(mStartCollisionTriangles.size() / 3);
+
+ uint32_t phaseConfigSize = numPhases * sizeof(CuPhaseConfig);
+
+ // previous-frame collision shapes are only kept when continuous collision
+ // or friction is active
+ bool storePrevCollisionData = mEnableContinuousCollision || mFriction > 0.0f;
+ uint32_t continuousCollisionSize = storePrevCollisionData ? 4 * numSpheres + 10 * numCones : 0;
+ continuousCollisionSize += 4 * numCones + numConvexes; // capsule and convex masks
+ uint32_t discreteCollisionSize = 4 * numSpheres + PxMax(10 * numCones + 96, 208u);
+ discreteCollisionSize = PxMax(discreteCollisionSize, PxMax(4 * numPlanes, 19 * numTriangles));
+
+ // scratch memory for prefix sum and histogram
+ uint32_t selfCollisionSize = isSelfCollisionEnabled(*this) ? 544 : 0;
+
+ // see CuSolverKernel.cu::gSharedMemory comment for details
+ return phaseConfigSize + sizeof(float) * (continuousCollisionSize + PxMax(selfCollisionSize, discreteCollisionSize));
+}
+
+// Stores the (already transformed) phase configs: a host copy for cloning,
+// and a device copy augmented with per-phase constraint pointers from the
+// fabric. Does not call notifyChanged() (see header comment).
+void cloth::CuCloth::setPhaseConfig(Range<const PhaseConfig> configs)
+{
+ // keep a host-side mirror (used by the clone helpers)
+ mHostPhaseConfigs.assign(configs.begin(), configs.end());
+
+ Vector<CuPhaseConfig>::Type deviceConfigs;
+ deviceConfigs.reserve(configs.size());
+ const PhaseConfig* cEnd = configs.end();
+ for(const PhaseConfig* cIt = configs.begin(); cIt != cEnd; ++cIt)
+ {
+ CuPhaseConfig config;
+
+ config.mStiffness = cIt->mStiffness;
+ config.mStiffnessMultiplier = cIt->mStiffnessMultiplier;
+ config.mCompressionLimit = cIt->mCompressionLimit;
+ config.mStretchLimit = cIt->mStretchLimit;
+
+ // resolve the phase's constraint data from the fabric (device pointers)
+ uint16_t phaseIndex = cIt->mPhaseIndex;
+ config.mNumConstraints = mFabric.mNumConstraintsInPhase[phaseIndex];
+ config.mRestvalues = mFabric.mRestvaluesInPhase[phaseIndex].get();
+ config.mIndices = mFabric.mIndicesInPhase[phaseIndex].get();
+
+ deviceConfigs.pushBack(config);
+ }
+
+ // device upload needs the CUDA context
+ CuContextLock contextLock(mFactory);
+ mPhaseConfigs.assign(deviceConfigs.begin(), deviceConfigs.end());
+}
+
+// Returns a pinned host-side range the caller fills with new constraint
+// targets; device buffers are allocated lazily under the CUDA context lock.
+cloth::Range<PxVec4> cloth::CuCloth::push(cloth::CuConstraints& constraints)
+{
+ if(!constraints.mTarget.capacity())
+ {
+ CuContextLock contextLock(mFactory);
+ constraints.mTarget.reserve(mNumParticles);
+ }
+ // NOTE(review): target is only resized while the host copy is still empty
+ // (i.e. before the first push completes) — confirm the target buffer is
+ // sized elsewhere on subsequent frames.
+ if(constraints.mHostCopy.empty())
+ constraints.mTarget.resize(mNumParticles);
+
+ if(constraints.mStart.empty()) // initialize start first
+ constraints.mStart.swap(constraints.mTarget);
+
+ if(!constraints.mHostCopy.capacity())
+ {
+ CuContextLock contextLock(mFactory);
+ constraints.mHostCopy.reserve(mNumParticles);
+ }
+ constraints.mHostCopy.resizeUninitialized(mNumParticles);
+
+ PxVec4* data = &constraints.mHostCopy.front();
+ return Range<PxVec4>(data, data + constraints.mHostCopy.size());
+}
+
+// Releases the device-side constraint buffers.
+void cloth::CuCloth::clear(cloth::CuConstraints& constraints)
+{
+ CuContextLock contextLock(mFactory);
+ // swap with empty temporaries to actually free the device memory
+ CuDeviceVector<PxVec4>(mFactory.mContextManager).swap(constraints.mStart);
+ CuDeviceVector<PxVec4>(mFactory.mContextManager).swap(constraints.mTarget);
+}
+
+// Uploads the pinned host particle copy (current + previous) to the device
+// if the device copy is stale.
+void cloth::CuCloth::syncDeviceParticles()
+{
+ if(mDeviceParticlesDirty)
+ {
+ CuContextLock contextLock(mFactory);
+ checkSuccess(
+ cuMemcpyHtoD(mParticles.begin().dev(), mParticlesHostCopy.begin(), 2 * mNumParticles * sizeof(PxVec4)));
+ mDeviceParticlesDirty = false;
+ }
+}
+
+// Downloads the device particles (current + previous) into the pinned host
+// copy if the host copy is stale.
+void cloth::CuCloth::syncHostParticles()
+{
+ if(mHostParticlesDirty)
+ {
+ CuContextLock contextLock(mFactory);
+ const PxVec4* src = mParticles.begin().get();
+ mFactory.copyToHost(src, src + 2 * mNumParticles, mParticlesHostCopy.begin());
+ mHostParticlesDirty = false;
+ }
+}
+
+// Limits incoming collision triangle vertices so that, together with the
+// vertices kept from the existing buffer (removedSize), the total never
+// exceeds 1500 vertices (500 triangles).
+cloth::Range<const PxVec3> cloth::CuCloth::clampTriangleCount(Range<const PxVec3> range, uint32_t replaceSize)
+{
+ // clamp to 500 triangles (1500 vertices) to prevent running out of shared memory
+ uint32_t removedSize = mStartCollisionTriangles.size() - replaceSize;
+ const PxVec3* clamp = range.begin() + 1500 - removedSize;
+
+ if(range.end() > clamp)
+ {
+ shdfnd::getFoundation().error(PX_WARN, "Too many collision "
+ "triangles specified for cloth, dropping all but first 500.\n");
+ }
+
+ return Range<const PxVec3>(range.begin(), PxMin(range.end(), clamp));
+}
+
+#include "ClothImpl.h"
+
+namespace physx
+{
+namespace cloth
+{
+
+// ClothImpl<CuCloth>::clone() implemented in CuClothClone.cpp
+
+// Number of simulated particles (set at construction).
+template <>
+uint32_t ClothImpl<CuCloth>::getNumParticles() const
+{
+ return mCloth.mNumParticles;
+}
+
+// Ensures the host particle copy is up to date; const_cast is needed because
+// syncHostParticles() clears the dirty flag.
+template <>
+void ClothImpl<CuCloth>::lockParticles() const
+{
+ const_cast<CuCloth&>(mCloth).syncHostParticles();
+}
+
+// No-op: the host copy stays valid after lockParticles().
+template <>
+void ClothImpl<CuCloth>::unlockParticles() const
+{
+}
+
+// Writable view of the current particles (first half of the host copy);
+// marks the device copy stale since the caller may write through the range.
+template <>
+MappedRange<PxVec4> ClothImpl<CuCloth>::getCurrentParticles()
+{
+ mCloth.wakeUp();
+ lockParticles();
+ mCloth.mDeviceParticlesDirty = true;
+ return getMappedParticles(mCloth.mParticlesHostCopy.begin());
+}
+
+// Read-only view of the current particles; no dirty flags are touched.
+template <>
+MappedRange<const PxVec4> ClothImpl<CuCloth>::getCurrentParticles() const
+{
+ lockParticles();
+ return getMappedParticles(mCloth.mParticlesHostCopy.begin());
+}
+
+// Writable view of the previous-iteration particles (second half of the host
+// copy, offset by mNumParticles); marks the device copy stale.
+template <>
+MappedRange<PxVec4> ClothImpl<CuCloth>::getPreviousParticles()
+{
+ mCloth.wakeUp();
+ lockParticles();
+ mCloth.mDeviceParticlesDirty = true;
+ return getMappedParticles(mCloth.mParticlesHostCopy.begin() + mCloth.mNumParticles);
+}
+
+// Read-only view of the previous-iteration particles.
+template <>
+MappedRange<const PxVec4> ClothImpl<CuCloth>::getPreviousParticles() const
+{
+ lockParticles();
+ return getMappedParticles(mCloth.mParticlesHostCopy.begin() + mCloth.mNumParticles);
+}
+
+// Hands out raw device pointers to the particle buffer; syncs the device
+// first and marks the host mirror stale since the caller may modify the data.
+template <>
+GpuParticles ClothImpl<CuCloth>::getGpuParticles()
+{
+ mCloth.syncDeviceParticles();
+ mCloth.mHostParticlesDirty = true;
+ PxVec4* particles = mCloth.mParticles.begin().get();
+ GpuParticles result = { particles, particles + mCloth.mNumParticles, 0 };
+ return result;
+}
+
+// User-facing phase config setter: transforms each config for the solver and
+// drops phases with zero stiffness before forwarding to the cloth.
+template <>
+void ClothImpl<CuCloth>::setPhaseConfig(Range<const PhaseConfig> configs)
+{
+ Vector<PhaseConfig>::Type transformedConfigs;
+ transformedConfigs.reserve(configs.size());
+
+ // transform phase config to use in solver
+ for(; !configs.empty(); configs.popFront())
+ if(configs.front().mStiffness > 0.0f)
+ transformedConfigs.pushBack(transform(configs.front()));
+
+ mCloth.setPhaseConfig(Range<const PhaseConfig>(transformedConfigs.begin(), transformedConfigs.end()));
+ mCloth.notifyChanged();
+ mCloth.wakeUp();
+}
+
+// Stores self-collision indices in both the device vector and its host
+// mirror (the mirror is used by the clone helpers).
+template <>
+void ClothImpl<CuCloth>::setSelfCollisionIndices(Range<const uint32_t> indices)
+{
+ ContextLockType lock(mCloth.mFactory);
+ mCloth.mSelfCollisionIndices.assign(indices.begin(), indices.end());
+ mCloth.mSelfCollisionIndicesHost.assign(indices.begin(), indices.end());
+ mCloth.notifyChanged();
+ mCloth.wakeUp();
+}
+
+// Number of virtual particles currently stored on the device.
+template <>
+uint32_t ClothImpl<CuCloth>::getNumVirtualParticles() const
+{
+ return uint32_t(mCloth.mVirtualParticleIndices.size());
+}
+
+// Returns a pinned host-side range the caller fills with per-particle
+// accelerations; device and host buffers are allocated lazily.
+template <>
+Range<PxVec4> ClothImpl<CuCloth>::getParticleAccelerations()
+{
+ // lazily allocate the device buffer (needs the CUDA context)
+ if(mCloth.mParticleAccelerations.empty())
+ {
+ CuContextLock contextLock(mCloth.mFactory);
+ mCloth.mParticleAccelerations.resize(mCloth.mNumParticles);
+ }
+
+ // lazily allocate the pinned host staging buffer
+ if(!mCloth.mParticleAccelerationsHostCopy.capacity())
+ {
+ CuContextLock contextLock(mCloth.mFactory);
+ mCloth.mParticleAccelerationsHostCopy.reserve(mCloth.mNumParticles);
+ }
+ mCloth.mParticleAccelerationsHostCopy.resizeUninitialized(mCloth.mNumParticles);
+
+ mCloth.wakeUp();
+
+ PxVec4* data = mCloth.mParticleAccelerationsHostCopy.begin();
+ return Range<PxVec4>(data, mCloth.mParticleAccelerationsHostCopy.end());
+}
+
+// Releases both the device acceleration buffer (swap with an empty temporary
+// to free the memory) and the pinned host copy.
+template <>
+void ClothImpl<CuCloth>::clearParticleAccelerations()
+{
+ CuContextLock contextLock(mCloth.mFactory);
+ CuDeviceVector<PxVec4>(mCloth.mFactory.mContextManager).swap(mCloth.mParticleAccelerations);
+ mCloth.mParticleAccelerationsHostCopy.reset();
+ mCloth.wakeUp();
+}
+
+namespace
+{
+// Debug helper: estimates memory-replay cost of the scheduled virtual-particle
+// triplets, processed in batches of 32 (per set), by counting how often each
+// of 32 buckets (index & 31) is hit per triplet component. Only referenced by
+// the commented-out diagnostic printf in setVirtualParticles below.
+// NOTE(review): 'setSizes' is passed by value — likely meant to be a
+// const reference.
+uint32_t calculateNumReplays(const Vector<Vec4u>::Type& triplets, const Vector<uint32_t>::Type setSizes)
+{
+ uint32_t result = 0;
+
+ Vector<Vec4u>::Type::ConstIterator tIt = triplets.begin();
+ Vector<uint32_t>::Type::ConstIterator sIt, sEnd = setSizes.end();
+ uint32_t index = 0;
+ for(sIt = setSizes.begin(); sIt != sEnd; ++sIt, ++index)
+ {
+ Vector<Vec4u>::Type::ConstIterator tEnd = tIt + *sIt, tLast = tIt;
+ while(tLast != tEnd)
+ {
+ uint8_t numConflicts[3][32] = {};
+ uint8_t numReplays[3] = {};
+
+ // per batch of up to 32 triplets, the replay count of a component is
+ // the maximum number of collisions in any bucket
+ for(tLast += PxMin(ptrdiff_t(32), tEnd - tLast); tIt != tLast; ++tIt)
+ for(int i = 0; i < 3; ++i)
+ numReplays[i] = PxMax(numReplays[i], ++numConflicts[i][(*tIt)[i] & 31]);
+
+ result += numReplays[0] + numReplays[1] + numReplays[2];
+ }
+ }
+
+ return result;
+}
+}
+
+// Stores virtual particles: schedules the index triplets into independent
+// sets of 32, converts them to 16-bit indices, and appends a normalization
+// factor (1/|w|^2) to each weight before uploading to the device.
+template <>
+void ClothImpl<CuCloth>::setVirtualParticles(Range<const uint32_t[4]> indices, Range<const PxVec3> weights)
+{
+ // shuffle indices to form independent SIMD sets
+ TripletScheduler scheduler(indices);
+ scheduler.warp(mCloth.mNumParticles, 32);
+
+ // convert to 16bit indices
+ Vector<Vec4us>::Type hostIndices;
+ hostIndices.reserve(indices.size());
+ TripletScheduler::ConstTripletIter tIt = scheduler.mTriplets.begin();
+ TripletScheduler::ConstTripletIter tEnd = scheduler.mTriplets.end();
+ for(; tIt != tEnd; ++tIt)
+ hostIndices.pushBack(Vec4us(*tIt));
+
+ // printf("num sets = %u, num replays = %u\n", scheduler.mSetSizes.size(),
+ // calculateNumReplays(scheduler.mTriplets, scheduler.mSetSizes));
+
+ // add normalization weight
+ // NOTE(review): no guard against a zero-length weight vector — 'scale'
+ // would be inf; confirm inputs are validated upstream.
+ Vector<PxVec4>::Type hostWeights;
+ hostWeights.reserve(weights.size());
+ for(; !weights.empty(); weights.popFront())
+ {
+ PxVec3 w = reinterpret_cast<const PxVec3&>(weights.front());
+ PxReal scale = 1 / w.magnitudeSquared();
+ hostWeights.pushBack(PxVec4(w.x, w.y, w.z, scale));
+ }
+
+ CuContextLock contextLock(mCloth.mFactory);
+
+ // todo: 'swap' these to force reallocation?
+ mCloth.mVirtualParticleIndices = hostIndices;
+ mCloth.mVirtualParticleSetSizes = scheduler.mSetSizes;
+ mCloth.mVirtualParticleWeights = hostWeights;
+
+ mCloth.notifyChanged();
+ mCloth.wakeUp();
+}
+
+} // namespace cloth
+} // namespace physx
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/windows/CuCloth.h b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuCloth.h
new file mode 100644
index 00000000..257d490c
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuCloth.h
@@ -0,0 +1,216 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#include "foundation/PxTransform.h"
+#include "foundation/PxVec4.h"
+#include "Range.h"
+#include "PhaseConfig.h"
+#include "MovingAverage.h"
+#include "IndexPair.h"
+#include "BoundingBox.h"
+#include "Vec4T.h"
+#include "CuPhaseConfig.h"
+#include "CuPinnedAllocator.h"
+#include "CuContextLock.h"
+#include "CuDeviceVector.h"
+
+namespace physx
+{
+namespace cloth
+{
+
+class CuFabric;
+class CuFactory;
+struct CuClothData;
+
+// Pair of device-side constraint buffers (interpolation start/target) plus a
+// pinned (cudaHostAllocMapped) host staging copy.
+struct CuConstraints
+{
+ CuConstraints(physx::PxCudaContextManager* ctx)
+ : mStart(ctx), mTarget(ctx), mHostCopy(CuHostAllocator(ctx, cudaHostAllocMapped))
+ {
+ }
+
+ // advance one frame: the target buffer becomes the new start
+ void pop()
+ {
+ if(!mTarget.empty())
+ {
+ mStart.swap(mTarget);
+ mTarget.resize(0);
+ }
+ }
+
+ CuDeviceVector<PxVec4> mStart;
+ CuDeviceVector<PxVec4> mTarget;
+ CuPinnedVector<PxVec4>::Type mHostCopy;
+};
+
+// GPU (CUDA) cloth instance: per-cloth simulation state split between device
+// vectors, pinned host mirrors, and plain host data. Dirty flags track which
+// of the particle copies (device/host) is stale.
+class CuCloth : protected CuContextLock
+{
+ public:
+ // NOTE(review): declared but apparently never defined — presumably to
+ // suppress assignment; such declarations are usually private, confirm.
+ CuCloth& operator=(const CuCloth&);
+ typedef CuFactory FactoryType;
+ typedef CuFabric FabricType;
+ typedef CuContextLock ContextLockType;
+
+ typedef CuPinnedVector<PxVec4>::Type& MappedVec4fVectorType;
+ typedef CuPinnedVector<IndexPair>::Type& MappedIndexVectorType;
+
+ CuCloth(CuFactory&, CuFabric&, Range<const PxVec4>);
+ CuCloth(CuFactory&, const CuCloth&);
+ ~CuCloth(); // not virtual on purpose
+
+ public:
+ // asleep once enough consecutive passes stayed below the sleep threshold
+ bool isSleeping() const
+ {
+ return mSleepPassCounter >= mSleepAfterCount;
+ }
+ void wakeUp()
+ {
+ mSleepPassCounter = 0;
+ }
+
+ // marks the per-cloth GPU data as stale (picked up by updateClothData)
+ void notifyChanged();
+
+ bool updateClothData(CuClothData&); // expects acquired context
+ uint32_t getSharedMemorySize() const; // without particle data
+
+ // expects transformed configs, doesn't call notifyChanged()
+ void setPhaseConfig(Range<const PhaseConfig>);
+
+ Range<PxVec4> push(CuConstraints&);
+ void clear(CuConstraints&);
+
+ void syncDeviceParticles();
+ void syncHostParticles();
+
+ Range<const PxVec3> clampTriangleCount(Range<const PxVec3>, uint32_t);
+
+ public:
+ CuFactory& mFactory;
+ CuFabric& mFabric;
+
+ bool mClothDataDirty;
+
+ // particle data
+ uint32_t mNumParticles;
+ CuDeviceVector<PxVec4> mParticles; // cur, prev
+ CuPinnedVector<PxVec4>::Type mParticlesHostCopy;
+ bool mDeviceParticlesDirty;
+ bool mHostParticlesDirty;
+
+ PxVec3 mParticleBoundsCenter;
+ PxVec3 mParticleBoundsHalfExtent;
+
+ PxVec3 mGravity;
+ PxVec3 mLogDamping;
+ PxVec3 mLinearLogDrag;
+ PxVec3 mAngularLogDrag;
+ PxVec3 mLinearInertia;
+ PxVec3 mAngularInertia;
+ PxVec3 mCentrifugalInertia;
+ float mSolverFrequency;
+ float mStiffnessFrequency;
+
+ PxTransform mTargetMotion;
+ PxTransform mCurrentMotion;
+ PxVec3 mLinearVelocity;
+ PxVec3 mAngularVelocity;
+
+ float mPrevIterDt;
+ MovingAverage mIterDtAvg;
+
+ CuDeviceVector<CuPhaseConfig> mPhaseConfigs; // transformed!
+ Vector<PhaseConfig>::Type mHostPhaseConfigs; // transformed!
+
+ // tether constraints stuff
+ float mTetherConstraintLogStiffness;
+ float mTetherConstraintScale;
+
+ // motion constraints stuff
+ CuConstraints mMotionConstraints;
+ float mMotionConstraintScale;
+ float mMotionConstraintBias;
+ float mMotionConstraintLogStiffness;
+
+ // separation constraints stuff
+ CuConstraints mSeparationConstraints;
+
+ // particle acceleration stuff
+ CuDeviceVector<PxVec4> mParticleAccelerations;
+ CuPinnedVector<PxVec4>::Type mParticleAccelerationsHostCopy;
+
+ // wind
+ PxVec3 mWind;
+ float mDragLogCoefficient;
+ float mLiftLogCoefficient;
+
+ // collision stuff
+ CuPinnedVector<IndexPair>::Type mCapsuleIndices;
+ CuPinnedVector<PxVec4>::Type mStartCollisionSpheres;
+ CuPinnedVector<PxVec4>::Type mTargetCollisionSpheres;
+ CuPinnedVector<uint32_t>::Type mConvexMasks;
+ CuPinnedVector<PxVec4>::Type mStartCollisionPlanes;
+ CuPinnedVector<PxVec4>::Type mTargetCollisionPlanes;
+ CuPinnedVector<PxVec3>::Type mStartCollisionTriangles;
+ CuPinnedVector<PxVec3>::Type mTargetCollisionTriangles;
+ bool mEnableContinuousCollision;
+ float mCollisionMassScale;
+ float mFriction;
+
+ // virtual particles
+ CuDeviceVector<uint32_t> mVirtualParticleSetSizes;
+ CuDeviceVector<Vec4us> mVirtualParticleIndices;
+ CuDeviceVector<PxVec4> mVirtualParticleWeights;
+
+ // self collision
+ float mSelfCollisionDistance;
+ float mSelfCollisionLogStiffness;
+
+ CuDeviceVector<PxVec4> mRestPositions;
+ CuDeviceVector<uint32_t> mSelfCollisionIndices;
+ Vector<uint32_t>::Type mSelfCollisionIndicesHost; // host mirror for cloning
+
+ // 4 (position) + 2 (key) per particle + cellStart (8322)
+ CuDeviceVector<float> mSelfCollisionData;
+
+ // sleeping (see SwCloth for comments)
+ uint32_t mSleepTestInterval;
+ uint32_t mSleepAfterCount;
+ float mSleepThreshold;
+ uint32_t mSleepPassCounter;
+ uint32_t mSleepTestCounter;
+
+ uint32_t mSharedMemorySize;
+
+ void* mUserData;
+};
+}
+}
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/windows/CuClothClone.cpp b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuClothClone.cpp
new file mode 100644
index 00000000..8b234968
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuClothClone.cpp
@@ -0,0 +1,83 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#include "ClothClone.h"
+
+#include "CuFactory.h"
+#include "CuFabric.h"
+#include "CuCloth.h"
+
+namespace physx
+{
+namespace cloth
+{
+// Clone helper (used by the generic convertCloth in ClothClone.h):
+// exposes the host-side copy of the transformed phase configs.
+Range<const PhaseConfig> getPhaseConfigs(const CuCloth& cloth)
+{
+ return makeRange(cloth.mHostPhaseConfigs);
+}
+// Clone helper: forwards already-transformed configs straight to the cloth.
+void setPhaseConfigs(CuCloth& cloth, Range<const PhaseConfig> phaseConfigs)
+{
+ cloth.setPhaseConfig(phaseConfigs);
+}
+// Clone helper: exposes the host-side acceleration copy.
+// NOTE(review): assumes the host copy is current — confirm callers sync first.
+Range<const PxVec4> getParticleAccelerations(const CuCloth& cloth)
+{
+ return makeRange(cloth.mParticleAccelerationsHostCopy);
+}
+// Clone helper: exposes the host mirror of the self-collision indices.
+Range<const uint32_t> getSelfCollisionIndices(const CuCloth& cloth)
+{
+ return makeRange(cloth.mSelfCollisionIndicesHost);
+}
+
+// Clones this cloth into 'factory': copy-constructs directly when the target
+// is the same factory, otherwise converts via the platform-specific helper.
+template <>
+Cloth* ClothImpl<CuCloth>::clone(Factory& factory) const
+{
+ if(&mCloth.mFactory == &factory)
+ return new ClothImpl<CuCloth>(factory, *this); // copy construct directly
+
+ switch(factory.getPlatform())
+ {
+ case Factory::CPU:
+ return convertCloth(static_cast<SwFactory&>(factory), *this);
+ case Factory::CUDA:
+ return convertCloth(static_cast<CuFactory&>(factory), *this);
+ default:
+ return NULL; // unsupported target platform
+ }
+}
+
+// Clones a (possibly foreign) cloth into this CUDA factory: CPU cloths are
+// converted here directly; everything else is delegated to the source
+// cloth's own clone implementation.
+Cloth* CuFactory::clone(const Cloth& cloth)
+{
+ if(cloth.getFactory().getPlatform() == Factory::CPU)
+ return convertCloth(*this, static_cast<const SwClothImpl&>(cloth));
+
+ return cloth.clone(*this);
+}
+
+} // namespace cloth
+} // namespace physx
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/windows/CuClothData.cpp b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuClothData.cpp
new file mode 100644
index 00000000..5a1485c6
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuClothData.cpp
@@ -0,0 +1,238 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#include "CuClothData.h"
+#include "CuCloth.h"
+#include "CuFabric.h"
+#include "CuCheckSuccess.h"
+#include "CuContextLock.h"
+#include "IterationState.h"
+
+using namespace physx;
+
+// Flattens a CuCloth's bulk state into this POD snapshot so it can be copied
+// to the device. All pointers reference memory owned by the CuCloth/CuFabric;
+// this object aggregates them and owns nothing.
+cloth::CuClothData::CuClothData(CuCloth& cloth)
+{
+	mNumParticles = cloth.mNumParticles;
+	mParticles = array(*cloth.mParticles.begin().get());
+
+	mParticlesHostCopy = array(*getDevicePointer(cloth.mParticlesHostCopy));
+
+	mNumPhases = uint32_t(cloth.mPhaseConfigs.size());
+	mPhaseConfigs = cloth.mPhaseConfigs.begin().get();
+
+	mTethers = cloth.mFabric.mTethers.begin().get();
+	mNumTethers = uint32_t(cloth.mFabric.mTethers.size());
+	// fold the fabric's tether length scale into the cloth's scale once, here
+	mTetherConstraintScale = cloth.mTetherConstraintScale * cloth.mFabric.mTetherLengthScale;
+
+	mTriangles = cloth.mFabric.mTriangles.begin().get();
+	mNumTriangles = uint32_t(cloth.mFabric.mTriangles.size()) / 3; // 3 indices per triangle
+
+	mMotionConstraintScale = cloth.mMotionConstraintScale;
+	mMotionConstraintBias = cloth.mMotionConstraintBias;
+
+	mNumSpheres = uint32_t(cloth.mStartCollisionSpheres.size());
+	mNumCapsules = uint32_t(cloth.mCapsuleIndices.size());
+	mCapsuleIndices = getDevicePointer(cloth.mCapsuleIndices);
+
+	mNumPlanes = uint32_t(cloth.mStartCollisionPlanes.size());
+	mNumConvexes = uint32_t(cloth.mConvexMasks.size());
+	mConvexMasks = getDevicePointer(cloth.mConvexMasks);
+
+	mNumCollisionTriangles = uint32_t(cloth.mStartCollisionTriangles.size()) / 3;
+
+	mVirtualParticleSetSizesBegin = cloth.mVirtualParticleSetSizes.begin().get();
+	mVirtualParticleSetSizesEnd = mVirtualParticleSetSizesBegin + cloth.mVirtualParticleSetSizes.size();
+	mVirtualParticleIndices = array(*cloth.mVirtualParticleIndices.begin().get());
+	mVirtualParticleWeights = array(*cloth.mVirtualParticleWeights.begin().get());
+
+	mEnableContinuousCollision = cloth.mEnableContinuousCollision;
+	mCollisionMassScale = cloth.mCollisionMassScale;
+	mFrictionScale = cloth.mFriction;
+
+	mSelfCollisionDistance = cloth.mSelfCollisionDistance;
+	// null index pointer means 'all particles participate in self collision'
+	mSelfCollisionIndices = cloth.mSelfCollisionIndices.empty() ? 0 : cloth.mSelfCollisionIndices.begin().get();
+	mNumSelfCollisionIndices = mSelfCollisionIndices ? uint32_t(cloth.mSelfCollisionIndices.size()) : mNumParticles;
+
+	if(!cloth.mSelfCollisionData.empty())
+	{
+		// mSelfCollisionData is a single buffer partitioned as:
+		// [particles: 4*numParticles floats][keys: 2*numIndices uint32_t][cell start table]
+		uint32_t keySize = 2 * mNumSelfCollisionIndices;
+		uint32_t particleSize = 4 * mNumParticles;
+
+		mSelfCollisionParticles = cloth.mSelfCollisionData.begin().get();
+		mSelfCollisionKeys = (uint32_t*)(mSelfCollisionParticles + particleSize);
+		mSelfCollisionCellStart = (uint16_t*)(mSelfCollisionKeys + keySize);
+	}
+	else
+	{
+		mSelfCollisionParticles = 0;
+		mSelfCollisionKeys = 0;
+		mSelfCollisionCellStart = 0;
+	}
+
+	mSleepTestInterval = cloth.mSleepTestInterval;
+	mSleepAfterCount = cloth.mSleepAfterCount;
+	mSleepThreshold = cloth.mSleepThreshold;
+}
+
+// Builds the per-frame snapshot (stored in pinned memory) that the solver
+// kernel reads each frame: interpolation endpoints for constraints/collision
+// shapes, precomputed stiffness values, and sleep/bounds bookkeeping.
+cloth::CuFrameData::CuFrameData(CuCloth& cloth, uint32_t numSharedPositions, const IterationState<Simd4f>& state,
+                                const CuIterationData* iterationData)
+{
+	mDeviceParticlesDirty = cloth.mDeviceParticlesDirty;
+
+	mNumSharedPositions = numSharedPositions;
+
+	mIterDt = state.mIterDt;
+	mNumIterations = state.mRemainingIterations;
+	mIterationData = iterationData;
+
+	// per-iteration stiffness = 1 - 2^(logStiffness * stiffnessFrequency * dt);
+	// four stiffness values are computed at once in a single SIMD register
+	Simd4f logStiffness = simd4f(0.0f, cloth.mSelfCollisionLogStiffness, cloth.mMotionConstraintLogStiffness,
+	                             cloth.mTetherConstraintLogStiffness);
+	Simd4f stiffnessExponent = simd4f(cloth.mStiffnessFrequency * mIterDt);
+	Simd4f stiffness = gSimd4fOne - exp2(logStiffness * stiffnessExponent);
+
+	mTetherConstraintStiffness = array(stiffness)[3];
+	mMotionConstraintStiffness = array(stiffness)[2];
+	mSelfCollisionStiffness = array(stiffness)[1];
+
+	// same exponential mapping for the wind drag/lift coefficients
+	logStiffness = simd4f(cloth.mDragLogCoefficient, cloth.mLiftLogCoefficient, 0.0f, 0.0f);
+	stiffness = gSimd4fOne - exp2(logStiffness * stiffnessExponent);
+	mDragCoefficient = array(stiffness)[0];
+	mLiftCoefficient = array(stiffness)[1];
+	// unpack the 3x3 frame rotation matrix (row i/3, component i%3)
+	for(int i = 0; i < 9; ++i)
+		mRotation[i] = array(state.mRotationMatrix[i / 3])[i % 3];
+
+	// motion constraints: if no separate target buffer exists, start and
+	// target end up pointing at the same data (no interpolation)
+	mTargetMotionConstraints = 0;
+	if(!cloth.mMotionConstraints.mStart.empty())
+	{
+		mTargetMotionConstraints = array(*cloth.mMotionConstraints.mStart.begin().get());
+	}
+
+	mStartMotionConstraints = mTargetMotionConstraints;
+	if(!cloth.mMotionConstraints.mTarget.empty())
+	{
+		mTargetMotionConstraints = array(*cloth.mMotionConstraints.mTarget.begin().get());
+	}
+
+	mHostMotionConstraints = array(*getDevicePointer(cloth.mMotionConstraints.mHostCopy));
+
+	// separation constraints: same start/target aliasing scheme as above
+	mTargetSeparationConstraints = 0;
+	if(!cloth.mSeparationConstraints.mStart.empty())
+	{
+		mTargetSeparationConstraints = array(*cloth.mSeparationConstraints.mStart.begin().get());
+	}
+
+	mStartSeparationConstraints = mTargetSeparationConstraints;
+	if(!cloth.mSeparationConstraints.mTarget.empty())
+	{
+		mTargetSeparationConstraints = array(*cloth.mSeparationConstraints.mTarget.begin().get());
+	}
+
+	mHostSeparationConstraints = array(*getDevicePointer(cloth.mSeparationConstraints.mHostCopy));
+
+	mParticleAccelerations = 0;
+	if(!cloth.mParticleAccelerations.empty())
+	{
+		mParticleAccelerations = array(*cloth.mParticleAccelerations.begin().get());
+	}
+
+	mHostParticleAccelerations = array(*getDevicePointer(cloth.mParticleAccelerationsHostCopy));
+
+	mRestPositions = 0;
+	if(!cloth.mRestPositions.empty())
+	{
+		mRestPositions = array(*cloth.mRestPositions.begin().get());
+	}
+
+	// collision shapes: a missing target buffer aliases the start buffer
+	mStartCollisionSpheres = array(*getDevicePointer(cloth.mStartCollisionSpheres));
+	mTargetCollisionSpheres = array(*getDevicePointer(cloth.mTargetCollisionSpheres));
+
+	if(!mTargetCollisionSpheres)
+		mTargetCollisionSpheres = mStartCollisionSpheres;
+
+	mStartCollisionPlanes = array(*getDevicePointer(cloth.mStartCollisionPlanes));
+	mTargetCollisionPlanes = array(*getDevicePointer(cloth.mTargetCollisionPlanes));
+
+	if(!mTargetCollisionPlanes)
+		mTargetCollisionPlanes = mStartCollisionPlanes;
+
+	mStartCollisionTriangles = array(*getDevicePointer(cloth.mStartCollisionTriangles));
+	mTargetCollisionTriangles = array(*getDevicePointer(cloth.mTargetCollisionTriangles));
+
+	if(!mTargetCollisionTriangles)
+		mTargetCollisionTriangles = mStartCollisionTriangles;
+
+	// pack bounds as (max, -min) per axis: r + c = upper, r - c = -(lower)
+	for(uint32_t i = 0; i < 3; ++i)
+	{
+		float c = cloth.mParticleBoundsCenter[i];
+		float r = cloth.mParticleBoundsHalfExtent[i];
+		mParticleBounds[i * 2 + 0] = r + c;
+		mParticleBounds[i * 2 + 1] = r - c;
+	}
+
+	mSleepPassCounter = cloth.mSleepPassCounter;
+	mSleepTestCounter = cloth.mSleepTestCounter;
+
+	mStiffnessExponent = cloth.mStiffnessFrequency * mIterDt;
+}
+
+namespace
+{
+// Copies the upper-left 3x3 block of a row-stride-4 source matrix into a
+// dense 3x3 destination, transposing it in the process
+// (dst[row*3+col] = src[col*4+row]).
+void copySquareTransposed(float* dst, const float* src)
+{
+	dst[0] = src[0];
+	dst[1] = src[4];
+	dst[2] = src[8];
+	dst[3] = src[1];
+	dst[4] = src[5];
+	dst[5] = src[9];
+	dst[6] = src[2];
+	dst[7] = src[6];
+	dst[8] = src[10];
+}
+}
+
+// Packs per-iteration integration data into pinned memory. Layout of
+// mIntegrationTrafo: [0..2] prev bias, [3..5] cur bias,
+// [6..14] prev matrix (3x3, transposed), [15..23] cur matrix (3x3, transposed).
+cloth::CuIterationData::CuIterationData(const IterationState<Simd4f>& state)
+{
+	mIntegrationTrafo[0] = array(state.mPrevBias)[0];
+	mIntegrationTrafo[1] = array(state.mPrevBias)[1];
+	mIntegrationTrafo[2] = array(state.mPrevBias)[2];
+
+	mIntegrationTrafo[3] = array(state.mCurBias)[0];
+	mIntegrationTrafo[4] = array(state.mCurBias)[1];
+	mIntegrationTrafo[5] = array(state.mCurBias)[2];
+
+	copySquareTransposed(mIntegrationTrafo + 6, array(*state.mPrevMatrix));
+	copySquareTransposed(mIntegrationTrafo + 15, array(*state.mCurMatrix));
+
+	mWind[0] = array(state.mWind)[0];
+	mWind[1] = array(state.mWind)[1];
+	mWind[2] = array(state.mWind)[2];
+
+	// store the bit pattern of 1.0f rather than integer 1 (see comment)
+	mIsTurning = state.mIsTurning ? 0x3F800000u : 0; // 1.0f to avoid ftz
+}
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/windows/CuClothData.h b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuClothData.h
new file mode 100644
index 00000000..0be66742
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuClothData.h
@@ -0,0 +1,191 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#include "Types.h"
+#ifndef __CUDACC__
+#include "Simd.h"
+#endif
+
+namespace physx
+{
+namespace cloth
+{
+
+class CuCloth;
+struct CuPhaseConfig;
+template <typename>
+struct IterationState;
+struct IndexPair;
+struct CuIterationData;
+struct CuTether;
+
+// reference to cloth instance bulk data (POD)
+// should not need frequent updates (stored on device)
+struct CuClothData
+{
+ CuClothData()
+ {
+ }
+ CuClothData(CuCloth&);
+
+ // particle data
+ uint32_t mNumParticles;
+ float* mParticles;
+ float* mParticlesHostCopy;
+
+ // fabric constraints
+ uint32_t mNumPhases;
+ const CuPhaseConfig* mPhaseConfigs;
+
+ const CuTether* mTethers;
+ uint32_t mNumTethers;
+ float mTetherConstraintScale;
+
+ const uint16_t* mTriangles;
+ uint32_t mNumTriangles;
+
+ // motion constraint data
+ float mMotionConstraintScale;
+ float mMotionConstraintBias;
+
+ // collision data
+ uint32_t mNumSpheres; // don't change this order, it's
+ uint32_t mNumCapsules; // needed by mergeAcceleration()
+ const IndexPair* mCapsuleIndices;
+ uint32_t mNumPlanes;
+ uint32_t mNumConvexes;
+ const uint32_t* mConvexMasks;
+ uint32_t mNumCollisionTriangles;
+
+ // virtual particle data
+ const uint32_t* mVirtualParticleSetSizesBegin;
+ const uint32_t* mVirtualParticleSetSizesEnd;
+ const uint16_t* mVirtualParticleIndices;
+ const float* mVirtualParticleWeights;
+
+ bool mEnableContinuousCollision;
+ float mCollisionMassScale;
+ float mFrictionScale;
+
+ float mSelfCollisionDistance;
+ uint32_t mNumSelfCollisionIndices;
+ const uint32_t* mSelfCollisionIndices;
+ float* mSelfCollisionParticles;
+ uint32_t* mSelfCollisionKeys;
+ uint16_t* mSelfCollisionCellStart;
+
+ // sleep data
+ uint32_t mSleepTestInterval;
+ uint32_t mSleepAfterCount;
+ float mSleepThreshold;
+};
+
+// per-frame data (stored in pinned memory)
+struct CuFrameData
+{
+ CuFrameData()
+ {
+ } // not initializing pointers to 0!
+
+#ifndef __CUDACC__
+ explicit CuFrameData(CuCloth&, uint32_t, const IterationState<Simd4f>&, const CuIterationData*);
+#endif
+
+ bool mDeviceParticlesDirty;
+
+ // number of particle copies that fit in shared memory (0, 1, or 2)
+ uint32_t mNumSharedPositions;
+
+ // iteration data
+ float mIterDt;
+ uint32_t mNumIterations;
+ const CuIterationData* mIterationData;
+
+ float mTetherConstraintStiffness;
+
+ // wind data
+ float mDragCoefficient;
+ float mLiftCoefficient;
+ float mRotation[9];
+
+ // motion constraint data
+ const float* mStartMotionConstraints;
+ float* mTargetMotionConstraints;
+ const float* mHostMotionConstraints;
+ float mMotionConstraintStiffness;
+
+ // separation constraint data
+ const float* mStartSeparationConstraints;
+ float* mTargetSeparationConstraints;
+ const float* mHostSeparationConstraints;
+
+ // particle acceleration data
+ float* mParticleAccelerations;
+ const float* mHostParticleAccelerations;
+
+ // rest positions
+ const float* mRestPositions;
+
+ // collision data
+ const float* mStartCollisionSpheres;
+ const float* mTargetCollisionSpheres;
+ const float* mStartCollisionPlanes;
+ const float* mTargetCollisionPlanes;
+ const float* mStartCollisionTriangles;
+ const float* mTargetCollisionTriangles;
+
+ float mSelfCollisionStiffness;
+
+ float mParticleBounds[6]; // maxX, -minX, maxY, ...
+
+ uint32_t mSleepPassCounter;
+ uint32_t mSleepTestCounter;
+
+ float mStiffnessExponent;
+};
+
+// per-iteration data (stored in pinned memory)
+struct CuIterationData
+{
+ CuIterationData()
+ {
+ } // not initializing!
+
+#ifndef __CUDACC__
+ explicit CuIterationData(const IterationState<Simd4f>&);
+#endif
+
+ float mIntegrationTrafo[24];
+ float mWind[3];
+ uint32_t mIsTurning;
+};
+}
+}
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/windows/CuCollision.h b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuCollision.h
new file mode 100644
index 00000000..cd28a999
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuCollision.h
@@ -0,0 +1,1505 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#ifndef CU_SOLVER_KERNEL_CU
+#error include CuCollision.h only from CuSolverKernel.cu
+#endif
+
+#include "IndexPair.h"
+
+namespace
+{
+// Block-wide collision pipeline state, living in shared memory. One instance
+// per thread block; all methods are cooperative (every thread participates).
+struct CuCollision
+{
+	// pair of 32-bit masks, one bit per sphere/cone shape
+	struct ShapeMask
+	{
+		uint32_t mSpheres;
+		uint32_t mCones;
+
+		__device__ friend ShapeMask& operator&=(ShapeMask& left, const ShapeMask& right)
+		{
+			left.mSpheres = left.mSpheres & right.mSpheres;
+			left.mCones = left.mCones & right.mCones;
+			return left;
+		}
+	};
+
+	// SoA arrays (shared memory) of interpolated sphere and cone shapes;
+	// pointer members are laid out consecutively (see allocate())
+	struct CollisionData
+	{
+		Pointer<Shared, float> mSphereX;
+		Pointer<Shared, float> mSphereY;
+		Pointer<Shared, float> mSphereZ;
+		Pointer<Shared, float> mSphereW;
+
+		Pointer<Shared, float> mConeCenterX;
+		Pointer<Shared, float> mConeCenterY;
+		Pointer<Shared, float> mConeCenterZ;
+		Pointer<Shared, float> mConeRadius;
+		Pointer<Shared, float> mConeAxisX;
+		Pointer<Shared, float> mConeAxisY;
+		Pointer<Shared, float> mConeAxisZ;
+		Pointer<Shared, float> mConeSlope;
+		Pointer<Shared, float> mConeSqrCosine;
+		Pointer<Shared, float> mConeHalfLength;
+	};
+
+  public:
+	__device__ CuCollision(Pointer<Shared, uint32_t>);
+
+	// runs one collision pass over all particles for this iteration
+	template <typename CurrentT, typename PreviousT>
+	__device__ void operator()(CurrentT& current, PreviousT& previous, float alpha);
+
+  private:
+	__device__ void buildSphereAcceleration(const CollisionData&);
+	__device__ void buildConeAcceleration();
+	__device__ void mergeAcceleration();
+
+	template <typename CurrentT>
+	__device__ bool buildAcceleration(const CurrentT&, float);
+
+	__device__ static ShapeMask readShapeMask(const float&, Pointer<Shared, const uint32_t>);
+	template <typename CurPos>
+	__device__ ShapeMask getShapeMask(const CurPos&) const;
+	template <typename PrevPos, typename CurPos>
+	__device__ ShapeMask getShapeMask(const PrevPos&, const CurPos&) const;
+
+	template <typename CurPos>
+	__device__ int32_t collideCapsules(const CurPos&, float3&, float3&) const;
+	template <typename PrevPos, typename CurPos>
+	__device__ int32_t collideCapsules(const PrevPos&, CurPos&, float3&, float3&) const;
+
+	template <typename CurrentT, typename PreviousT>
+	__device__ void collideCapsules(CurrentT& current, PreviousT& previous) const;
+	template <typename CurrentT, typename PreviousT>
+	__device__ void collideVirtualCapsules(CurrentT& current, PreviousT& previous) const;
+	template <typename CurrentT, typename PreviousT>
+	__device__ void collideContinuousCapsules(CurrentT& current, PreviousT& previous) const;
+
+	template <typename CurrentT, typename PreviousT>
+	__device__ void collideConvexes(CurrentT& current, PreviousT& previous, float alpha);
+	template <typename CurPos>
+	__device__ int32_t collideConvexes(const CurPos&, float3&) const;
+
+	template <typename CurrentT>
+	__device__ void collideTriangles(CurrentT& current, float alpha);
+	template <typename CurrentT>
+	__device__ void collideTriangles(CurrentT& current, int32_t i);
+
+  public:
+	Pointer<Shared, uint32_t> mCapsuleIndices; // 2 sphere indices per capsule
+	Pointer<Shared, uint32_t> mCapsuleMasks;
+	Pointer<Shared, uint32_t> mConvexMasks;
+
+	// previous and current iteration shape data (prev needed for ccd/friction)
+	CollisionData mPrevData;
+	CollisionData mCurData;
+
+	// acceleration structure: per-axis occupancy bitmasks over a regular grid
+	Pointer<Shared, uint32_t> mShapeGrid;
+	float mGridScale[3];
+	float mGridBias[3];
+	static const uint32_t sGridSize = 8;
+};
+
+// device-side swap helper (std::swap is not usable in device code here)
+template <typename T>
+__device__ void swap(T& a, T& b)
+{
+	T c = a;
+	a = b;
+	b = c;
+}
+}
+
+__shared__ uninitialized<CuCollision> gCollideParticles;
+
+namespace
+{
+// initializes one pointer past data!
+__device__ void allocate(CuCollision::CollisionData& data)
+{
+ if(threadIdx.x < 15)
+ {
+ Pointer<Shared, float>* ptr = &data.mSphereX;
+ ptr[threadIdx.x] = *ptr + threadIdx.x * gClothData.mNumCapsules +
+ min(threadIdx.x, 4) * (gClothData.mNumSpheres - gClothData.mNumCapsules);
+ }
+}
+
+// Interpolates collision spheres between start and target by alpha and writes
+// them transposed into the SoA layout (x[], y[], z[], w[]); the radius (w
+// component, every 4th value) is clamped to be non-negative.
+__device__ void generateSpheres(CuCollision::CollisionData& data, float alpha)
+{
+	// interpolate spheres and transpose
+	if(threadIdx.x < gClothData.mNumSpheres * 4)
+	{
+		float start = __ldg(gFrameData.mStartCollisionSpheres + threadIdx.x);
+		float target = __ldg(gFrameData.mTargetCollisionSpheres + threadIdx.x);
+		float value = start + (target - start) * alpha;
+		if(threadIdx.x % 4 == 3)
+			value = max(value, 0.0f); // clamp radius
+		// AoS index -> SoA index: component-major, sphere-minor
+		int32_t j = threadIdx.x % 4 * gClothData.mNumSpheres + threadIdx.x / 4;
+		data.mSphereX[j] = value;
+	}
+
+	__syncthreads();
+}
+
+// Derives per-capsule cone data (center, axis, radius, slope, ...) from the
+// two endpoint spheres referenced by iIt[0]/iIt[1]; one thread per capsule.
+__device__ void generateCones(CuCollision::CollisionData& data, Pointer<Shared, const uint32_t> iIt)
+{
+	// generate cones
+	if(threadIdx.x < gClothData.mNumCapsules)
+	{
+		uint32_t firstIndex = iIt[0];
+		uint32_t secondIndex = iIt[1];
+
+		float firstX = data.mSphereX[firstIndex];
+		float firstY = data.mSphereY[firstIndex];
+		float firstZ = data.mSphereZ[firstIndex];
+		float firstW = data.mSphereW[firstIndex];
+
+		float secondX = data.mSphereX[secondIndex];
+		float secondY = data.mSphereY[secondIndex];
+		float secondZ = data.mSphereZ[secondIndex];
+		float secondW = data.mSphereW[secondIndex];
+
+		// half-vector between endpoints; w carries the radius difference
+		float axisX = (secondX - firstX) * 0.5f;
+		float axisY = (secondY - firstY) * 0.5f;
+		float axisZ = (secondZ - firstZ) * 0.5f;
+		float axisW = (secondW - firstW) * 0.5f;
+
+		float sqrAxisLength = axisX * axisX + axisY * axisY + axisZ * axisZ;
+		float sqrConeLength = sqrAxisLength - axisW * axisW;
+
+		float invAxisLength = rsqrtf(sqrAxisLength);
+		float invConeLength = rsqrtf(sqrConeLength);
+
+		// degenerate capsule (one sphere contains the other): zero everything
+		// out, which also makes mConeRadius 0 so the cone is skipped later
+		if(sqrConeLength <= 0.0f)
+			invAxisLength = invConeLength = 0.0f;
+
+		float axisLength = sqrAxisLength * invAxisLength;
+
+		data.mConeCenterX[threadIdx.x] = (secondX + firstX) * 0.5f;
+		data.mConeCenterY[threadIdx.x] = (secondY + firstY) * 0.5f;
+		data.mConeCenterZ[threadIdx.x] = (secondZ + firstZ) * 0.5f;
+		data.mConeRadius[threadIdx.x] = (axisW + firstW) * invConeLength * axisLength;
+
+		data.mConeAxisX[threadIdx.x] = axisX * invAxisLength;
+		data.mConeAxisY[threadIdx.x] = axisY * invAxisLength;
+		data.mConeAxisZ[threadIdx.x] = axisZ * invAxisLength;
+		data.mConeSlope[threadIdx.x] = axisW * invConeLength;
+
+		// sine of the cone's taper angle; store squared cosine for later tests
+		float sine = axisW * invAxisLength;
+		data.mConeSqrCosine[threadIdx.x] = 1 - sine * sine;
+		data.mConeHalfLength[threadIdx.x] = axisLength;
+	}
+
+	__syncthreads();
+}
+}
+
+// Cooperative constructor: partitions the shared scratch area into capsule
+// indices/masks, convex masks, and the sphere/cone SoA arrays, then copies
+// the per-cloth index data from global memory into shared memory.
+__device__ CuCollision::CuCollision(Pointer<Shared, uint32_t> scratchPtr)
+{
+	int32_t numCapsules2 = 2 * gClothData.mNumCapsules;
+	int32_t numCapsules4 = 4 * gClothData.mNumCapsules;
+	int32_t numConvexes = gClothData.mNumConvexes;
+
+	if(threadIdx.x < 3)
+	{
+		// threads 0..2 set mCapsuleIndices/mCapsuleMasks/mConvexMasks;
+		// the negative indexing below additionally seeds mShapeGrid (tid 0)
+		// and the mSphereX base of mCurData/mPrevData (tids 1, 2) — this
+		// relies on the member layout of CuCollision (14 pointers per
+		// CollisionData), see allocate()
+		(&mCapsuleIndices)[threadIdx.x] = scratchPtr + threadIdx.x * numCapsules2;
+		(&mShapeGrid)[-14 * int32_t(threadIdx.x)] = scratchPtr + numCapsules4 + numConvexes;
+	}
+
+	Pointer<Shared, uint32_t> indexPtr = scratchPtr + threadIdx.x;
+	if(threadIdx.x < numCapsules2)
+	{
+		// copy capsule sphere indices; build per-entry sphere bitmasks where
+		// the odd (second) entry accumulates both endpoint bits
+		uint32_t index = (&gClothData.mCapsuleIndices->first)[threadIdx.x];
+		*indexPtr = index;
+
+		volatile uint32_t* maskPtr = generic(indexPtr + numCapsules2);
+		*maskPtr = 1u << index;
+		*maskPtr |= maskPtr[-int32_t(threadIdx.x & 1)];
+	}
+	indexPtr += numCapsules4;
+
+	if(threadIdx.x < numConvexes)
+		*indexPtr = gClothData.mConvexMasks[threadIdx.x];
+
+	// prev-frame shape data is only needed for ccd or friction
+	if(gClothData.mEnableContinuousCollision || gClothData.mFrictionScale > 0.0f)
+	{
+		allocate(mPrevData);
+
+		__syncthreads(); // mPrevData raw hazard
+
+		generateSpheres(mPrevData, 0.0f);
+		generateCones(mPrevData, mCapsuleIndices + 2 * threadIdx.x);
+	}
+
+	allocate(mCurData); // also initializes mShapeGrid (!)
+}
+
+// One full collision pass for this iteration: convexes and triangles first,
+// then (if the grid acceleration structure is non-empty) capsule collision,
+// finally the current shape data is saved for the next iteration.
+template <typename CurrentT, typename PreviousT>
+__device__ void CuCollision::operator()(CurrentT& current, PreviousT& previous, float alpha)
+{
+	// if(current.w > 0) current.w = previous.w (see SwSolverKernel::computeBounds())
+	for(int32_t i = threadIdx.x; i < gClothData.mNumParticles; i += blockDim.x)
+	{
+		if(current(i, 3) > 0.0f)
+			current(i, 3) = previous(i, 3);
+	}
+
+	collideConvexes(current, previous, alpha);
+	collideTriangles(current, alpha);
+
+	// buildAcceleration returns false when particle and shape bounds don't
+	// intersect, skipping capsule collision entirely
+	if(buildAcceleration(current, alpha))
+	{
+		if(gClothData.mEnableContinuousCollision)
+			collideContinuousCapsules(current, previous);
+		else
+			collideCapsules(current, previous);
+
+		collideVirtualCapsules(current, previous);
+	}
+
+	// sync otherwise first threads overwrite sphere data before
+	// remaining ones have had a chance to use it leading to incorrect
+	// velocity calculation for friction / ccd
+
+	__syncthreads();
+
+	if(gClothData.mEnableContinuousCollision || gClothData.mFrictionScale > 0.0f)
+	{
+		// store current collision data for next iteration
+		Pointer<Shared, float> dstIt = mPrevData.mSphereX + threadIdx.x;
+		Pointer<Shared, const float> srcIt = mCurData.mSphereX + threadIdx.x;
+		for(; dstIt < mCurData.mSphereX; dstIt += blockDim.x, srcIt += blockDim.x)
+			*dstIt = *srcIt;
+	}
+
+	// __syncthreads() called in updateSleepState()
+}
+
+// build per-axis mask arrays of spheres on the right/left of grid cell
+// Thread layout: 192 threads = 3 axes x {min,max} x 32 sphere lanes;
+// bit 5 of threadIdx.x (extracted below) selects min vs. max side.
+__device__ void CuCollision::buildSphereAcceleration(const CollisionData& data)
+{
+	if(threadIdx.x >= 192)
+		return;
+
+	int32_t sphereIdx = threadIdx.x & 31;
+	int32_t axisIdx = threadIdx.x >> 6; // coordinate index (x, y, or z)
+	int32_t signi = threadIdx.x << 26 & 0x80000000; // sign bit (min or max)
+
+	float signf = copysignf(1.0f, reinterpret_cast<const float&>(signi));
+	float pos = signf * data.mSphereW[sphereIdx] + data.mSphereX[sphereIdx + gClothData.mNumSpheres * axisIdx];
+
+	// use overflow so we can test for non-positive
+	uint32_t index = signi - uint32_t(floorf(pos * mGridScale[axisIdx] + mGridBias[axisIdx]));
+
+	// max-side masks are stored 3 grids after the min-side ones
+	axisIdx += (uint32_t(signi) >> 31) * 3;
+	Pointer<Shared, uint32_t> dst = mShapeGrid + sGridSize * axisIdx;
+	// #pragma unroll
+	// __ballot gathers one bit per sphere lane into the cell's 32-bit mask
+	for(int32_t i = 0; i < sGridSize; ++i, ++index)
+		dst[i] |= __ballot(int32_t(index) <= 0);
+}
+
+// generate cone masks from sphere masks
+// A cone occupies every cell touched by either of its endpoint spheres
+// (mCapsuleMasks holds both endpoint bits in the odd entry, see ctor).
+__device__ void CuCollision::buildConeAcceleration()
+{
+	if(threadIdx.x >= 192)
+		return;
+
+	int32_t coneIdx = threadIdx.x & 31;
+
+	// degenerate cones (radius 0, see generateCones) contribute nothing
+	uint32_t sphereMask =
+	    mCurData.mConeRadius[coneIdx] && coneIdx < gClothData.mNumCapsules ? mCapsuleMasks[2 * coneIdx + 1] : 0;
+
+	int32_t offset = threadIdx.x / 32 * sGridSize;
+	Pointer<Shared, uint32_t> src = mShapeGrid + offset;
+	Pointer<Shared, uint32_t> dst = src + 6 * sGridSize;
+
+	// #pragma unroll
+	for(int32_t i = 0; i < sGridSize; ++i)
+		dst[i] |= __ballot(src[i] & sphereMask);
+}
+
+// convert right/left mask arrays into single overlap array
+__device__ void CuCollision::mergeAcceleration()
+{
+	if(threadIdx.x < sGridSize * 12)
+	{
+		Pointer<Shared, uint32_t> dst = mShapeGrid + threadIdx.x;
+		// AND the min-side and max-side masks together (skipped for ccd,
+		// which needs the separate sides)
+		if(!(gClothData.mEnableContinuousCollision || threadIdx.x * 43 & 1024))
+			*dst &= dst[sGridSize * 3]; // above is same as 'threadIdx.x/24 & 1'
+
+		// mask garbage bits from build*Acceleration
+		// (relies on mNumSpheres/mNumCapsules being adjacent in CuClothData)
+		int32_t shapeIdx = threadIdx.x >= sGridSize * 6; // spheres=0, cones=1
+		*dst &= (1 << (&gClothData.mNumSpheres)[shapeIdx]) - 1;
+	}
+}
+
+namespace
+{
+#if __CUDA_ARCH__ >= 300
+// warp-wide max reduction of *buffer using shuffle (Kepler+); result valid
+// in lane 0 of each warp
+__device__ float mergeBounds(Pointer<Shared, float> buffer)
+{
+	float value = *buffer;
+	value = max(value, __shfl_down(value, 1));
+	value = max(value, __shfl_down(value, 2));
+	value = max(value, __shfl_down(value, 4));
+	value = max(value, __shfl_down(value, 8));
+	return max(value, __shfl_down(value, 16));
+}
+#else
+// pre-Kepler fallback: tree reduction through volatile shared memory
+__device__ float mergeBounds(Pointer<Shared, float> buffer)
+{
+	// ensure that writes to buffer are visible to all threads
+	__threadfence_block();
+
+	volatile float* ptr = generic(buffer);
+	*ptr = max(*ptr, ptr[16]);
+	*ptr = max(*ptr, ptr[8]);
+	*ptr = max(*ptr, ptr[4]);
+	*ptr = max(*ptr, ptr[2]);
+	return max(*ptr, ptr[1]);
+}
+#endif
+// computes maxX, -minX, maxY, ... with a stride of 32, threadIdx.x must be < 192
+__device__ float computeSphereBounds(const CuCollision::CollisionData& data, Pointer<Shared, float> buffer)
+{
+	assert(threadIdx.x < 192);
+
+	// clamp so warps with fewer spheres than lanes read a valid element
+	int32_t sphereIdx = min(threadIdx.x & 31, gClothData.mNumSpheres - 1); // sphere index
+	int32_t axisIdx = threadIdx.x >> 6; // coordinate index (x, y, or z)
+	int32_t signi = threadIdx.x << 26; // sign bit (min or max)
+	float signf = copysignf(1.0f, reinterpret_cast<const float&>(signi));
+
+	// per-lane bound: radius plus signed coordinate; warp max gives the result
+	*buffer = data.mSphereW[sphereIdx] + signf * data.mSphereX[sphereIdx + gClothData.mNumSpheres * axisIdx];
+
+	return mergeBounds(buffer);
+}
+
+#if __CUDA_ARCH__ >= 300
+// Computes particle bounds (maxX, -minX, maxY, ... pattern, stride 32).
+// Threads are split into three equal axis groups; each warp strides over the
+// particles, reduces min/max with shuffles, and lane 0 writes a per-warp
+// partial into buffer; the first 192 threads then reduce those partials.
+template <typename CurrentT>
+__device__ float computeParticleBounds(const CurrentT& current, Pointer<Shared, float> buffer)
+{
+	// blockDim.x * 342 >> 10 ~ blockDim.x / 3, rounded down to a warp multiple
+	int32_t numThreadsPerAxis = blockDim.x * 342 >> 10 & ~31; // same as / 3
+	int32_t axis = (threadIdx.x >= numThreadsPerAxis) + (threadIdx.x >= 2 * numThreadsPerAxis);
+	int32_t threadIdxInAxis = threadIdx.x - axis * numThreadsPerAxis;
+	int laneIdx = threadIdx.x & 31;
+
+	if(threadIdxInAxis < numThreadsPerAxis)
+	{
+		typename CurrentT::ConstPointerType posIt = current[axis];
+		int32_t i = min(threadIdxInAxis, gClothData.mNumParticles - 1);
+		float minX = posIt[i], maxX = minX;
+		while(i += numThreadsPerAxis, i < gClothData.mNumParticles)
+		{
+			float posX = posIt[i];
+			minX = min(minX, posX);
+			maxX = max(maxX, posX);
+		}
+
+		// warp-wide min/max reduction
+		minX = min(minX, __shfl_down(minX, 1));
+		maxX = max(maxX, __shfl_down(maxX, 1));
+		minX = min(minX, __shfl_down(minX, 2));
+		maxX = max(maxX, __shfl_down(maxX, 2));
+		minX = min(minX, __shfl_down(minX, 4));
+		maxX = max(maxX, __shfl_down(maxX, 4));
+		minX = min(minX, __shfl_down(minX, 8));
+		maxX = max(maxX, __shfl_down(maxX, 8));
+		minX = min(minX, __shfl_down(minX, 16));
+		maxX = max(maxX, __shfl_down(maxX, 16));
+
+		if(!laneIdx)
+		{
+			// per-warp partials: max at +0, negated min at +32
+			Pointer<Shared, float> dst = buffer - threadIdx.x + (threadIdxInAxis >> 5) + (axis << 6);
+			dst[0] = maxX;
+			dst[32] = -minX;
+		}
+	}
+
+	__syncthreads();
+
+	if(threadIdx.x >= 192)
+		return 0.0f;
+
+	// second-stage reduction across the per-warp partials
+	float value = *buffer;
+	if(laneIdx >= (numThreadsPerAxis >> 5))
+		value = -FLT_MAX; // lanes beyond the partial count must not contribute
+
+	// blockDim.x <= 3*512, increase to 3*1024 by adding a shfl by 16
+	assert(numThreadsPerAxis <= 16 * 32);
+
+	value = max(value, __shfl_down(value, 1));
+	value = max(value, __shfl_down(value, 2));
+	value = max(value, __shfl_down(value, 4));
+	return max(value, __shfl_down(value, 8));
+}
+#else
+// pre-Kepler fallback: 192 threads (3 axes x min/max x 32 lanes) stride over
+// the particles accumulating signed coordinates, then warp-reduce via
+// mergeBounds
+template <typename CurrentT>
+__device__ float computeParticleBounds(const CurrentT& current, Pointer<Shared, float> buffer)
+{
+	if(threadIdx.x >= 192)
+		return 0.0f;
+
+	int32_t axisIdx = threadIdx.x >> 6; // x, y, or z
+	int32_t signi = threadIdx.x << 26; // sign bit (min or max)
+	float signf = copysignf(1.0f, reinterpret_cast<const float&>(signi));
+
+	typename CurrentT::ConstPointerType pIt = current[axisIdx];
+	typename CurrentT::ConstPointerType pEnd = pIt + gClothData.mNumParticles;
+	pIt += min(threadIdx.x & 31, gClothData.mNumParticles - 1);
+
+	*buffer = *pIt * signf;
+	while(pIt += 32, pIt < pEnd)
+		*buffer = max(*buffer, *pIt * signf);
+
+	return mergeBounds(buffer);
+}
+#endif
+}
+
+// build mask of spheres/cones touching a regular grid along each axis.
+// Computes particle and shape bounds, derives a world->grid mapping
+// (mGridScale/mGridBias) from their intersection, and rasterizes spheres and
+// cones into per-axis bit grids (mShapeGrid) used by getShapeMask(). Returns
+// true when the particle bounds intersect the shape bounds (collision work
+// needed), false otherwise. Must be called by all threads of the block:
+// contains __syncthreads().
+template <typename CurrentT>
+__device__ bool CuCollision::buildAcceleration(const CurrentT& current, float alpha)
+{
+ // use still unused cone data as buffer for bounds computation
+ Pointer<Shared, float> buffer = mCurData.mConeCenterX + threadIdx.x;
+ float curParticleBounds = computeParticleBounds(current, buffer);
+ int32_t warpIdx = threadIdx.x >> 5;
+
+ if(!gClothData.mNumSpheres)
+ {
+ // no shapes: just publish this frame's particle bounds (one value per
+ // warp, i.e. per axis/sign group) for the next frame's CCD and bail out
+ if(threadIdx.x < 192 && !(threadIdx.x & 31))
+ gFrameData.mParticleBounds[warpIdx] = curParticleBounds;
+ return false;
+ }
+
+ generateSpheres(mCurData, alpha);
+
+ if(threadIdx.x < 192)
+ {
+ float sphereBounds = computeSphereBounds(mCurData, buffer);
+ float particleBounds = curParticleBounds;
+ if(gClothData.mEnableContinuousCollision)
+ {
+ // continuous collision sweeps prev->cur, so take the union with the
+ // previous shape bounds and previous frame's particle bounds
+ sphereBounds = max(sphereBounds, computeSphereBounds(mPrevData, buffer));
+ float prevParticleBounds = gFrameData.mParticleBounds[warpIdx];
+ particleBounds = max(particleBounds, prevParticleBounds);
+ }
+
+ // intersection of shape and particle AABBs (bounds are stored as
+ // max and -min, so min() intersects), slightly expanded for safety
+ float bounds = min(sphereBounds, particleBounds);
+ float expandedBounds = bounds + abs(bounds) * 1e-4f;
+
+ // store bounds data in shared memory
+ if(!(threadIdx.x & 31))
+ {
+ mGridScale[warpIdx] = expandedBounds;
+ gFrameData.mParticleBounds[warpIdx] = curParticleBounds;
+ }
+ }
+
+ __syncthreads(); // mGridScale raw hazard
+
+ // one thread per axis: turn [lower, upper] into scale/bias such that
+ // pos * mGridScale + mGridBias maps positions into [0, sGridSize)
+ if(threadIdx.x < 3)
+ {
+ float negativeLower = mGridScale[threadIdx.x * 2 + 1];
+ float edgeLength = mGridScale[threadIdx.x * 2] + negativeLower;
+ float divisor = max(edgeLength, FLT_EPSILON);
+ mGridScale[threadIdx.x] = __fdividef(sGridSize - 1e-3, divisor);
+ mGridBias[threadIdx.x] = negativeLower * mGridScale[threadIdx.x];
+ if(edgeLength < 0.0f)
+ mGridScale[0] = 0.0f; // mark empty intersection
+ }
+
+ // initialize sphere *and* cone grid to 0
+ if(threadIdx.x < 2 * 6 * sGridSize)
+ mShapeGrid[threadIdx.x] = 0;
+
+ __syncthreads(); // mGridScale raw hazard
+
+ // generate cones even if test below fails because
+ // continuous collision might need it in next iteration
+ generateCones(mCurData, mCapsuleIndices + 2 * threadIdx.x);
+
+ if(mGridScale[0] == 0.0f)
+ return false; // early out for empty intersection
+
+ if(gClothData.mEnableContinuousCollision)
+ buildSphereAcceleration(mPrevData);
+ buildSphereAcceleration(mCurData);
+ __syncthreads(); // mCurData raw hazard
+
+ buildConeAcceleration();
+ __syncthreads(); // mShapeGrid raw hazard
+
+ mergeAcceleration();
+ __syncthreads(); // mShapeGrid raw hazard
+
+ return true;
+}
+
+// Fetch the sphere and cone occupancy bit masks of the grid cell containing
+// 'position' (already in grid coordinates). Cells at index >= sGridSize yield
+// an empty mask (outMask becomes all-ones); the read index is wrapped with
+// & (sGridSize - 1) purely to keep the shared-memory access in bounds.
+// The cone grid is stored sGridSize * 6 words after the sphere grid.
+__device__ CuCollision::ShapeMask CuCollision::readShapeMask(const float& position,
+ Pointer<Shared, const uint32_t> sphereGrid)
+{
+ ShapeMask result;
+ int32_t index = int32_t(floorf(position));
+ uint32_t outMask = (index < sGridSize) - 1; // 0 if in range, ~0 if beyond grid
+
+ Pointer<Shared, const uint32_t> gridPtr = sphereGrid + (index & sGridSize - 1);
+ result.mSpheres = gridPtr[0] & ~outMask;
+ result.mCones = gridPtr[sGridSize * 6] & ~outMask;
+
+ return result;
+}
+
+// lookup acceleration structure and return mask of potential intersectors:
+// maps the particle position into grid space per axis, reads each axis' cell
+// mask, and intersects the three masks — a shape survives only if it touches
+// the particle's cell along every axis.
+template <typename CurPos>
+__device__ CuCollision::ShapeMask CuCollision::getShapeMask(const CurPos& positions) const
+{
+ ShapeMask result;
+
+ result = readShapeMask(positions.x * mGridScale[0] + mGridBias[0], mShapeGrid);
+ result &= readShapeMask(positions.y * mGridScale[1] + mGridBias[1], mShapeGrid + 8);
+ result &= readShapeMask(positions.z * mGridScale[2] + mGridBias[2], mShapeGrid + 16);
+
+ return result;
+}
+
+// Continuous-collision variant: returns shapes potentially intersecting the
+// axis-aligned box spanned by the particle's previous and current positions.
+// The box's max corner is tested against the lower-bound grids (offsets 0/8/16)
+// and its min corner against the upper-bound grids (offsets 24/32/40); corners
+// are clamped to the grid range [0, 7] (sGridSize - 1 — TODO confirm sGridSize
+// is 8 here) so readShapeMask never culls the clamped coordinate.
+template <typename PrevPos, typename CurPos>
+__device__ CuCollision::ShapeMask CuCollision::getShapeMask(const PrevPos& prevPos, const CurPos& curPos) const
+{
+ ShapeMask result;
+
+ float prevX = prevPos.x * mGridScale[0] + mGridBias[0];
+ float prevY = prevPos.y * mGridScale[1] + mGridBias[1];
+ float prevZ = prevPos.z * mGridScale[2] + mGridBias[2];
+
+ float curX = curPos.x * mGridScale[0] + mGridBias[0];
+ float curY = curPos.y * mGridScale[1] + mGridBias[1];
+ float curZ = curPos.z * mGridScale[2] + mGridBias[2];
+
+ float maxX = min(max(prevX, curX), 7.0f);
+ float maxY = min(max(prevY, curY), 7.0f);
+ float maxZ = min(max(prevZ, curZ), 7.0f);
+
+ result = readShapeMask(maxX, mShapeGrid);
+ result &= readShapeMask(maxY, mShapeGrid + 8);
+ result &= readShapeMask(maxZ, mShapeGrid + 16);
+
+ float minX = max(min(prevX, curX), 0.0f);
+ float minY = max(min(prevY, curY), 0.0f);
+ float minZ = max(min(prevZ, curZ), 0.0f);
+
+ result &= readShapeMask(minX, mShapeGrid + 24);
+ result &= readShapeMask(minY, mShapeGrid + 32);
+ result &= readShapeMask(minZ, mShapeGrid + 40);
+
+ return result;
+}
+
+// Discrete capsule collision for one particle position. Tests the particle
+// against every cone and sphere flagged by the acceleration grid, accumulates
+// the (unscaled) sum of pushout vectors in 'delta' and the sum of shape
+// displacements over the frame in 'velocity' (only when friction is enabled),
+// and returns the number of contacts so the caller can average.
+template <typename CurPos>
+__device__ int32_t CuCollision::collideCapsules(const CurPos& positions, float3& delta, float3& velocity) const
+{
+ ShapeMask shapeMask = getShapeMask(positions);
+
+ delta.x = delta.y = delta.z = 0.0f;
+ velocity.x = velocity.y = velocity.z = 0.0f;
+
+ int32_t numCollisions = 0;
+
+ bool frictionEnabled = gClothData.mFrictionScale > 0.0f;
+
+ // cone collision: iterate set bits, clearing the lowest each round
+ for(; shapeMask.mCones; shapeMask.mCones &= shapeMask.mCones - 1)
+ {
+ int32_t j = __ffs(shapeMask.mCones) - 1;
+
+ float deltaX = positions.x - mCurData.mConeCenterX[j];
+ float deltaY = positions.y - mCurData.mConeCenterY[j];
+ float deltaZ = positions.z - mCurData.mConeCenterZ[j];
+
+ float axisX = mCurData.mConeAxisX[j];
+ float axisY = mCurData.mConeAxisY[j];
+ float axisZ = mCurData.mConeAxisZ[j];
+ float slope = mCurData.mConeSlope[j];
+
+ // dot = signed distance along the cone axis; radius = cone radius at
+ // that axial position; sqrDistance = squared distance from the axis
+ float dot = deltaX * axisX + deltaY * axisY + deltaZ * axisZ;
+ float radius = max(dot * slope + mCurData.mConeRadius[j], 0.0f);
+ float sqrDistance = deltaX * deltaX + deltaY * deltaY + deltaZ * deltaZ - dot * dot;
+
+ // mCapsuleMasks[2*j+1] holds the bits of both end spheres of capsule j
+ Pointer<Shared, const uint32_t> mIt = mCapsuleMasks + 2 * j;
+ uint32_t bothMask = mIt[1];
+
+ if(sqrDistance > radius * radius)
+ {
+ // outside the (infinite) cone: neither end sphere can collide either
+ shapeMask.mSpheres &= ~bothMask;
+ continue;
+ }
+
+ sqrDistance = max(sqrDistance, FLT_EPSILON);
+ float invDistance = rsqrtf(sqrDistance);
+
+ // axial coordinate of the closest point on the cone surface
+ float base = dot + slope * sqrDistance * invDistance;
+
+ float halfLength = mCurData.mConeHalfLength[j];
+ uint32_t leftMask = base < -halfLength;
+ uint32_t rightMask = base > halfLength;
+
+ // if the contact lies within the cone segment, the end spheres are
+ // redundant; otherwise keep only the sphere on the overhanging side
+ uint32_t firstMask = mIt[0];
+ uint32_t secondMask = firstMask ^ bothMask;
+
+ shapeMask.mSpheres &= ~(firstMask & leftMask - 1);
+ shapeMask.mSpheres &= ~(secondMask & rightMask - 1);
+
+ if(!leftMask && !rightMask)
+ {
+ // contact within the cone segment: push out along the radial
+ // direction (delta minus its axial component), scaled by sqrCosine
+ deltaX = deltaX - base * axisX;
+ deltaY = deltaY - base * axisY;
+ deltaZ = deltaZ - base * axisZ;
+
+ float sqrCosine = mCurData.mConeSqrCosine[j];
+ float scale = radius * invDistance * sqrCosine - sqrCosine;
+
+ delta.x = delta.x + deltaX * scale;
+ delta.y = delta.y + deltaY * scale;
+ delta.z = delta.z + deltaZ * scale;
+
+ if(frictionEnabled)
+ {
+ int32_t s0 = mCapsuleIndices[2 * j];
+ int32_t s1 = mCapsuleIndices[2 * j + 1];
+
+ // per-frame displacement of each end sphere (cur - prev)
+ float s0vx = mCurData.mSphereX[s0] - mPrevData.mSphereX[s0];
+ float s0vy = mCurData.mSphereY[s0] - mPrevData.mSphereY[s0];
+ float s0vz = mCurData.mSphereZ[s0] - mPrevData.mSphereZ[s0];
+
+ float s1vx = mCurData.mSphereX[s1] - mPrevData.mSphereX[s1];
+ float s1vy = mCurData.mSphereY[s1] - mPrevData.mSphereY[s1];
+ float s1vz = mCurData.mSphereZ[s1] - mPrevData.mSphereZ[s1];
+
+ // interpolate velocity between the two spheres
+ float t = dot * 0.5f + 0.5f;
+
+ velocity.x += s0vx + t * (s1vx - s0vx);
+ velocity.y += s0vy + t * (s1vy - s0vy);
+ velocity.z += s0vz + t * (s1vz - s0vz);
+ }
+
+ ++numCollisions;
+ }
+ }
+
+ // sphere collision
+ for(; shapeMask.mSpheres; shapeMask.mSpheres &= shapeMask.mSpheres - 1)
+ {
+ int32_t j = __ffs(shapeMask.mSpheres) - 1;
+
+ float deltaX = positions.x - mCurData.mSphereX[j];
+ float deltaY = positions.y - mCurData.mSphereY[j];
+ float deltaZ = positions.z - mCurData.mSphereZ[j];
+
+ // relDistance = radius / distance; > 1 means the particle is inside
+ float sqrDistance = FLT_EPSILON + deltaX * deltaX + deltaY * deltaY + deltaZ * deltaZ;
+ float relDistance = rsqrtf(sqrDistance) * mCurData.mSphereW[j];
+
+ if(relDistance > 1.0f)
+ {
+ float scale = relDistance - 1.0f;
+
+ delta.x = delta.x + deltaX * scale;
+ delta.y = delta.y + deltaY * scale;
+ delta.z = delta.z + deltaZ * scale;
+
+ if(frictionEnabled)
+ {
+ velocity.x += mCurData.mSphereX[j] - mPrevData.mSphereX[j];
+ velocity.y += mCurData.mSphereY[j] - mPrevData.mSphereY[j];
+ velocity.z += mCurData.mSphereZ[j] - mPrevData.mSphereZ[j];
+ }
+
+ ++numCollisions;
+ }
+ }
+
+ return numCollisions;
+}
+
+// (1 - skinFraction)^2 - 1 with a 20% skin, i.e. -0.36: negative threshold
+// used by the CCD tests below to skip continuous collision when the particle
+// trajectory only touches the outer skin of a shape
+static const __device__ float gSkeletonWidth = (1 - 0.2f) * (1 - 0.2f) - 1;
+
+// Continuous + discrete capsule collision for one particle. For each flagged
+// cone/sphere, solves the quadratic |p(t)|^2 = r(t)^2 along the prev->cur
+// trajectory for the time of impact, advances curPos out of the swept shape
+// on CCD hits, then applies the same discrete contact handling as the
+// discrete-only overload. Accumulates pushout sums in 'delta' and shape
+// displacements in 'velocity'; returns the number of discrete contacts.
+template <typename PrevPos, typename CurPos>
+__device__ int32_t
+CuCollision::collideCapsules(const PrevPos& prevPos, CurPos& curPos, float3& delta, float3& velocity) const
+{
+ ShapeMask shapeMask = getShapeMask(prevPos, curPos);
+
+ delta.x = delta.y = delta.z = 0.0f;
+ velocity.x = velocity.y = velocity.z = 0.0f;
+
+ int32_t numCollisions = 0;
+ bool frictionEnabled = gClothData.mFrictionScale > 0.0f;
+
+ // cone collision
+ for(; shapeMask.mCones; shapeMask.mCones &= shapeMask.mCones - 1)
+ {
+ int32_t j = __ffs(shapeMask.mCones) - 1;
+
+ float prevAxisX = mPrevData.mConeAxisX[j];
+ float prevAxisY = mPrevData.mConeAxisY[j];
+ float prevAxisZ = mPrevData.mConeAxisZ[j];
+ float prevSlope = mPrevData.mConeSlope[j];
+
+ // prevT/U/V = cross(prev, prevAxis): distance-from-axis components
+ float prevX = prevPos.x - mPrevData.mConeCenterX[j];
+ float prevY = prevPos.y - mPrevData.mConeCenterY[j];
+ float prevZ = prevPos.z - mPrevData.mConeCenterZ[j];
+ float prevT = prevY * prevAxisZ - prevZ * prevAxisY;
+ float prevU = prevZ * prevAxisX - prevX * prevAxisZ;
+ float prevV = prevX * prevAxisY - prevY * prevAxisX;
+ float prevDot = prevX * prevAxisX + prevY * prevAxisY + prevZ * prevAxisZ;
+ float prevRadius = max(prevDot * prevSlope + mCurData.mConeRadius[j], 0.0f);
+
+ float curAxisX = mCurData.mConeAxisX[j];
+ float curAxisY = mCurData.mConeAxisY[j];
+ float curAxisZ = mCurData.mConeAxisZ[j];
+ float curSlope = mCurData.mConeSlope[j];
+
+ float curX = curPos.x - mCurData.mConeCenterX[j];
+ float curY = curPos.y - mCurData.mConeCenterY[j];
+ float curZ = curPos.z - mCurData.mConeCenterZ[j];
+ float curT = curY * curAxisZ - curZ * curAxisY;
+ float curU = curZ * curAxisX - curX * curAxisZ;
+ float curV = curX * curAxisY - curY * curAxisX;
+ float curDot = curX * curAxisX + curY * curAxisY + curZ * curAxisZ;
+ float curRadius = max(curDot * curSlope + mCurData.mConeRadius[j], 0.0f);
+
+ float curSqrDistance = FLT_EPSILON + curT * curT + curU * curU + curV * curV;
+
+ // quadratic coefficients of |axis-distance(t)|^2 - radius(t)^2 = 0
+ // with t interpolating prev -> cur
+ float dotPrevPrev = prevT * prevT + prevU * prevU + prevV * prevV - prevRadius * prevRadius;
+ float dotPrevCur = prevT * curT + prevU * curU + prevV * curV - prevRadius * curRadius;
+ float dotCurCur = curSqrDistance - curRadius * curRadius;
+
+ float discriminant = dotPrevCur * dotPrevCur - dotCurCur * dotPrevPrev;
+ float sqrtD = sqrtf(discriminant);
+ float halfB = dotPrevCur - dotPrevPrev;
+ float minusA = dotPrevCur - dotCurCur + halfB;
+
+ // time of impact or 0 if prevPos inside cone
+ float toi = __fdividef(min(0.0f, halfB + sqrtD), minusA);
+ bool hasCollision = toi < 1.0f && halfB < sqrtD;
+
+ // skip continuous collision if the (un-clamped) particle
+ // trajectory only touches the outer skin of the cone.
+ float rMin = prevRadius + halfB * minusA * (curRadius - prevRadius);
+ hasCollision = hasCollision && (discriminant > minusA * rMin * rMin * gSkeletonWidth);
+
+ // a is negative when one cone is contained in the other,
+ // which is already handled by discrete collision.
+ hasCollision = hasCollision && minusA < -FLT_EPSILON;
+
+ if(hasCollision)
+ {
+ float deltaX = prevX - curX;
+ float deltaY = prevY - curY;
+ float deltaZ = prevZ - curZ;
+
+ // interpolate delta at toi
+ float posX = prevX - deltaX * toi;
+ float posY = prevY - deltaY * toi;
+ float posZ = prevZ - deltaZ * toi;
+
+ float curHalfLength = mCurData.mConeHalfLength[j];
+ float curScaledAxisX = curAxisX * curHalfLength;
+ float curScaledAxisY = curAxisY * curHalfLength;
+ float curScaledAxisZ = curAxisZ * curHalfLength;
+
+ float prevHalfLength = mPrevData.mConeHalfLength[j];
+ float deltaScaledAxisX = curScaledAxisX - prevAxisX * prevHalfLength;
+ float deltaScaledAxisY = curScaledAxisY - prevAxisY * prevHalfLength;
+ float deltaScaledAxisZ = curScaledAxisZ - prevAxisZ * prevHalfLength;
+
+ float oneMinusToi = 1.0f - toi;
+
+ // interpolate axis at toi
+ float axisX = curScaledAxisX - deltaScaledAxisX * oneMinusToi;
+ float axisY = curScaledAxisY - deltaScaledAxisY * oneMinusToi;
+ float axisZ = curScaledAxisZ - deltaScaledAxisZ * oneMinusToi;
+ float slope = prevSlope * oneMinusToi + curSlope * toi;
+
+ float sqrHalfLength = axisX * axisX + axisY * axisY + axisZ * axisZ;
+ float invHalfLength = rsqrtf(sqrHalfLength);
+ float dot = (posX * axisX + posY * axisY + posZ * axisZ) * invHalfLength;
+
+ float sqrDistance = posX * posX + posY * posY + posZ * posZ - dot * dot;
+ float invDistance = sqrDistance > 0.0f ? rsqrtf(sqrDistance) : 0.0f;
+
+ float base = dot + slope * sqrDistance * invDistance;
+ float scale = base * invHalfLength;
+
+ // only correct if the impact point lies within the cone segment
+ if(abs(scale) < 1.0f)
+ {
+ deltaX = deltaX + deltaScaledAxisX * scale;
+ deltaY = deltaY + deltaScaledAxisY * scale;
+ deltaZ = deltaZ + deltaScaledAxisZ * scale;
+
+ // reduce ccd impulse if (clamped) particle trajectory stays in cone skin,
+ // i.e. scale by exp2(-k) or 1/(1+k) with k = (tmin - toi) / (1 - toi)
+ float minusK = __fdividef(sqrtD, minusA * oneMinusToi);
+ oneMinusToi = __fdividef(oneMinusToi, 1 - minusK);
+
+ curX = curX + deltaX * oneMinusToi;
+ curY = curY + deltaY * oneMinusToi;
+ curZ = curZ + deltaZ * oneMinusToi;
+
+ // refresh discrete-test quantities for the corrected position
+ curDot = curX * curAxisX + curY * curAxisY + curZ * curAxisZ;
+ curRadius = max(curDot * curSlope + mCurData.mConeRadius[j], 0.0f);
+ curSqrDistance = curX * curX + curY * curY + curZ * curZ - curDot * curDot;
+
+ curPos.x = mCurData.mConeCenterX[j] + curX;
+ curPos.y = mCurData.mConeCenterY[j] + curY;
+ curPos.z = mCurData.mConeCenterZ[j] + curZ;
+ }
+ }
+
+ // curPos inside cone (discrete collision)
+ bool hasContact = curRadius * curRadius > curSqrDistance;
+
+ Pointer<Shared, const uint32_t> mIt = mCapsuleMasks + 2 * j;
+ uint32_t bothMask = mIt[1];
+
+ // no cone interaction at all: end spheres can be skipped entirely
+ uint32_t cullMask = bothMask & (hasCollision | hasContact) - 1;
+ shapeMask.mSpheres &= ~cullMask;
+
+ if(!hasContact)
+ continue;
+
+ float invDistance = curSqrDistance > 0.0f ? rsqrtf(curSqrDistance) : 0.0f;
+ float base = curDot + curSlope * curSqrDistance * invDistance;
+
+ float halfLength = mCurData.mConeHalfLength[j];
+ uint32_t leftMask = base < -halfLength;
+ uint32_t rightMask = base > halfLength;
+
+ // can only skip continuous sphere collision if post-ccd position
+ // is on cone side *and* particle had cone-ccd collision.
+ uint32_t firstMask = mIt[0];
+ uint32_t secondMask = firstMask ^ bothMask;
+ cullMask = (firstMask & leftMask - 1) | (secondMask & rightMask - 1);
+ shapeMask.mSpheres &= ~cullMask | hasCollision - 1;
+
+ if(!leftMask && !rightMask)
+ {
+ float deltaX = curX - base * curAxisX;
+ float deltaY = curY - base * curAxisY;
+ float deltaZ = curZ - base * curAxisZ;
+
+ float sqrCosine = mCurData.mConeSqrCosine[j];
+ float scale = curRadius * invDistance * sqrCosine - sqrCosine;
+
+ delta.x = delta.x + deltaX * scale;
+ delta.y = delta.y + deltaY * scale;
+ delta.z = delta.z + deltaZ * scale;
+
+ if(frictionEnabled)
+ {
+ int32_t s0 = mCapsuleIndices[2 * j];
+ int32_t s1 = mCapsuleIndices[2 * j + 1];
+
+ // per-frame displacement of each end sphere (cur - prev)
+ float s0vx = mCurData.mSphereX[s0] - mPrevData.mSphereX[s0];
+ float s0vy = mCurData.mSphereY[s0] - mPrevData.mSphereY[s0];
+ float s0vz = mCurData.mSphereZ[s0] - mPrevData.mSphereZ[s0];
+
+ float s1vx = mCurData.mSphereX[s1] - mPrevData.mSphereX[s1];
+ float s1vy = mCurData.mSphereY[s1] - mPrevData.mSphereY[s1];
+ float s1vz = mCurData.mSphereZ[s1] - mPrevData.mSphereZ[s1];
+
+ // interpolate velocity between the two spheres
+ float t = curDot * 0.5f + 0.5f;
+
+ velocity.x += s0vx + t * (s1vx - s0vx);
+ velocity.y += s0vy + t * (s1vy - s0vy);
+ velocity.z += s0vz + t * (s1vz - s0vz);
+ }
+
+ ++numCollisions;
+ }
+ }
+
+ // sphere collision
+ for(; shapeMask.mSpheres; shapeMask.mSpheres &= shapeMask.mSpheres - 1)
+ {
+ int32_t j = __ffs(shapeMask.mSpheres) - 1;
+
+ float prevX = prevPos.x - mPrevData.mSphereX[j];
+ float prevY = prevPos.y - mPrevData.mSphereY[j];
+ float prevZ = prevPos.z - mPrevData.mSphereZ[j];
+ float prevRadius = mPrevData.mSphereW[j];
+
+ float curX = curPos.x - mCurData.mSphereX[j];
+ float curY = curPos.y - mCurData.mSphereY[j];
+ float curZ = curPos.z - mCurData.mSphereZ[j];
+ float curRadius = mCurData.mSphereW[j];
+
+ float sqrDistance = FLT_EPSILON + curX * curX + curY * curY + curZ * curZ;
+
+ // same quadratic setup as the cone case, against the moving sphere
+ float dotPrevPrev = prevX * prevX + prevY * prevY + prevZ * prevZ - prevRadius * prevRadius;
+ float dotPrevCur = prevX * curX + prevY * curY + prevZ * curZ - prevRadius * curRadius;
+ float dotCurCur = sqrDistance - curRadius * curRadius;
+
+ float discriminant = dotPrevCur * dotPrevCur - dotCurCur * dotPrevPrev;
+ float sqrtD = sqrtf(discriminant);
+ float halfB = dotPrevCur - dotPrevPrev;
+ float minusA = dotPrevCur - dotCurCur + halfB;
+
+ // time of impact or 0 if prevPos inside sphere
+ float toi = __fdividef(min(0.0f, halfB + sqrtD), minusA);
+ bool hasCollision = toi < 1.0f && halfB < sqrtD;
+
+ // skip continuous collision if the (un-clamped) particle
+ // trajectory only touches the outer skin of the sphere.
+ float rMin = prevRadius + halfB * minusA * (curRadius - prevRadius);
+ hasCollision = hasCollision && (discriminant > minusA * rMin * rMin * gSkeletonWidth);
+
+ // a is negative when one sphere is contained in the other,
+ // which is already handled by discrete collision.
+ hasCollision = hasCollision && minusA < -FLT_EPSILON;
+
+ if(hasCollision)
+ {
+ float deltaX = prevX - curX;
+ float deltaY = prevY - curY;
+ float deltaZ = prevZ - curZ;
+
+ float oneMinusToi = 1.0f - toi;
+
+ // reduce ccd impulse if (clamped) particle trajectory stays in sphere skin,
+ // i.e. scale by exp2(-k) or 1/(1+k) with k = (tmin - toi) / (1 - toi)
+ float minusK = __fdividef(sqrtD, minusA * oneMinusToi);
+ oneMinusToi = __fdividef(oneMinusToi, 1 - minusK);
+
+ curX = curX + deltaX * oneMinusToi;
+ curY = curY + deltaY * oneMinusToi;
+ curZ = curZ + deltaZ * oneMinusToi;
+
+ curPos.x = mCurData.mSphereX[j] + curX;
+ curPos.y = mCurData.mSphereY[j] + curY;
+ curPos.z = mCurData.mSphereZ[j] + curZ;
+
+ sqrDistance = FLT_EPSILON + curX * curX + curY * curY + curZ * curZ;
+ }
+
+ // discrete contact against the corrected position
+ float relDistance = rsqrtf(sqrDistance) * curRadius;
+
+ if(relDistance > 1.0f)
+ {
+ float scale = relDistance - 1.0f;
+
+ delta.x = delta.x + curX * scale;
+ delta.y = delta.y + curY * scale;
+ delta.z = delta.z + curZ * scale;
+
+ if(frictionEnabled)
+ {
+ velocity.x += mCurData.mSphereX[j] - mPrevData.mSphereX[j];
+ velocity.y += mCurData.mSphereY[j] - mPrevData.mSphereY[j];
+ velocity.z += mCurData.mSphereZ[j] - mPrevData.mSphereZ[j];
+ }
+
+ ++numCollisions;
+ }
+ }
+
+ return numCollisions;
+}
+
+namespace
+{
+// Compute a friction impulse opposing the particle's velocity relative to the
+// colliding shape, in the plane tangent to the collision normal (which is the
+// direction of the accumulated collisionImpulse). 'scale' is 1/numCollisions,
+// applied so averaged quantities match the collision response. The impulse
+// magnitude is clamped to -1 * |vt|, i.e. friction can at most cancel the
+// relative tangential motion.
+template <typename PrevPos, typename CurPos>
+__device__ inline float3 calcFrictionImpulse(const PrevPos& prevPos, const CurPos& curPos, const float3& shapeVelocity,
+ float scale, const float3& collisionImpulse)
+{
+ const float frictionScale = gClothData.mFrictionScale;
+
+ // calculate collision normal
+ float deltaSq = collisionImpulse.x * collisionImpulse.x + collisionImpulse.y * collisionImpulse.y +
+ collisionImpulse.z * collisionImpulse.z;
+
+ float rcpDelta = rsqrtf(deltaSq + FLT_EPSILON);
+
+ float nx = collisionImpulse.x * rcpDelta;
+ float ny = collisionImpulse.y * rcpDelta;
+ float nz = collisionImpulse.z * rcpDelta;
+
+ // calculate relative velocity scaled by number of collision
+ float rvx = curPos.x - prevPos.x - shapeVelocity.x * scale;
+ float rvy = curPos.y - prevPos.y - shapeVelocity.y * scale;
+ float rvz = curPos.z - prevPos.z - shapeVelocity.z * scale;
+
+ // calculate magnitude of relative normal velocity
+ float rvn = rvx * nx + rvy * ny + rvz * nz;
+
+ // calculate relative tangential velocity
+ float rvtx = rvx - rvn * nx;
+ float rvty = rvy - rvn * ny;
+ float rvtz = rvz - rvn * nz;
+
+ // calculate magnitude of vt
+ float rcpVt = rsqrtf(rvtx * rvtx + rvty * rvty + rvtz * rvtz + FLT_EPSILON);
+
+ // magnitude of friction impulse (cannot be larger than -|vt|)
+ float j = max(-frictionScale * deltaSq * rcpDelta * scale * rcpVt, -1.0f);
+
+ return make_float3(rvtx * j, rvty * j, rvtz * j);
+}
+}
+
+// Per-block driver for discrete capsule collision: each thread walks its
+// stride of particles, applies the averaged pushout to the current position,
+// applies friction to the previous position (which changes the implied
+// velocity of the Verlet integration), and optionally scales down inverse
+// mass (stored in component 3) for colliding particles.
+template <typename CurrentT, typename PreviousT>
+__device__ void CuCollision::collideCapsules(CurrentT& current, PreviousT& previous) const
+{
+ bool frictionEnabled = gClothData.mFrictionScale > 0.0f;
+ bool massScaleEnabled = gClothData.mCollisionMassScale > 0.0f;
+
+ // block-stride loop over all particles
+ for(int32_t i = threadIdx.x; i < gClothData.mNumParticles; i += blockDim.x)
+ {
+ typename CurrentT::VectorType curPos = current(i);
+
+ float3 delta, velocity;
+ if(int32_t numCollisions = collideCapsules(curPos, delta, velocity))
+ {
+ // average the accumulated impulses over the contact count
+ float scale = __fdividef(1.0f, numCollisions);
+
+ if(frictionEnabled)
+ {
+ typename PreviousT::VectorType prevPos = previous(i);
+ float3 frictionImpulse = calcFrictionImpulse(prevPos, curPos, velocity, scale, delta);
+
+ prevPos.x -= frictionImpulse.x;
+ prevPos.y -= frictionImpulse.y;
+ prevPos.z -= frictionImpulse.z;
+
+ previous(i) = prevPos;
+ }
+
+ curPos.x += delta.x * scale;
+ curPos.y += delta.y * scale;
+ curPos.z += delta.z * scale;
+
+ current(i) = curPos;
+
+ if(massScaleEnabled)
+ {
+ // heavier response for particles pushed further out
+ float deltaLengthSq = delta.x * delta.x + delta.y * delta.y + delta.z * delta.z;
+ float massScale = 1.0f + gClothData.mCollisionMassScale * deltaLengthSq;
+ current(i, 3) = __fdividef(current(i, 3), massScale);
+ }
+ }
+ }
+}
+
+namespace
+{
+// Barycentric interpolation of one coordinate: weighted sum of the three
+// anchor particles' values (weights.w is not used here).
+template <typename PointerT>
+__device__ float lerp(PointerT pos, const int4& indices, const float4& weights)
+{
+ return pos[indices.x] * weights.x + pos[indices.y] * weights.y + pos[indices.z] * weights.z;
+}
+
+// Distribute a scalar correction back onto the three anchor particles,
+// proportionally to their barycentric weights.
+template <typename PointerT>
+__device__ void apply(PointerT pos, const int4& indices, const float4& weights, float delta)
+{
+ pos[indices.x] += delta * weights.x;
+ pos[indices.y] += delta * weights.y;
+ pos[indices.z] += delta * weights.z;
+}
+}
+
+// Collide virtual particles (barycentric combinations of three mesh particles)
+// against capsules. indices.xyz address the anchor particles, indices.w
+// selects the weight quadruple; weights.w scales the correction applied back
+// to the anchors. Sets are processed one at a time with a barrier in between
+// because virtual particles of different sets may share anchor particles.
+// Must be called by all threads of the block: contains __syncthreads().
+template <typename CurrentT, typename PreviousT>
+__device__ void CuCollision::collideVirtualCapsules(CurrentT& current, PreviousT& previous) const
+{
+ const uint32_t* __restrict setSizeIt = gClothData.mVirtualParticleSetSizesBegin;
+
+ // no virtual particles
+ if(!setSizeIt)
+ return;
+
+ if(gClothData.mEnableContinuousCollision)
+ {
+ // copied from mergeAcceleration
+ Pointer<Shared, uint32_t> dst = mShapeGrid + threadIdx.x;
+ if(!(threadIdx.x * 43 & 1024) && threadIdx.x < sGridSize * 12)
+ *dst &= dst[sGridSize * 3];
+ __syncthreads(); // mShapeGrid raw hazard
+ }
+
+ const uint32_t* __restrict setSizeEnd = gClothData.mVirtualParticleSetSizesEnd;
+ const uint16_t* __restrict indicesEnd = gClothData.mVirtualParticleIndices;
+ const float4* __restrict weightsIt = reinterpret_cast<const float4*>(gClothData.mVirtualParticleWeights);
+
+ bool frictionEnabled = gClothData.mFrictionScale > 0.0f;
+ bool massScaleEnabled = gClothData.mCollisionMassScale > 0.0f;
+
+ for(; setSizeIt != setSizeEnd; ++setSizeIt)
+ {
+ // sets are independent internally, but not of each other
+ __syncthreads();
+
+ const uint16_t* __restrict indicesIt = indicesEnd + threadIdx.x * 4;
+ for(indicesEnd += *setSizeIt * 4; indicesIt < indicesEnd; indicesIt += blockDim.x * 4)
+ {
+ int4 indices = make_int4(indicesIt[0], indicesIt[1], indicesIt[2], indicesIt[3]);
+
+ float4 weights = weightsIt[indices.w];
+
+ // interpolate virtual particle position from its three anchors
+ float3 curPos;
+ curPos.x = lerp(current[0], indices, weights);
+ curPos.y = lerp(current[1], indices, weights);
+ curPos.z = lerp(current[2], indices, weights);
+
+ float3 delta, velocity;
+ if(int32_t numCollisions = collideCapsules(curPos, delta, velocity))
+ {
+ float scale = __fdividef(1.0f, numCollisions);
+ float wscale = weights.w * scale;
+
+ apply(current[0], indices, weights, delta.x * wscale);
+ apply(current[1], indices, weights, delta.y * wscale);
+ apply(current[2], indices, weights, delta.z * wscale);
+
+ if(frictionEnabled)
+ {
+ float3 prevPos;
+ prevPos.x = lerp(previous[0], indices, weights);
+ prevPos.y = lerp(previous[1], indices, weights);
+ prevPos.z = lerp(previous[2], indices, weights);
+
+ float3 frictionImpulse = calcFrictionImpulse(prevPos, curPos, velocity, scale, delta);
+
+ apply(previous[0], indices, weights, frictionImpulse.x * -weights.w);
+ apply(previous[1], indices, weights, frictionImpulse.y * -weights.w);
+ apply(previous[2], indices, weights, frictionImpulse.z * -weights.w);
+ }
+
+ if(massScaleEnabled)
+ {
+ float deltaLengthSq = (delta.x * delta.x + delta.y * delta.y + delta.z * delta.z) * scale * scale;
+ float invMassScale = __fdividef(1.0f, 1.0f + gClothData.mCollisionMassScale * deltaLengthSq);
+
+ // not multiplying by weights[3] here because unlike applying velocity
+ // deltas where we want the interpolated position to obtain a particular
+ // value, we instead just require that the total change is equal to invMassScale
+ invMassScale = invMassScale - 1.0f;
+ current(indices.x, 3) *= 1.0f + weights.x * invMassScale;
+ current(indices.y, 3) *= 1.0f + weights.y * invMassScale;
+ current(indices.z, 3) *= 1.0f + weights.z * invMassScale;
+ }
+ }
+ }
+ }
+}
+
+// Per-block driver for continuous capsule collision: same structure as the
+// discrete driver, but passes each particle's previous position to the CCD
+// overload of collideCapsules (which may also rewrite curPos on impact).
+template <typename CurrentT, typename PreviousT>
+__device__ void CuCollision::collideContinuousCapsules(CurrentT& current, PreviousT& previous) const
+{
+ bool frictionEnabled = gClothData.mFrictionScale > 0.0f;
+ bool massScaleEnabled = gClothData.mCollisionMassScale > 0.0f;
+
+ // block-stride loop over all particles
+ for(int32_t i = threadIdx.x; i < gClothData.mNumParticles; i += blockDim.x)
+ {
+ typename PreviousT::VectorType prevPos = previous(i);
+ typename CurrentT::VectorType curPos = current(i);
+
+ float3 delta, velocity;
+ if(int32_t numCollisions = collideCapsules(prevPos, curPos, delta, velocity))
+ {
+ float scale = __fdividef(1.0f, numCollisions);
+
+ if(frictionEnabled)
+ {
+ float3 frictionImpulse = calcFrictionImpulse(prevPos, curPos, velocity, scale, delta);
+
+ prevPos.x -= frictionImpulse.x;
+ prevPos.y -= frictionImpulse.y;
+ prevPos.z -= frictionImpulse.z;
+
+ previous(i) = prevPos;
+ }
+
+ curPos.x += delta.x * scale;
+ curPos.y += delta.y * scale;
+ curPos.z += delta.z * scale;
+
+ current(i) = curPos;
+
+ if(massScaleEnabled)
+ {
+ // heavier response for particles pushed further out
+ float deltaLengthSq = delta.x * delta.x + delta.y * delta.y + delta.z * delta.z;
+ float massScale = 1.0f + gClothData.mCollisionMassScale * deltaLengthSq;
+ current(i, 3) = __fdividef(current(i, 3), massScale);
+ }
+ }
+ }
+}
+
+// Collide one particle against all convexes. Each convex is a bit mask over
+// planes (stored transposed in mCurData.mSphereX by the driver below); a
+// particle is inside a convex only if it is behind every plane, so we search
+// for the least-negative (maximum) signed plane distance and push out along
+// that plane's normal when it is still negative. Accumulates the pushout in
+// 'delta' and returns the number of penetrated convexes.
+template <typename CurPos>
+__device__ int32_t CuCollision::collideConvexes(const CurPos& positions, float3& delta) const
+{
+ delta.x = delta.y = delta.z = 0.0f;
+
+ // planes stored structure-of-arrays: x, y, z normals then w offsets
+ Pointer<Shared, const float> planeX = mCurData.mSphereX;
+ Pointer<Shared, const float> planeY = planeX + gClothData.mNumPlanes;
+ Pointer<Shared, const float> planeZ = planeY + gClothData.mNumPlanes;
+ Pointer<Shared, const float> planeW = planeZ + gClothData.mNumPlanes;
+
+ int32_t numCollisions = 0;
+ Pointer<Shared, const uint32_t> cIt = mConvexMasks;
+ Pointer<Shared, const uint32_t> cEnd = cIt + gClothData.mNumConvexes;
+ for(; cIt != cEnd; ++cIt)
+ {
+ uint32_t mask = *cIt;
+
+ int32_t maxIndex = __ffs(mask) - 1;
+ float maxDist = planeW[maxIndex] + positions.z * planeZ[maxIndex] + positions.y * planeY[maxIndex] +
+ positions.x * planeX[maxIndex];
+
+ // scan remaining planes; stop early once any distance is non-negative
+ // (the particle is outside the convex)
+ while((maxDist < 0.0f) && (mask &= mask - 1))
+ {
+ int32_t i = __ffs(mask) - 1;
+ float dist = planeW[i] + positions.z * planeZ[i] + positions.y * planeY[i] + positions.x * planeX[i];
+ if(dist > maxDist)
+ maxDist = dist, maxIndex = i;
+ }
+
+ if(maxDist < 0.0f)
+ {
+ // inside: push out through the closest face
+ delta.x -= planeX[maxIndex] * maxDist;
+ delta.y -= planeY[maxIndex] * maxDist;
+ delta.z -= planeZ[maxIndex] * maxDist;
+
+ ++numCollisions;
+ }
+ }
+
+ return numCollisions;
+}
+
+// Per-block driver for convex collision: interpolates the collision planes
+// between their start and target transforms at 'alpha', stores them
+// transposed (SoA) into the shared mSphereX scratch buffer, then pushes every
+// particle out of any convex it penetrates. Must be called by all threads of
+// the block: contains __syncthreads().
+template <typename CurrentT, typename PreviousT>
+__device__ void CuCollision::collideConvexes(CurrentT& current, PreviousT& previous, float alpha)
+{
+ if(!gClothData.mNumConvexes)
+ return;
+
+ // interpolate planes and transpose
+ if(threadIdx.x < gClothData.mNumPlanes * 4)
+ {
+ float start = gFrameData.mStartCollisionPlanes[threadIdx.x];
+ float target = gFrameData.mTargetCollisionPlanes[threadIdx.x];
+ // AoS (x,y,z,w per plane) -> SoA (all x, all y, all z, all w)
+ int32_t j = threadIdx.x % 4 * gClothData.mNumPlanes + threadIdx.x / 4;
+ mCurData.mSphereX[j] = start + (target - start) * alpha;
+ }
+
+ __syncthreads();
+
+ bool frictionEnabled = gClothData.mFrictionScale > 0.0f;
+
+ // block-stride loop over all particles
+ for(int32_t i = threadIdx.x; i < gClothData.mNumParticles; i += blockDim.x)
+ {
+ typename CurrentT::VectorType curPos = current(i);
+
+ float3 delta;
+ if(int32_t numCollisions = collideConvexes(curPos, delta))
+ {
+ float scale = __fdividef(1.0f, numCollisions);
+
+ if(frictionEnabled)
+ {
+ typename PreviousT::VectorType prevPos = previous(i);
+
+ // convexes are treated as static: zero shape velocity
+ float3 frictionImpulse =
+ calcFrictionImpulse(prevPos, curPos, make_float3(0.0f, 0.0f, 0.0f), scale, delta);
+
+ prevPos.x -= frictionImpulse.x;
+ prevPos.y -= frictionImpulse.y;
+ prevPos.z -= frictionImpulse.z;
+
+ previous(i) = prevPos;
+ }
+
+ curPos.x += delta.x * scale;
+ curPos.y += delta.y * scale;
+ curPos.z += delta.z * scale;
+
+ current(i) = curPos;
+ }
+ }
+
+ __syncthreads();
+}
+
+namespace
+{
+// Precomputed per-triangle quantities for the closest-point-on-triangle query
+// in collideTriangles(). The driver stores the three vertices into the first
+// 9 floats, then initialize() converts them in place to base + two edges and
+// derives the normal and the cached dot products / reciprocals used when
+// solving for barycentric coordinates.
+struct TriangleData
+{
+ float baseX, baseY, baseZ;
+ float edge0X, edge0Y, edge0Z;
+ float edge1X, edge1Y, edge1Z;
+ float normalX, normalY, normalZ;
+
+ float edge0DotEdge1;
+ float edge0SqrLength;
+ float edge1SqrLength;
+
+ float det; // 1 / (|e0|^2 |e1|^2 - (e0.e1)^2)
+ float denom; // 1 / (|e0|^2 + |e1|^2 - 2 e0.e1)
+
+ float edge0InvSqrLength;
+ float edge1InvSqrLength;
+
+ // initialize struct after vertices have been stored in first 9 members
+ __device__ void initialize()
+ {
+ // convert vertices 1 and 2 to edges relative to vertex 0
+ edge0X -= baseX, edge0Y -= baseY, edge0Z -= baseZ;
+ edge1X -= baseX, edge1Y -= baseY, edge1Z -= baseZ;
+
+ // unit normal = normalize(e0 x e1)
+ normalX = edge0Y * edge1Z - edge0Z * edge1Y;
+ normalY = edge0Z * edge1X - edge0X * edge1Z;
+ normalZ = edge0X * edge1Y - edge0Y * edge1X;
+
+ float normalInvLength = rsqrtf(normalX * normalX + normalY * normalY + normalZ * normalZ);
+ normalX *= normalInvLength;
+ normalY *= normalInvLength;
+ normalZ *= normalInvLength;
+
+ edge0DotEdge1 = edge0X * edge1X + edge0Y * edge1Y + edge0Z * edge1Z;
+ edge0SqrLength = edge0X * edge0X + edge0Y * edge0Y + edge0Z * edge0Z;
+ edge1SqrLength = edge1X * edge1X + edge1Y * edge1Y + edge1Z * edge1Z;
+
+ det = __fdividef(1.0f, edge0SqrLength * edge1SqrLength - edge0DotEdge1 * edge0DotEdge1);
+ denom = __fdividef(1.0f, edge0SqrLength + edge1SqrLength - edge0DotEdge1 - edge0DotEdge1);
+
+ edge0InvSqrLength = __fdividef(1.0f, edge0SqrLength);
+ edge1InvSqrLength = __fdividef(1.0f, edge1SqrLength);
+ }
+};
+}
+
+// Collide one particle against all collision triangles: finds the triangle
+// whose surface is closest to the particle (via clamped barycentric
+// projection), and if the particle is behind that triangle's plane
+// (normalD < 0), projects it back onto the plane along the normal.
+template <typename CurrentT>
+__device__ void CuCollision::collideTriangles(CurrentT& current, int32_t i)
+{
+ float posX = current(i, 0);
+ float posY = current(i, 1);
+ float posZ = current(i, 2);
+
+ // triangles were prepared in the shared scratch buffer by the driver below
+ const TriangleData* __restrict tIt = reinterpret_cast<const TriangleData*>(generic(mCurData.mSphereX));
+ const TriangleData* __restrict tEnd = tIt + gClothData.mNumCollisionTriangles;
+
+ float normalX, normalY, normalZ, normalD = 0.0f;
+ float minSqrLength = FLT_MAX;
+
+ for(; tIt != tEnd; ++tIt)
+ {
+ float dx = posX - tIt->baseX;
+ float dy = posY - tIt->baseY;
+ float dz = posZ - tIt->baseZ;
+
+ float deltaDotEdge0 = dx * tIt->edge0X + dy * tIt->edge0Y + dz * tIt->edge0Z;
+ float deltaDotEdge1 = dx * tIt->edge1X + dy * tIt->edge1Y + dz * tIt->edge1Z;
+ float deltaDotNormal = dx * tIt->normalX + dy * tIt->normalY + dz * tIt->normalZ;
+
+ // barycentric coordinates (s, t) of the projection onto the triangle
+ float s = tIt->edge1SqrLength * deltaDotEdge0 - tIt->edge0DotEdge1 * deltaDotEdge1;
+ float t = tIt->edge0SqrLength * deltaDotEdge1 - tIt->edge0DotEdge1 * deltaDotEdge0;
+
+ // fall back to per-edge projection when the interior solution is off-triangle
+ s = t > 0.0f ? s * tIt->det : deltaDotEdge0 * tIt->edge0InvSqrLength;
+ t = s > 0.0f ? t * tIt->det : deltaDotEdge1 * tIt->edge1InvSqrLength;
+
+ if(s + t > 1.0f)
+ {
+ // closest point lies on the diagonal edge
+ s = (tIt->edge1SqrLength - tIt->edge0DotEdge1 + deltaDotEdge0 - deltaDotEdge1) * tIt->denom;
+ }
+
+ s = fmaxf(0.0f, fminf(1.0f, s));
+ t = fmaxf(0.0f, fminf(1.0f - s, t));
+
+ // vector from clamped closest point to the particle
+ dx = dx - tIt->edge0X * s - tIt->edge1X * t;
+ dy = dy - tIt->edge0Y * s - tIt->edge1Y * t;
+ dz = dz - tIt->edge0Z * s - tIt->edge1Z * t;
+
+ float sqrLength = dx * dx + dy * dy + dz * dz;
+
+ // slightly penalize back-facing distances so a front-side triangle
+ // wins near-ties
+ if(0.0f > deltaDotNormal)
+ sqrLength *= 1.0001f;
+
+ if(sqrLength < minSqrLength)
+ {
+ normalX = tIt->normalX;
+ normalY = tIt->normalY;
+ normalZ = tIt->normalZ;
+ normalD = deltaDotNormal;
+ minSqrLength = sqrLength;
+ }
+ }
+
+ // behind the closest triangle's plane: project back onto it
+ if(normalD < 0.0f)
+ {
+ current(i, 0) = posX - normalX * normalD;
+ current(i, 1) = posY - normalY * normalD;
+ current(i, 2) = posZ - normalZ * normalD;
+ }
+}
+
+namespace
+{
+// number of derived floats in TriangleData beyond the 9 raw vertex floats;
+// used below to compute each triangle's storage offset in the scratch buffer
+static const int32_t sTrianglePadding = sizeof(TriangleData) / sizeof(float) - 9;
+}
+
+// Per-block driver for triangle collision: interpolates the 9 vertex floats
+// of each collision triangle at 'alpha' into the shared scratch buffer
+// (leaving gaps for the derived TriangleData members), initializes the
+// derived members, then collides every particle. Must be called by all
+// threads of the block: contains __syncthreads().
+template <typename CurrentT>
+__device__ void CuCollision::collideTriangles(CurrentT& current, float alpha)
+{
+ if(!gClothData.mNumCollisionTriangles)
+ return;
+
+ // interpolate triangle vertices and store in shared memory
+ for(int32_t i = threadIdx.x, n = gClothData.mNumCollisionTriangles * 9; i < n; i += blockDim.x)
+ {
+ float start = gFrameData.mStartCollisionTriangles[i];
+ float target = gFrameData.mTargetCollisionTriangles[i];
+ int32_t idx = i * 7282 >> 16; // same as i/9 (valid for this index range)
+ int32_t offset = i + idx * sTrianglePadding; // skip derived-member slots
+ mCurData.mSphereX[offset] = start + (target - start) * alpha;
+ }
+
+ __syncthreads();
+
+ // compute edges, normal, and cached products in place
+ for(int32_t i = threadIdx.x; i < gClothData.mNumCollisionTriangles; i += blockDim.x)
+ {
+ reinterpret_cast<TriangleData*>(generic(mCurData.mSphereX))[i].initialize();
+ }
+
+ __syncthreads();
+
+ for(int32_t i = threadIdx.x; i < gClothData.mNumParticles; i += blockDim.x)
+ collideTriangles(current, i);
+
+ __syncthreads();
+}
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/windows/CuContextLock.cpp b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuContextLock.cpp
new file mode 100644
index 00000000..2ccc3db9
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuContextLock.cpp
@@ -0,0 +1,54 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#include "cudamanager/PxCudaContextManager.h"
+#include "CuContextLock.h"
+#include "CuFactory.h"
+
+using namespace physx;
+
+// Acquires the factory's CUDA context on construction; the destructor
+// releases it (RAII pairing, see ~CuContextLock below).
+cloth::CuContextLock::CuContextLock(const CuFactory& factory) : mFactory(factory)
+{
+	acquire();
+}
+
+cloth::CuContextLock::~CuContextLock()
+{
+	release();
+}
+
+// (Re-)acquire the factory's CUDA context on the calling thread.
+void cloth::CuContextLock::acquire()
+{
+	mFactory.mContextManager->acquireContext();
+}
+
+// Release the factory's CUDA context; balances acquire() (including the one
+// made by the constructor).
+void cloth::CuContextLock::release()
+{
+	mFactory.mContextManager->releaseContext();
+}
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/windows/CuContextLock.h b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuContextLock.h
new file mode 100644
index 00000000..50e48b49
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuContextLock.h
@@ -0,0 +1,57 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+namespace physx
+{
+
+namespace cloth
+{
+
+class CuFactory;
+
+// Acquires the CUDA context for the lifetime of the instance (RAII).
+// acquire()/release() allow manual re-acquire/early release — e.g. CuFabric's
+// constructor releases the context once its device uploads are done.
+// Copy constructor and copy assignment are declared protected and left
+// undefined, making the class non-copyable.
+class CuContextLock
+{
+  protected:
+	CuContextLock(const CuContextLock&);
+	CuContextLock& operator=(const CuContextLock&);
+
+  public:
+	CuContextLock(const CuFactory&);
+	~CuContextLock();
+
+	void acquire();
+	void release();
+
+	// factory whose context manager is acquired/released
+	const CuFactory& mFactory;
+};
+}
+}
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/windows/CuDevicePointer.h b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuDevicePointer.h
new file mode 100644
index 00000000..cb37b39d
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuDevicePointer.h
@@ -0,0 +1,216 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#include <cuda.h>
+#include "CuCheckSuccess.h"
+
+namespace physx
+{
+
+namespace cloth
+{
+
+// Minimal metafunction stripping a top-level const qualifier
+// (pre-C++11 stand-in for std::remove_const).
+template <typename T>
+struct RemoveConst
+{
+	typedef T Type;
+};
+template <typename T>
+struct RemoveConst<const T>
+{
+	typedef T Type;
+};
+
+template <typename>
+class CuDeviceReference; // forward declare
+
+// pointer to POD type in CUDA device memory. Mimics raw-pointer arithmetic
+// on the host side; dereferencing yields a CuDeviceReference proxy whose
+// reads/writes translate to cuMemcpy calls.
+template <typename T>
+class CuDevicePointer
+{
+	template <typename>
+	friend class CuDevicePointer;
+
+	typedef typename RemoveConst<T>::Type ValueType;
+
+  public:
+	// c'tors
+	CuDevicePointer() : mPointer(0)
+	{
+	}
+	template <class U>
+	explicit CuDevicePointer(U* ptr)
+	: mPointer(ptr)
+	{
+	}
+	CuDevicePointer(const CuDevicePointer<ValueType>& ptr) : mPointer(ptr.get())
+	{
+	}
+
+	// conversion (static_cast restricts this to convertible pointee types,
+	// e.g. T* -> const T*)
+	template <typename U>
+	operator CuDevicePointer<U>(void) const
+	{
+		return CuDevicePointer<U>(static_cast<U*>(mPointer));
+	}
+	// raw stored address; points into device memory, not host-dereferenceable
+	T* get() const
+	{
+		return mPointer;
+	}
+	// same address as a CUdeviceptr for driver API calls
+	CUdeviceptr dev() const
+	{
+		return reinterpret_cast<CUdeviceptr>(mPointer);
+	}
+
+	// operators: plain pointer arithmetic on the stored address
+	CuDevicePointer operator+(const ptrdiff_t& rhs) const
+	{
+		return CuDevicePointer(mPointer + rhs);
+	}
+	CuDevicePointer operator-(const ptrdiff_t& rhs) const
+	{
+		return CuDevicePointer(mPointer - rhs);
+	}
+	CuDevicePointer& operator++(void)
+	{
+		++mPointer;
+		return *this;
+	}
+	CuDevicePointer operator++(int)
+	{
+		CuDevicePointer copy(*this);
+		++(*this);
+		return copy;
+	}
+	CuDevicePointer& operator--(void)
+	{
+		--mPointer;
+		return *this;
+	}
+	CuDevicePointer operator--(int)
+	{
+		CuDevicePointer copy(*this);
+		--(*this);
+		return copy;
+	}
+	CuDevicePointer& operator+=(ptrdiff_t rhs)
+	{
+		mPointer += rhs;
+		return *this;
+	}
+	CuDevicePointer& operator-=(ptrdiff_t rhs)
+	{
+		mPointer -= rhs;
+		return *this;
+	}
+	ptrdiff_t operator-(const CuDevicePointer& rhs) const
+	{
+		return mPointer - rhs.mPointer;
+	}
+
+	template <typename U>
+	bool operator==(const CuDevicePointer<U>& other) const
+	{
+		return mPointer == other.mPointer;
+	}
+	template <typename U>
+	bool operator!=(const CuDevicePointer<U>& other) const
+	{
+		return mPointer != other.mPointer;
+	}
+
+	// dereference: element access copies through a CuDeviceReference proxy
+	CuDeviceReference<T> operator[](const ptrdiff_t&) const; // (implemented below)
+	CuDeviceReference<T> operator*(void) const
+	{
+		return operator[](0);
+	}
+
+  private:
+	T* mPointer;
+};
+
+// Proxy for a single element in device memory, produced by dereferencing a
+// CuDevicePointer. Writes copy host->device (cuMemcpyHtoD), reference-to-
+// reference assignment copies device->device (cuMemcpyDtoD), and reading via
+// the conversion operator copies device->host (cuMemcpyDtoH).
+template <typename T>
+class CuDeviceReference
+{
+	template <typename>
+	friend class CuDeviceReference;
+	template <typename>
+	friend class CuDevicePointer;
+
+	typedef typename RemoveConst<T>::Type ValueType;
+
+	// private: only CuDevicePointer (a friend) may create a reference from
+	// a pointer
+	template <typename U>
+	CuDeviceReference(CuDevicePointer<U> pointer)
+	: mPointer(static_cast<T*>(pointer.get()))
+	{
+	}
+
+  public:
+	template <typename U>
+	CuDeviceReference(CuDeviceReference<U> reference)
+	: mPointer(static_cast<T*>(reference.mPointer))
+	{
+	}
+
+	// address-of returns the originating device pointer
+	CuDevicePointer<T> operator&() const
+	{
+		return CuDevicePointer<T>(mPointer);
+	}
+
+	// write one element from host memory
+	CuDeviceReference& operator=(const T& v)
+	{
+		checkSuccess(cuMemcpyHtoD(CUdeviceptr(mPointer), &v, sizeof(T)));
+		return *this;
+	}
+	// copy one element between two device locations
+	CuDeviceReference& operator=(const CuDeviceReference& ref)
+	{
+		checkSuccess(cuMemcpyDtoD(CUdeviceptr(mPointer), CUdeviceptr(ref.mPointer), sizeof(T)));
+		return *this;
+	}
+	// read one element back to the host
+	operator ValueType() const
+	{
+		ValueType result;
+		checkSuccess(cuMemcpyDtoH(&result, CUdeviceptr(mPointer), sizeof(T)));
+		return result;
+	}
+
+  private:
+	T* mPointer;
+};
+}
+
+// Out-of-line so the complete CuDeviceReference definition is available;
+// returns a reference proxy to element i.
+template <typename T>
+cloth::CuDeviceReference<T> cloth::CuDevicePointer<T>::operator[](const ptrdiff_t& i) const
+{
+	return CuDeviceReference<T>(*this + i);
+}
+}
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/windows/CuDeviceVector.h b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuDeviceVector.h
new file mode 100644
index 00000000..e3997d26
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuDeviceVector.h
@@ -0,0 +1,258 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#include "foundation/PxMath.h" // for swap
+#include "cudamanager/PxCudaMemoryManager.h"
+#include "cudamanager/PxCudaContextManager.h"
+#include "CuDevicePointer.h"
+#include "PsArray.h"
+#include "PsUtilities.h"
+
+namespace physx
+{
+#if PX_VC
+#pragma warning(push)
+#pragma warning(disable : 4365) // 'action' : conversion from 'type_1' to 'type_2', signed/unsigned mismatch
+#endif
+
+namespace cloth
+{
+
+// STL-style vector that holds POD types in CUDA device memory. The interface
+// is not complete, add whatever you need from the std::vector interface.
+// mFirst/mLast/mEnd are device pointers (default-constructed to null).
+// Per-element access goes through CuDeviceReference proxies, i.e. each
+// dereference is a cuMemcpy — prefer the bulk assign()/push_back(first,last).
+template <typename T>
+class CuDeviceVector
+{
+  public:
+	typedef CuDevicePointer<T> iterator;
+	typedef CuDevicePointer<const T> const_iterator;
+
+	CuDeviceVector(physx::PxCudaContextManager* ctx) : mManager(0)
+	{
+		PX_ASSERT(ctx);
+
+		if(ctx)
+			mManager = ctx->getMemoryManager();
+	}
+
+	// copy: shares the memory manager and deep-copies elements on the device
+	CuDeviceVector(const CuDeviceVector& other) : mManager(other.getMemoryManager())
+	{
+		PX_ASSERT(mManager);
+
+		operator=(other);
+	}
+
+	// uploads the host range [first, last)
+	CuDeviceVector(physx::PxCudaContextManager* ctx, const T* first, const T* last) : mManager(0)
+	{
+		PX_ASSERT(ctx);
+
+		if(ctx)
+		{
+			mManager = ctx->getMemoryManager();
+			assign(first, last);
+		}
+	}
+
+	// NOTE(review): mManager is left uninitialized here, yet operator= will
+	// allocate through it for a non-empty 'other'. Looks safe only for empty
+	// arrays — verify all callers of this overload.
+	template <typename Alloc>
+	CuDeviceVector(const shdfnd::Array<T, Alloc>& other)
+	{
+		operator=(other);
+	}
+
+	~CuDeviceVector()
+	{
+		PX_ASSERT(mManager);
+
+		// assumes free(0) is a no-op for never-allocated vectors — TODO confirm
+		mManager->free(physx::PxCudaBufferMemorySpace::T_GPU, mFirst.dev());
+	}
+
+	// device-to-device copy; reuses existing capacity when large enough
+	CuDeviceVector& operator=(const CuDeviceVector& other)
+	{
+		resize(other.size());
+		checkSuccess(cuMemcpyDtoD(mFirst.dev(), other.mFirst.dev(), other.size() * sizeof(T)));
+		return *this;
+	}
+
+	// replaces contents with an upload of the host-side array
+	template <typename Alloc>
+	CuDeviceVector& operator=(const shdfnd::Array<T, Alloc>& other)
+	{
+		const T* first = other.empty() ? 0 : &other.front();
+		assign(first, first + other.size());
+		return *this;
+	}
+
+	bool empty() const
+	{
+		return mLast == mFirst;
+	}
+	size_t size() const
+	{
+		return size_t(mLast - mFirst);
+	}
+	size_t capacity() const
+	{
+		return mEnd - mFirst;
+	}
+
+	iterator begin()
+	{
+		return mFirst;
+	}
+	iterator end()
+	{
+		return mLast;
+	}
+	const_iterator begin() const
+	{
+		return mFirst;
+	}
+	const_iterator end() const
+	{
+		return mLast;
+	}
+
+	// appends one element (host->device copy via the reference proxy);
+	// grows geometrically when full
+	void push_back(const T& v)
+	{
+		if(mLast == mEnd)
+			reserve(PxMax<size_t>(1, capacity() * 2));
+
+		*mLast++ = v;
+	}
+
+	// appends the host range [first, last) in a single memcpy
+	void push_back(const T* first, const T* last)
+	{
+		if(mEnd - mLast < last - first)
+			reserve(PxMax<size_t>(2 * capacity(), mLast - mFirst + last - first));
+
+		if(first != last)
+			checkSuccess(cuMemcpyHtoD(mLast.dev(), first, sizeof(T) * (last - first)));
+
+		mLast += last - first;
+	}
+
+	// removes the element at 'it', shifting the tail left by one; bounces
+	// through a temporary device buffer because the two cuMemcpyDtoD ranges
+	// overlap
+	void erase(iterator it)
+	{
+		size_t byteSize = (mLast - it - 1) * sizeof(T);
+		if(byteSize)
+		{
+			CUdeviceptr tmp = 0, dst = it.dev();
+
+			PX_ASSERT(mManager);
+
+			tmp = mManager->alloc(physx::PxCudaBufferMemorySpace::T_GPU, byteSize,
+			                      PX_ALLOC_INFO("cloth::CuDeviceVector::T_GPU", CLOTH));
+			checkSuccess(cuMemcpyDtoD(tmp, dst + sizeof(T), byteSize));
+			checkSuccess(cuMemcpyDtoD(dst, tmp, byteSize));
+			mManager->free(physx::PxCudaBufferMemorySpace::T_GPU, tmp);
+		}
+		--mLast;
+	}
+
+	// grows the buffer to hold at least n elements, preserving contents
+	void reserve(size_t n)
+	{
+		if(n <= capacity())
+			return;
+
+		CUdeviceptr newFirst = 0, oldFirst = mFirst.dev();
+
+		PX_ASSERT(mManager);
+
+		newFirst = mManager->alloc(physx::PxCudaBufferMemorySpace::T_GPU, sizeof(T) * n,
+		                           PX_ALLOC_INFO("cloth::CuDeviceVector::T_GPU", CLOTH));
+		checkSuccess(cuMemcpyDtoD(newFirst, oldFirst, sizeof(T) * size()));
+		mManager->free(physx::PxCudaBufferMemorySpace::T_GPU, oldFirst);
+
+		iterator first(reinterpret_cast<T*>(newFirst));
+		mEnd = first + n;
+		mLast = first + size(); // size() still measured with the old mFirst here
+		mFirst = first;
+	}
+
+	// adjusts size; newly exposed elements are uninitialized device memory
+	void resize(size_t n)
+	{
+		if(capacity() < n)
+			reserve(PxMax(n, capacity() * 2));
+
+		mLast = mFirst + n;
+	}
+
+	// replaces contents with the host range [first, last)
+	void assign(const T* first, const T* last)
+	{
+		size_t n = last - first;
+		resize(n);
+		checkSuccess(cuMemcpyHtoD(mFirst.dev(), first, n * sizeof(T)));
+	}
+
+	// O(1) buffer exchange. NOTE(review): mManager is not swapped — fine only
+	// when both vectors share the same manager; confirm at call sites.
+	void swap(CuDeviceVector& other)
+	{
+		shdfnd::swap(mFirst, other.mFirst);
+		shdfnd::swap(mLast, other.mLast);
+		shdfnd::swap(mEnd, other.mEnd);
+	}
+
+	// match PxArray interface
+	void remove(size_t i)
+	{
+		erase(begin() + i);
+	}
+	void pushBack(const T& v)
+	{
+		push_back(v);
+	}
+
+	physx::PxCudaMemoryManager* getMemoryManager() const
+	{
+		return mManager;
+	}
+
+  private:
+	iterator mFirst, mLast, mEnd;
+	physx::PxCudaMemoryManager* mManager;
+};
+
+} // namespace cloth
+} // namespace physx
+
+#if PX_VC
+#pragma warning(pop)
+#endif
+
+namespace physx
+{
+namespace shdfnd
+{
+// Hook shdfnd::swap so generic code swaps CuDeviceVectors via the O(1)
+// member swap instead of element-wise copies.
+template <typename T>
+void swap(physx::cloth::CuDeviceVector<T>& first, physx::cloth::CuDeviceVector<T>& second)
+{
+	first.swap(second);
+}
+}
+}
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/windows/CuFabric.cpp b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuFabric.cpp
new file mode 100644
index 00000000..7f8326fe
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuFabric.cpp
@@ -0,0 +1,197 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#include "CuFabric.h"
+#include "CuContextLock.h"
+#include "CuFactory.h"
+
+#if PX_VC
+#pragma warning(disable : 4365) // 'action' : conversion from 'type_1' to 'type_2', signed/unsigned mismatch
+#endif
+
+using namespace physx;
+using namespace shdfnd;
+
+// anchor: particle index the tether is attached to; length: tether length
+// quantized to uint16_t (see mTetherLengthScale in CuFabric's constructor)
+cloth::CuTether::CuTether(uint16_t anchor, uint16_t length) : mAnchor(anchor), mLength(length)
+{
+}
+
+// Builds the device-side representation of a fabric.
+// - phases: per-phase set index; sets: cumulative constraint counts per set
+//   (input has no leading zero — one is prepended for the device copy);
+//   restvalues: one per constraint; indices: two particle indices per
+//   constraint.
+// - indices and triangles are narrowed to uint16_t; the asserts only check
+//   indices < numParticles, so this assumes numParticles fits in 16 bits —
+//   TODO confirm that is enforced upstream.
+// - tether lengths are quantized to uint16_t, with mTetherLengthScale
+//   converting back to world units.
+// The CuContextLock base acquires the CUDA context for the duration of the
+// uploads and is released explicitly at the end; finally the fabric
+// registers itself with the factory.
+cloth::CuFabric::CuFabric(CuFactory& factory, uint32_t numParticles, Range<const uint32_t> phases,
+                          Range<const uint32_t> sets, Range<const float> restvalues, Range<const uint32_t> indices,
+                          Range<const uint32_t> anchors, Range<const float> tetherLengths,
+                          Range<const uint32_t> triangles, uint32_t id)
+: CuContextLock(factory)
+, mFactory(factory)
+, mNumParticles(numParticles)
+, mPhases(mFactory.mContextManager, phases.begin(), phases.end())
+, mSets(mFactory.mContextManager)
+, mRestvalues(mFactory.mContextManager, restvalues.begin(), restvalues.end())
+, mIndices(mFactory.mContextManager)
+, mTethers(mFactory.mContextManager)
+, mTriangles(mFactory.mContextManager)
+, mId(id)
+{
+	// should no longer be prefixed with 0
+	PX_ASSERT(sets.front() != 0);
+
+	PX_ASSERT(sets.back() == restvalues.size());
+	PX_ASSERT(restvalues.size() * 2 == indices.size());
+	PX_ASSERT(mNumParticles > *maxElement(indices.begin(), indices.end()));
+
+	// copy to device, add leading zero
+	mSets.reserve(sets.size() + 1);
+	mSets.push_back(0);
+	mSets.push_back(sets.begin(), sets.end());
+
+	// manually convert uint32_t indices to uint16_t in temp memory
+	Vector<uint16_t>::Type hostIndices;
+	hostIndices.resizeUninitialized(indices.size());
+	Vector<uint16_t>::Type::Iterator dIt = hostIndices.begin();
+
+	const uint32_t* it = indices.begin();
+	const uint32_t* end = indices.end();
+	for(; it != end; ++it, ++dIt)
+		*dIt = uint16_t(*it);
+
+	// copy to device vector in one go
+	mIndices.assign(hostIndices.begin(), hostIndices.end());
+
+	// gather data per phase: constraint count plus device pointers into the
+	// restvalue and index buffers for that phase's set
+	mNumConstraintsInPhase.reserve(phases.size());
+	CuDevicePointer<const float> devRestvalues = mRestvalues.begin();
+	CuDevicePointer<const uint16_t> devIndices = mIndices.begin();
+	for(const uint32_t* pIt = phases.begin(); pIt != phases.end(); ++pIt)
+	{
+		uint32_t setIndex = *pIt;
+		uint32_t firstIndex = setIndex ? sets[setIndex - 1] : 0;
+		uint32_t lastIndex = sets[setIndex];
+		mNumConstraintsInPhase.pushBack(lastIndex - firstIndex);
+		mRestvaluesInPhase.pushBack(devRestvalues + firstIndex);
+		mIndicesInPhase.pushBack(devIndices + 2 * firstIndex);
+	}
+
+	// tethers: quantize lengths to uint16_t; scale maps the longest tether to
+	// USHRT_MAX, FLT_EPSILON guards the division when all lengths are zero
+	PX_ASSERT(anchors.size() == tetherLengths.size());
+	mTetherLengthScale =
+	    tetherLengths.empty() ? 1.0f : *maxElement(tetherLengths.begin(), tetherLengths.end()) / USHRT_MAX;
+	float inverseScale = 1 / (mTetherLengthScale + FLT_EPSILON);
+	Vector<CuTether>::Type tethers;
+	tethers.reserve(anchors.size());
+	for(; !anchors.empty(); anchors.popFront(), tetherLengths.popFront())
+	{
+		tethers.pushBack(CuTether(uint16_t(anchors.front()), uint16_t(tetherLengths.front() * inverseScale + 0.5f)));
+	}
+	mTethers.assign(tethers.begin(), tethers.end());
+
+	// triangles: narrow to uint16_t, reusing the temp buffer
+	hostIndices.resizeUninitialized(triangles.size());
+	dIt = hostIndices.begin();
+
+	it = triangles.begin();
+	end = triangles.end();
+	for(; it != end; ++it, ++dIt)
+		*dIt = uint16_t(*it);
+
+	mTriangles.assign(hostIndices.begin(), hostIndices.end());
+
+	CuContextLock::release();
+
+	// add to factory
+	mFactory.mFabrics.pushBack(this);
+}
+
+// Re-acquires the CUDA context (the constructor released it) so the member
+// CuDeviceVectors free device memory with the context current; the
+// CuContextLock base destructor releases it again afterwards. Unregisters
+// from the factory via swap-with-last removal.
+cloth::CuFabric::~CuFabric()
+{
+	CuContextLock::acquire();
+
+	Vector<CuFabric*>::Type::Iterator fIt = mFactory.mFabrics.find(this);
+
+	PX_ASSERT(fIt != mFactory.mFabrics.end());
+	mFactory.mFabrics.replaceWithLast(fIt);
+}
+
+// Trivial accessors implementing the Fabric interface.
+
+cloth::Factory& physx::cloth::CuFabric::getFactory() const
+{
+	return mFactory;
+}
+
+uint32_t cloth::CuFabric::getNumPhases() const
+{
+	return uint32_t(mPhases.size());
+}
+
+uint32_t cloth::CuFabric::getNumRestvalues() const
+{
+	return uint32_t(mRestvalues.size());
+}
+
+uint32_t cloth::CuFabric::getNumSets() const
+{
+	// minus one for the leading zero prepended in the constructor
+	return uint32_t(mSets.size() - 1);
+}
+
+uint32_t cloth::CuFabric::getNumIndices() const
+{
+	return uint32_t(mIndices.size());
+}
+
+uint32_t cloth::CuFabric::getNumParticles() const
+{
+	return mNumParticles;
+}
+
+uint32_t physx::cloth::CuFabric::getNumTethers() const
+{
+	return uint32_t(mTethers.size());
+}
+
+uint32_t physx::cloth::CuFabric::getNumTriangles() const
+{
+	// three indices per triangle
+	return uint32_t(mTriangles.size()) / 3;
+}
+
+// Multiplies every restvalue by 'scale'. Round-trips through host memory
+// (download, scale on CPU, re-upload) under a scoped CUDA context lock.
+void physx::cloth::CuFabric::scaleRestvalues(float scale)
+{
+	CuContextLock contextLock(mFactory);
+
+	Vector<float>::Type restvalues(uint32_t(mRestvalues.size()));
+	mFactory.copyToHost(mRestvalues.begin().get(), mRestvalues.end().get(), restvalues.begin());
+
+	Vector<float>::Type::Iterator rIt, rEnd = restvalues.end();
+	for(rIt = restvalues.begin(); rIt != rEnd; ++rIt)
+		*rIt *= scale;
+
+	mRestvalues = restvalues; // upload the scaled values back to the device
+}
+
+// Scales tether lengths by adjusting only the host-side conversion factor;
+// the quantized device-side lengths are untouched.
+void physx::cloth::CuFabric::scaleTetherLengths(float scale)
+{
+	// cloth instances won't pick this up until CuClothData is dirty!
+	mTetherLengthScale *= scale;
+}
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/windows/CuFabric.h b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuFabric.h
new file mode 100644
index 00000000..93f787f8
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuFabric.h
@@ -0,0 +1,102 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#include "Fabric.h"
+#include "Range.h"
+#include "Types.h"
+#include "Allocator.h"
+#include "CuContextLock.h"
+#include "CuDeviceVector.h"
+
+namespace physx
+{
+
+namespace cloth
+{
+
+// One tether constraint: anchor particle index plus the tether length
+// quantized to uint16_t (world length = mLength * CuFabric::mTetherLengthScale).
+struct CuTether
+{
+	CuTether(uint16_t, uint16_t);
+	uint16_t mAnchor;
+	uint16_t mLength;
+};
+
+// Device-side fabric (constraint topology shared between cloth instances).
+// Inherits CuContextLock privately so construction/destruction run with the
+// factory's CUDA context current.
+class CuFabric : public UserAllocated, private CuContextLock, public Fabric
+{
+	PX_NOCOPY(CuFabric)
+  public:
+	CuFabric(CuFactory& factory, uint32_t numParticles, Range<const uint32_t> phases, Range<const uint32_t> sets,
+	         Range<const float> restvalues, Range<const uint32_t> indices, Range<const uint32_t> anchors,
+	         Range<const float> tetherLengths, Range<const uint32_t> triangles, uint32_t id);
+
+	virtual ~CuFabric();
+
+	virtual Factory& getFactory() const;
+
+	virtual uint32_t getNumPhases() const;
+	virtual uint32_t getNumRestvalues() const;
+
+	virtual uint32_t getNumSets() const;
+	virtual uint32_t getNumIndices() const;
+
+	virtual uint32_t getNumParticles() const;
+
+	virtual uint32_t getNumTethers() const;
+
+	virtual uint32_t getNumTriangles() const;
+
+	virtual void scaleRestvalues(float);
+	virtual void scaleTetherLengths(float);
+
+  public:
+	CuFactory& mFactory;
+
+	uint32_t mNumParticles;
+
+	CuDeviceVector<uint32_t> mPhases; // index of set to use
+	CuDeviceVector<uint32_t> mSets;   // offset of first restvalue, with 0 prefix
+
+	CuDeviceVector<float> mRestvalues;  // one per constraint
+	CuDeviceVector<uint16_t> mIndices;  // two particle indices per constraint
+
+	CuDeviceVector<CuTether> mTethers;
+	float mTetherLengthScale; // converts quantized tether length to world units
+
+	CuDeviceVector<uint16_t> mTriangles; // three particle indices per triangle
+
+	// per-phase constraint count and device pointers into mRestvalues/mIndices
+	Vector<uint32_t>::Type mNumConstraintsInPhase;
+	Vector<CuDevicePointer<const float> >::Type mRestvaluesInPhase;
+	Vector<CuDevicePointer<const uint16_t> >::Type mIndicesInPhase;
+
+	uint32_t mId;
+};
+}
+}
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/windows/CuFactory.cpp b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuFactory.cpp
new file mode 100644
index 00000000..8847780e
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuFactory.cpp
@@ -0,0 +1,398 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#include "foundation/PxMemory.h"
+#include "CuFactory.h"
+#include "CuFabric.h"
+#include "CuCloth.h"
+#include "CuSolver.h"
+#include "ClothImpl.h"
+#include "CuCheckSuccess.h"
+#include "CuContextLock.h"
+#include "PsAllocator.h"
+#include "Array.h"
+#include "PsFoundation.h"
+#include <cuda.h>
+
+#if PX_VC
+#pragma warning(disable : 4061 4062) // enumerator 'identifier' in switch of enum 'enumeration' is not handled
+#endif
+
+using namespace physx;
+using namespace shdfnd;
+
+namespace physx
+{
+namespace cloth
+{
+// defined in Factory.cpp
+uint32_t getNextFabricId();
+
+typedef Vec4T<uint32_t> Vec4u;
+}
+}
+
+// Translates a CUresult into its enumerator name and reports it through the
+// foundation error callback. No-op when err == CUDA_SUCCESS.
+void cloth::checkSuccessImpl(CUresult err, const char* file, const int line)
+{
+ if(err != CUDA_SUCCESS)
+ {
+ const char* code = "Unknown";
+ switch(err)
+ {
+// ADD_CASE(X) expands to: case X: code = #X; break
+#define ADD_CASE(X) \
+ case X: \
+ code = #X; \
+ break
+ ADD_CASE(CUDA_ERROR_INVALID_VALUE);
+ ADD_CASE(CUDA_ERROR_OUT_OF_MEMORY);
+ ADD_CASE(CUDA_ERROR_NOT_INITIALIZED);
+ ADD_CASE(CUDA_ERROR_DEINITIALIZED);
+ ADD_CASE(CUDA_ERROR_NO_DEVICE);
+ ADD_CASE(CUDA_ERROR_INVALID_DEVICE);
+ ADD_CASE(CUDA_ERROR_INVALID_IMAGE);
+ ADD_CASE(CUDA_ERROR_INVALID_CONTEXT);
+ ADD_CASE(CUDA_ERROR_MAP_FAILED);
+ ADD_CASE(CUDA_ERROR_UNMAP_FAILED);
+ ADD_CASE(CUDA_ERROR_ARRAY_IS_MAPPED);
+ ADD_CASE(CUDA_ERROR_ALREADY_MAPPED);
+ ADD_CASE(CUDA_ERROR_NO_BINARY_FOR_GPU);
+ ADD_CASE(CUDA_ERROR_ALREADY_ACQUIRED);
+ ADD_CASE(CUDA_ERROR_NOT_MAPPED);
+ ADD_CASE(CUDA_ERROR_NOT_MAPPED_AS_ARRAY);
+ ADD_CASE(CUDA_ERROR_NOT_MAPPED_AS_POINTER);
+ ADD_CASE(CUDA_ERROR_ECC_UNCORRECTABLE);
+ ADD_CASE(CUDA_ERROR_UNSUPPORTED_LIMIT);
+ ADD_CASE(CUDA_ERROR_INVALID_SOURCE);
+ ADD_CASE(CUDA_ERROR_FILE_NOT_FOUND);
+ ADD_CASE(CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND);
+ ADD_CASE(CUDA_ERROR_SHARED_OBJECT_INIT_FAILED);
+ ADD_CASE(CUDA_ERROR_OPERATING_SYSTEM);
+ ADD_CASE(CUDA_ERROR_INVALID_HANDLE);
+ ADD_CASE(CUDA_ERROR_NOT_FOUND);
+ ADD_CASE(CUDA_ERROR_NOT_READY);
+ ADD_CASE(CUDA_ERROR_LAUNCH_FAILED);
+ ADD_CASE(CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES);
+ ADD_CASE(CUDA_ERROR_LAUNCH_TIMEOUT);
+ ADD_CASE(CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING);
+ // intentional: default precedes the UNKNOWN case label, so any
+ // unrecognized code is reported as CUDA_ERROR_UNKNOWN
+ default:
+ ADD_CASE(CUDA_ERROR_UNKNOWN);
+#undef ADD_CASE
+ }
+ shdfnd::getFoundation().error(PxErrorCode::eINTERNAL_ERROR, file, line, "CUDA error: %s", code);
+ }
+}
+
+namespace
+{
+// Maximum threads per block as specified by the launch bounds in
+// CuSolverKernel.cu, selected by the highest supported architecture.
+uint32_t getMaxThreadsPerBlock(const physx::PxCudaContextManager& contextManager)
+{
+	uint32_t maxThreads = 192; // fallback for architectures below SM 2.0
+	if(contextManager.supportsArchSM30())
+		maxThreads = 1024;
+	else if(contextManager.supportsArchSM20())
+		maxThreads = 512;
+	return maxThreads;
+}
+}
+
+// Both thread-count members start at the launch-bounds limit of the current
+// device; mMaxThreadsPerBlock is const and preserves that hardware limit.
+cloth::CuFactory::CuFactory(physx::PxCudaContextManager* contextManager)
+: Factory(CUDA)
+, mContextManager(contextManager)
+, mNumThreadsPerBlock(getMaxThreadsPerBlock(*contextManager))
+, mMaxThreadsPerBlock(mNumThreadsPerBlock)
+{
+}
+
+// Empty destructor; member cleanup is handled by the members' own destructors.
+cloth::CuFactory::~CuFactory()
+{
+}
+
+// Builds a device-side fabric from the supplied host topology data.
+cloth::Fabric* cloth::CuFactory::createFabric(uint32_t numParticles, Range<const uint32_t> phases,
+                                              Range<const uint32_t> sets, Range<const float> restvalues,
+                                              Range<const uint32_t> indices, Range<const uint32_t> anchors,
+                                              Range<const float> tetherLengths, Range<const uint32_t> triangles)
+{
+	// every fabric is tagged with a process-unique id from Factory.cpp
+	uint32_t fabricId = getNextFabricId();
+	CuFabric* fabric = new CuFabric(*this, numParticles, phases, sets, restvalues, indices, anchors, tetherLengths,
+	                                triangles, fabricId);
+	return fabric;
+}
+
+// Creates a CUDA cloth instance wrapped in the shared ClothImpl facade.
+cloth::Cloth* cloth::CuFactory::createCloth(Range<const PxVec4> particles, Fabric& fabric)
+{
+	CuClothImpl* impl = new CuClothImpl(*this, fabric, particles);
+	return impl;
+}
+
+// Creates a CUDA solver. Returns NULL if solver construction reported an
+// error (e.g. no usable CUDA context).
+cloth::Solver* cloth::CuFactory::createSolver(physx::PxTaskManager*)
+{
+	CuSolver* solver = new CuSolver(*this);
+
+	if(!solver->hasError())
+		return solver;
+
+	delete solver;
+	return NULL;
+}
+
+// CuFactory::clone() implemented in CuClothClone.cpp
+
+// Synchronous device-to-host copy; the byte count is derived from the
+// [srcIt, srcEnd) device iterator pair. Acquires the CUDA context.
+void cloth::CuFactory::copyToHost(const void* srcIt, const void* srcEnd, void* dstIt) const
+{
+	CuContextLock contextLock(*this);
+
+	size_t numBytes = size_t(intptr_t(srcEnd) - intptr_t(srcIt));
+	checkSuccess(cuMemcpyDtoH(dstIt, CUdeviceptr(srcIt), numBytes));
+}
+
+// Copies fabric construction data from device memory back into the caller's
+// host ranges. Empty ranges are skipped; non-empty ranges must match the
+// corresponding device buffer sizes (asserted). Triangle extraction is not
+// implemented yet.
+void cloth::CuFactory::extractFabricData(const Fabric& fabric, Range<uint32_t> phases, Range<uint32_t> sets,
+ Range<float> restvalues, Range<uint32_t> indices, Range<uint32_t> anchors,
+ Range<float> tetherLengths, Range<uint32_t> triangles) const
+{
+ CuContextLock contextLock(*this);
+
+ const CuFabric& cuFabric = static_cast<const CuFabric&>(fabric);
+
+ if(!phases.empty())
+ {
+ PX_ASSERT(phases.size() == cuFabric.mPhases.size());
+ const uint32_t* devicePhases = cuFabric.mPhases.begin().get();
+ copyToHost(devicePhases, devicePhases + cuFabric.mPhases.size(), phases.begin());
+ }
+
+ if(!restvalues.empty())
+ {
+ PX_ASSERT(restvalues.size() == cuFabric.mRestvalues.size());
+ const float* deviceRestvalues = cuFabric.mRestvalues.begin().get();
+ copyToHost(deviceRestvalues, deviceRestvalues + cuFabric.mRestvalues.size(), restvalues.begin());
+ }
+
+ if(!sets.empty())
+ {
+ // mSets is stored with a leading 0 prefix; skip it on extraction
+ PX_ASSERT(sets.size() == cuFabric.mSets.size() - 1);
+ const uint32_t* deviceSets = cuFabric.mSets.begin().get();
+ copyToHost(deviceSets + 1, deviceSets + cuFabric.mSets.size(), sets.begin());
+ }
+
+ if(!indices.empty())
+ {
+ PX_ASSERT(indices.size() == cuFabric.mIndices.size());
+ const uint16_t* deviceIndices = cuFabric.mIndices.begin().get();
+ // copy the 16-bit device indices into the front of the 32-bit output buffer
+ uint16_t* hostIndices = reinterpret_cast<uint16_t*>(indices.begin());
+ copyToHost(deviceIndices, deviceIndices + cuFabric.mIndices.size(), hostIndices);
+
+ // convert from 16bit to 32bit indices
+ // widen in place from the back so the reads never overlap the writes
+ for(uint32_t i = indices.size(); 0 < i--;)
+ indices[i] = hostIndices[i];
+ }
+
+ if(!anchors.empty() || !tetherLengths.empty())
+ {
+ // fetch the tether array once, then split it into anchors and scaled lengths
+ uint32_t numTethers = uint32_t(cuFabric.mTethers.size());
+ Vector<CuTether>::Type tethers(numTethers, CuTether(0, 0));
+ const CuTether* deviceTethers = cuFabric.mTethers.begin().get();
+ copyToHost(deviceTethers, deviceTethers + numTethers, tethers.begin());
+
+ PX_ASSERT(anchors.empty() || anchors.size() == tethers.size());
+ for(uint32_t i = 0; !anchors.empty(); ++i, anchors.popFront())
+ anchors.front() = tethers[i].mAnchor;
+
+ // lengths are stored quantized; scale back to float
+ PX_ASSERT(tetherLengths.empty() || tetherLengths.size() == tethers.size());
+ for(uint32_t i = 0; !tetherLengths.empty(); ++i, tetherLengths.popFront())
+ tetherLengths.front() = tethers[i].mLength * cuFabric.mTetherLengthScale;
+ }
+
+ if(!triangles.empty())
+ {
+ // todo triangles
+ }
+}
+
+// Copies the cloth's collision shapes into the caller's host ranges. All
+// collision containers live in pinned host memory, so plain memcpy suffices
+// (no CUDA context needed). Empty ranges are skipped.
+void cloth::CuFactory::extractCollisionData(const Cloth& cloth, Range<PxVec4> spheres, Range<uint32_t> capsules,
+ Range<PxVec4> planes, Range<uint32_t> convexes, Range<PxVec3> triangles) const
+{
+ PX_ASSERT(&cloth.getFactory() == this);
+
+ const CuCloth& cuCloth = static_cast<const CuClothImpl&>(cloth).mCloth;
+
+ PX_ASSERT(spheres.empty() || spheres.size() == cuCloth.mStartCollisionSpheres.size());
+ PX_ASSERT(capsules.empty() || capsules.size() == cuCloth.mCapsuleIndices.size() * 2);
+ PX_ASSERT(planes.empty() || planes.size() == cuCloth.mStartCollisionPlanes.size());
+ PX_ASSERT(convexes.empty() || convexes.size() == cuCloth.mConvexMasks.size());
+ PX_ASSERT(triangles.empty() || triangles.size() == cuCloth.mStartCollisionTriangles.size());
+
+ // collision spheres are in pinned memory, so memcpy directly
+ if(!cuCloth.mStartCollisionSpheres.empty() && !spheres.empty())
+ memcpy(spheres.begin(), &cuCloth.mStartCollisionSpheres.front(),
+ cuCloth.mStartCollisionSpheres.size() * sizeof(PxVec4));
+
+ // each capsule is an IndexPair (two uint32_t), hence the size() * 2 above
+ if(!cuCloth.mCapsuleIndices.empty() && !capsules.empty())
+ memcpy(capsules.begin(), &cuCloth.mCapsuleIndices.front(), cuCloth.mCapsuleIndices.size() * sizeof(IndexPair));
+
+ if(!cuCloth.mStartCollisionPlanes.empty() && !planes.empty())
+ memcpy(planes.begin(), &cuCloth.mStartCollisionPlanes.front(),
+ cuCloth.mStartCollisionPlanes.size() * sizeof(PxVec4));
+
+ if(!cuCloth.mConvexMasks.empty() && !convexes.empty())
+ memcpy(convexes.begin(), &cuCloth.mConvexMasks.front(), cuCloth.mConvexMasks.size() * sizeof(uint32_t));
+
+ if(!cuCloth.mStartCollisionTriangles.empty() && !triangles.empty())
+ memcpy(triangles.begin(), &cuCloth.mStartCollisionTriangles.front(),
+ cuCloth.mStartCollisionTriangles.size() * sizeof(PxVec3));
+}
+
+// Reads back the cloth's motion constraints. A non-empty pinned host mirror
+// is the authoritative copy and is used directly; otherwise the data is
+// fetched from device memory, preferring the target buffer over the start
+// buffer when the target is non-empty.
+void cloth::CuFactory::extractMotionConstraints(const Cloth& cloth, Range<PxVec4> destConstraints) const
+{
+ PX_ASSERT(&cloth.getFactory() == this);
+
+ const CuCloth& cuCloth = static_cast<const CuClothImpl&>(cloth).mCloth;
+
+ if(cuCloth.mMotionConstraints.mHostCopy.size())
+ {
+ PX_ASSERT(destConstraints.size() == cuCloth.mMotionConstraints.mHostCopy.size());
+
+ PxMemCopy(destConstraints.begin(), cuCloth.mMotionConstraints.mHostCopy.begin(),
+ sizeof(PxVec4) * cuCloth.mMotionConstraints.mHostCopy.size());
+ }
+ else
+ {
+ CuContextLock contextLock(*this);
+
+ CuDeviceVector<PxVec4> const& srcConstraints = !cuCloth.mMotionConstraints.mTarget.empty()
+ ? cuCloth.mMotionConstraints.mTarget
+ : cuCloth.mMotionConstraints.mStart;
+
+ PX_ASSERT(destConstraints.size() == srcConstraints.size());
+
+ copyToHost(srcConstraints.begin().get(), srcConstraints.end().get(), destConstraints.begin());
+ }
+}
+
+// Reads back the cloth's separation constraints. Mirrors the logic of
+// extractMotionConstraints: use the pinned host copy when present, otherwise
+// fetch from device memory (target buffer preferred over start buffer).
+void cloth::CuFactory::extractSeparationConstraints(const Cloth& cloth, Range<PxVec4> destConstraints) const
+{
+ PX_ASSERT(&cloth.getFactory() == this);
+
+ const CuCloth& cuCloth = static_cast<const CuClothImpl&>(cloth).mCloth;
+
+ if(cuCloth.mSeparationConstraints.mHostCopy.size())
+ {
+ PX_ASSERT(destConstraints.size() == cuCloth.mSeparationConstraints.mHostCopy.size());
+
+ PxMemCopy(destConstraints.begin(), cuCloth.mSeparationConstraints.mHostCopy.begin(),
+ sizeof(PxVec4) * cuCloth.mSeparationConstraints.mHostCopy.size());
+ }
+ else
+ {
+ CuContextLock contextLock(*this);
+
+ CuDeviceVector<PxVec4> const& srcConstraints = !cuCloth.mSeparationConstraints.mTarget.empty()
+ ? cuCloth.mSeparationConstraints.mTarget
+ : cuCloth.mSeparationConstraints.mStart;
+
+ PX_ASSERT(destConstraints.size() == srcConstraints.size());
+
+ copyToHost(srcConstraints.begin().get(), srcConstraints.end().get(), destConstraints.begin());
+ }
+}
+
+// Copies particle accelerations from the pinned host mirror; a cloth without
+// accelerations (empty mirror) yields no output.
+void cloth::CuFactory::extractParticleAccelerations(const Cloth& cloth, Range<PxVec4> destAccelerations) const
+{
+	PX_ASSERT(&cloth.getFactory() == this);
+
+	const CuCloth& cuCloth = static_cast<const CuClothImpl&>(cloth).mCloth;
+
+	uint32_t count = uint32_t(cuCloth.mParticleAccelerationsHostCopy.size());
+	if(!count)
+		return;
+
+	PX_ASSERT(destAccelerations.size() == count);
+	PxMemCopy(destAccelerations.begin(), cuCloth.mParticleAccelerationsHostCopy.begin(), sizeof(PxVec4) * count);
+}
+
+// Reads back virtual-particle weights and indices from device memory,
+// converting the compact device layout (PxVec4 weights, 16-bit Vec4us
+// indices) to the public format (PxVec3 weights, 32-bit index quads).
+void cloth::CuFactory::extractVirtualParticles(const Cloth& cloth, Range<uint32_t[4]> destIndices,
+ Range<PxVec3> destWeights) const
+{
+ PX_ASSERT(&cloth.getFactory() == this);
+
+ CuContextLock contextLock(*this);
+
+ const CuCloth& cuCloth = static_cast<const CuClothImpl&>(cloth).mCloth;
+
+ if(destWeights.size() > 0)
+ {
+ uint32_t numWeights = cloth.getNumVirtualParticleWeights();
+
+ // stage device weights in a host buffer before narrowing to PxVec3
+ Vector<PxVec4>::Type hostWeights(numWeights, PxVec4(0.0f));
+ copyToHost(cuCloth.mVirtualParticleWeights.begin().get(), cuCloth.mVirtualParticleWeights.end().get(),
+ &hostWeights.front());
+
+ // convert weights to Vec3f
+ PxVec3* destIt = reinterpret_cast<PxVec3*>(destWeights.begin());
+ Vector<PxVec4>::Type::ConstIterator srcIt = hostWeights.begin();
+ Vector<PxVec4>::Type::ConstIterator srcEnd = srcIt + numWeights;
+ for(; srcIt != srcEnd; ++srcIt, ++destIt)
+ *destIt = reinterpret_cast<const PxVec3&>(*srcIt); // drops the w component
+
+ PX_ASSERT(destIt <= destWeights.end());
+ }
+
+ if(destIndices.size() > 0)
+ {
+ uint32_t numIndices = cloth.getNumVirtualParticles();
+
+ // stage 16-bit device indices in a host buffer before widening
+ Vector<Vec4us>::Type hostIndices(numIndices);
+ copyToHost(cuCloth.mVirtualParticleIndices.begin().get(), cuCloth.mVirtualParticleIndices.end().get(),
+ &hostIndices.front());
+
+ // convert indices to 32 bit
+ Vec4u* destIt = reinterpret_cast<Vec4u*>(destIndices.begin());
+ Vector<Vec4us>::Type::ConstIterator srcIt = hostIndices.begin();
+ Vector<Vec4us>::Type::ConstIterator srcEnd = srcIt + numIndices;
+ for(; srcIt != srcEnd; ++srcIt, ++destIt)
+ *destIt = Vec4u(*srcIt);
+
+ PX_ASSERT(&array(*destIt) <= destIndices.end());
+ }
+}
+
+// Copies the self-collision particle indices out of device memory.
+// copyToHost acquires the CUDA context internally.
+void cloth::CuFactory::extractSelfCollisionIndices(const Cloth& cloth, Range<uint32_t> destIndices) const
+{
+	const CuCloth& cuCloth = static_cast<const CuClothImpl&>(cloth).mCloth;
+	PX_ASSERT(destIndices.size() == cuCloth.mSelfCollisionIndices.size());
+
+	const uint32_t* srcBegin = cuCloth.mSelfCollisionIndices.begin().get();
+	const uint32_t* srcEnd = cuCloth.mSelfCollisionIndices.end().get();
+	copyToHost(srcBegin, srcEnd, destIndices.begin());
+}
+
+// Copies the cloth's rest positions out of device memory.
+// copyToHost acquires the CUDA context internally.
+void cloth::CuFactory::extractRestPositions(const Cloth& cloth, Range<PxVec4> destRestPositions) const
+{
+	const CuCloth& cuCloth = static_cast<const CuClothImpl&>(cloth).mCloth;
+	PX_ASSERT(destRestPositions.size() == cuCloth.mRestPositions.size());
+
+	const PxVec4* srcBegin = cuCloth.mRestPositions.begin().get();
+	const PxVec4* srcEnd = cuCloth.mRestPositions.end().get();
+	copyToHost(srcBegin, srcEnd, destRestPositions.begin());
+}
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/windows/CuFactory.h b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuFactory.h
new file mode 100644
index 00000000..e868034f
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuFactory.h
@@ -0,0 +1,107 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#include "Factory.h"
+#include "Allocator.h"
+
+namespace physx
+{
+class PxCudaContextManager;
+}
+
+namespace physx
+{
+
+namespace cloth
+{
+
+class CuFabric;
+class CuCloth;
+template <typename>
+class ClothImpl;
+
+// CUDA implementation of the cloth Factory interface: creates device-backed
+// fabrics, cloths and solvers, and provides device-to-host extraction of
+// their data.
+class CuFactory : public UserAllocated, public Factory
+{
+ protected:
+ CuFactory& operator=(const CuFactory&);
+
+ public:
+ typedef CuFabric FabricType;
+ typedef ClothImpl<CuCloth> ImplType;
+
+ CuFactory(physx::PxCudaContextManager*);
+ virtual ~CuFactory();
+
+ virtual Fabric* createFabric(uint32_t numParticles, Range<const uint32_t> phases, Range<const uint32_t> sets,
+ Range<const float> restvalues, Range<const uint32_t> indices,
+ Range<const uint32_t> anchors, Range<const float> tetherLengths,
+ Range<const uint32_t> triangles);
+
+ virtual Cloth* createCloth(Range<const PxVec4> particles, Fabric& fabric);
+
+ // may return NULL if solver construction fails
+ virtual Solver* createSolver(physx::PxTaskManager* taskMgr);
+
+ // implemented in CuClothClone.cpp
+ virtual Cloth* clone(const Cloth& cloth);
+
+ virtual void extractFabricData(const Fabric& fabric, Range<uint32_t> phases, Range<uint32_t> sets,
+ Range<float> restvalues, Range<uint32_t> indices, Range<uint32_t> anchors,
+ Range<float> tetherLengths, Range<uint32_t> triangles) const;
+
+ virtual void extractCollisionData(const Cloth& cloth, Range<PxVec4> spheres, Range<uint32_t> capsules,
+ Range<PxVec4> planes, Range<uint32_t> convexes, Range<PxVec3> triangles) const;
+
+ virtual void extractMotionConstraints(const Cloth& cloth, Range<PxVec4> destConstraints) const;
+
+ virtual void extractSeparationConstraints(const Cloth& cloth, Range<PxVec4> destConstraints) const;
+
+ virtual void extractParticleAccelerations(const Cloth& cloth, Range<PxVec4> destAccelerations) const;
+
+ virtual void extractVirtualParticles(const Cloth& cloth, Range<uint32_t[4]> destIndices,
+ Range<PxVec3> destWeights) const;
+
+ virtual void extractSelfCollisionIndices(const Cloth& cloth, Range<uint32_t> destIndices) const;
+
+ virtual void extractRestPositions(const Cloth& cloth, Range<PxVec4> destRestPositions) const;
+
+ public:
+ // synchronous device-to-host copy of the byte range [srcIt, srcEnd)
+ void copyToHost(const void* srcIt, const void* srcEnd, void* dstIt) const;
+
+ public:
+ // fabrics created by this factory — presumably registered/unregistered by
+ // CuFabric's ctor/dtor; confirm in CuFabric.cpp
+ Vector<CuFabric*>::Type mFabrics;
+
+ physx::PxCudaContextManager* mContextManager;
+
+ // block size used for kernel launches; initialized to the hardware limit
+ uint32_t mNumThreadsPerBlock;
+
+ // launch-bounds limit of the current device (see CuSolverKernel.cu)
+ const uint32_t mMaxThreadsPerBlock;
+};
+}
+}
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/windows/CuPhaseConfig.h b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuPhaseConfig.h
new file mode 100644
index 00000000..74470bde
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuPhaseConfig.h
@@ -0,0 +1,51 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#include "Types.h"
+
+namespace physx
+{
+namespace cloth
+{
+
+// Per-phase solver parameters mirrored to the GPU kernel.
+struct CuPhaseConfig
+{
+	float mStiffness;
+	float mStiffnessMultiplier;
+	float mCompressionLimit;
+	float mStretchLimit;
+
+	uint32_t mNumConstraints;
+	// presumably device pointers into the fabric's per-phase restvalue/index
+	// arrays (see CuFabric::mRestvaluesInPhase/mIndicesInPhase) — confirm
+	const float* mRestvalues;
+	const uint16_t* mIndices;
+};
+}
+}
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/windows/CuPinnedAllocator.h b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuPinnedAllocator.h
new file mode 100644
index 00000000..57dd6731
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuPinnedAllocator.h
@@ -0,0 +1,132 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#include "cudamanager/PxCudaContextManager.h"
+#include "cudamanager/PxCudaMemoryManager.h"
+#include "Allocator.h"
+#include "CuCheckSuccess.h"
+#include <cuda.h>
+
+namespace physx
+{
+
+namespace cloth
+{
+
+// Host-memory allocator backed by PxCudaMemoryManager. The cudaHostAlloc*
+// flags select the memory space: write-combined, pinned+mapped, or plain
+// host memory. For mapped allocations the device-side address of the most
+// recent allocation is cached in mDevicePtr.
+struct CuHostAllocator
+{
+ CuHostAllocator(physx::PxCudaContextManager* ctx = NULL, unsigned int flags = cudaHostAllocDefault)
+ : mDevicePtr(0), mFlags(flags), mManager(0)
+ {
+ PX_ASSERT(ctx);
+
+ if(ctx)
+ mManager = ctx->getMemoryManager();
+ }
+
+ void* allocate(size_t n, const char*, int)
+ {
+ physx::PxCudaBufferPtr bufferPtr;
+
+ PX_ASSERT(mManager);
+
+ // pick the memory space matching the allocation flags
+ if(mFlags & cudaHostAllocWriteCombined)
+ bufferPtr = mManager->alloc(physx::PxCudaBufferMemorySpace::T_WRITE_COMBINED, n,
+ PX_ALLOC_INFO("cloth::CuHostAllocator::T_WRITE_COMBINED", CLOTH));
+ else if(mFlags & cudaHostAllocMapped)
+ bufferPtr = mManager->alloc(physx::PxCudaBufferMemorySpace::T_PINNED_HOST, n,
+ PX_ALLOC_INFO("cloth::CuHostAllocator::T_PINNED_HOST", CLOTH));
+ else
+ bufferPtr = mManager->alloc(physx::PxCudaBufferMemorySpace::T_HOST, n,
+ PX_ALLOC_INFO("cloth::CuHostAllocator::T_HOST", CLOTH));
+
+ // NOTE(review): bufferPtr is not checked for allocation failure here —
+ // confirm the memory manager's OOM policy before relying on this
+ // for mapped memory, also cache the device-side address of this allocation
+ if(mFlags & cudaHostAllocMapped)
+ checkSuccess(cuMemHostGetDevicePointer(&mDevicePtr, reinterpret_cast<void*>(bufferPtr), 0));
+
+ return reinterpret_cast<void*>(bufferPtr);
+ }
+
+ void deallocate(void* p)
+ {
+ PX_ASSERT(mManager);
+
+ // free from the same memory space the flags selected at allocation time
+ if(mFlags & cudaHostAllocWriteCombined)
+ mManager->free(physx::PxCudaBufferMemorySpace::T_WRITE_COMBINED, physx::PxCudaBufferPtr(p));
+ else if(mFlags & cudaHostAllocMapped)
+ mManager->free(physx::PxCudaBufferMemorySpace::T_PINNED_HOST, physx::PxCudaBufferPtr(p));
+ else
+ mManager->free(physx::PxCudaBufferMemorySpace::T_HOST, physx::PxCudaBufferPtr(p));
+
+ // don't reset mDevicePtr because Array::recreate deallocates last
+ }
+
+ CUdeviceptr mDevicePtr; // device pointer of last allocation
+ unsigned int mFlags; // cudaHostAlloc* flags selecting the memory space
+ physx::PxCudaMemoryManager* mManager;
+};
+
+// Convenience factory for an allocator of pinned, mapped, write-combined
+// host memory. The template parameter is unused by the implementation; it
+// only lets call sites name the intended element type.
+template <typename T>
+CuHostAllocator getMappedAllocator(physx::PxCudaContextManager* ctx)
+{
+	return CuHostAllocator(ctx, cudaHostAllocMapped | cudaHostAllocWriteCombined);
+}
+
+// Alias helper for an array of T held in CUDA pinned host memory.
+template <typename T>
+struct CuPinnedVector
+{
+	// note: always use shdfnd::swap() instead of Array::swap()
+	// in order to keep cached device pointer consistent
+	typedef shdfnd::Array<T, typename physx::cloth::CuHostAllocator> Type;
+};
+
+// Returns the device-side address of the pinned vector's buffer. The
+// allocator caches the device pointer of its last allocation, which is
+// only meaningful while the vector is non-empty.
+template <typename T>
+T* getDevicePointer(shdfnd::Array<T, typename physx::cloth::CuHostAllocator>& vector)
+{
+	if(vector.empty())
+		return 0;
+
+	return reinterpret_cast<T*>(vector.getAllocator().mDevicePtr);
+}
+
+} // namespace cloth
+
+} // namespace physx
+
+namespace physx
+{
+namespace shdfnd
+{
+// Overload of shdfnd::swap for pinned arrays: swaps the allocators as well,
+// so each array's cached device pointer stays matched with the buffer it
+// receives in the exchange (plain Array::swap would leave them stale).
+template <typename T>
+void swap(Array<T, typename physx::cloth::CuHostAllocator>& left, Array<T, typename physx::cloth::CuHostAllocator>& right)
+{
+	swap(left.getAllocator(), right.getAllocator());
+	left.swap(right);
+}
+}
+}
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/windows/CuSelfCollision.h b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuSelfCollision.h
new file mode 100644
index 00000000..fb0fd7af
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuSelfCollision.h
@@ -0,0 +1,472 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#ifndef CU_SOLVER_KERNEL_CU
+#error include CuSelfCollision.h only from CuSolverKernel.cu
+#endif
+
+#ifndef UINT16_MAX
+#define UINT16_MAX 0xffff
+#endif
+
+namespace
+{
#if __CUDA_ARCH__ >= 300
// Inclusive prefix sum across one warp using shfl.up (Kepler+): each lane adds
// the value from the lane 1, 2, 4, 8, 16 positions below it; the predicate p
// masks lanes for which the source lane is out of range. The stride template
// parameter is unused here because shfl operates on registers, not memory.
template <int>
__device__ void scanWarp(Pointer<Shared, int32_t> counts)
{
	asm volatile("{"
	             " .reg .s32 tmp;"
	             " .reg .pred p;"
	             " shfl.up.b32 tmp|p, %0, 0x01, 0x0;"
	             "@p add.s32 %0, tmp, %0;"
	             " shfl.up.b32 tmp|p, %0, 0x02, 0x0;"
	             "@p add.s32 %0, tmp, %0;"
	             " shfl.up.b32 tmp|p, %0, 0x04, 0x0;"
	             "@p add.s32 %0, tmp, %0;"
	             " shfl.up.b32 tmp|p, %0, 0x08, 0x0;"
	             "@p add.s32 %0, tmp, %0;"
	             " shfl.up.b32 tmp|p, %0, 0x10, 0x0;"
	             "@p add.s32 %0, tmp, %0;"
	             "}"
	             : "+r"(*generic(counts))
	             :);
}
#else
// Pre-Kepler fallback: inclusive warp prefix sum in shared memory, with
// [stride] selecting the spacing between consecutive lanes' elements.
// Relies on the warp-synchronous idiom: the volatile pointer forces each
// partial sum to memory so lanes see each other's writes without a barrier.
template <int stride>
__device__ void scanWarp(Pointer<Shared, int32_t> counts)
{
	volatile int32_t* ptr = generic(counts);
	const int32_t laneIdx = threadIdx.x & warpSize - 1;
	if(laneIdx >= 1)
		*ptr += ptr[-stride];
	if(laneIdx >= 2)
		*ptr += ptr[-2 * stride];
	if(laneIdx >= 4)
		*ptr += ptr[-4 * stride];
	if(laneIdx >= 8)
		*ptr += ptr[-8 * stride];
	if(laneIdx >= 16)
		*ptr += ptr[-16 * stride];
}
#endif
+
// sorts array by upper 16bits
// [keys] must be at least 2*n in length, in/out in first n elements
// [histogram] must be at least 34*16 = 544 in length
//
// Four radix passes of 4 bits each over bits 16..31, ping-ponging between the
// two halves of [keys]; after four passes the result is back in the first n
// elements. Per-warp bucket histograms are built with warp ballots instead of
// atomics: each lane counts the keys of one 4-bit bucket (threadIdx.x & 15).
__device__ void radixSort(int32_t* keys, int32_t n, Pointer<Shared, int32_t> histogram)
{
	const int32_t numWarps = blockDim.x >> 5;
	const int32_t warpIdx = threadIdx.x >> 5;
	const int32_t laneIdx = threadIdx.x & warpSize - 1;

	// bit mask of all lanes below this one, for intra-warp exclusive ranks
	const uint32_t laneMask = (1u << laneIdx) - 1;
	// maskX is all-ones iff the corresponding digit bit of this lane's bucket is 0
	const uint32_t mask1 = (threadIdx.x & 1) - 1;
	const uint32_t mask2 = !!(threadIdx.x & 2) - 1;
	const uint32_t mask4 = !!(threadIdx.x & 4) - 1;
	const uint32_t mask8 = !!(threadIdx.x & 8) - 1;

	// each warp owns a contiguous range of tn*warpSize keys
	const int32_t tn = (n + blockDim.x - 1) / blockDim.x;
	const int32_t startIndex = tn * (threadIdx.x - laneIdx) + laneIdx;
	const int32_t endIndex = min(startIndex + tn * warpSize, n + 31 & ~31); // full warps for ballot

	int32_t* srcKeys = keys;
	int32_t* dstKeys = keys + n;

	// histogram layout: [0..15] scratch zeroed each pass, then 16 buckets per
	// warp, then (after the stride-16 scan) bucket totals at 16*numWarps
	Pointer<Shared, int32_t> hIt = histogram + 16 * warpIdx;
	Pointer<Shared, int32_t> pIt = histogram + 16 * laneIdx + 16;
	Pointer<Shared, int32_t> tIt = histogram + 16 * numWarps + laneIdx;

	for(int32_t p = 16; p < 32; p += 4) // radix passes (4 bits each)
	{
		// gather bucket histograms per warp; keys past n count as digit 15
		int32_t warpCount = 0;
		for(int32_t i = startIndex; i < endIndex; i += 32)
		{
			int32_t key = i < n ? srcKeys[i] >> p : 15;
			uint32_t ballot1 = __ballot(key & 1);
			uint32_t ballot2 = __ballot(key & 2);
			uint32_t ballot4 = __ballot(key & 4);
			uint32_t ballot8 = __ballot(key & 8);
			// popc of lanes whose digit equals this lane's bucket
			warpCount += __popc((mask1 ^ ballot1) & (mask2 ^ ballot2) & (mask4 ^ ballot4) & (mask8 ^ ballot8));
		}

		// buckets 0..15 are counted twice (lanes 0-15 and 16-31); the upper
		// half-warp publishes them into this warp's histogram row
		if(laneIdx >= 16)
			hIt[laneIdx] = warpCount;

		__syncthreads();

		// prefix sum of histogram buckets
		for(int32_t i = warpIdx; i < 16; i += numWarps)
			scanWarp<16>(pIt + i);

		__syncthreads();

		// prefix sum of bucket totals (exclusive)
		if(threadIdx.x < 16)
		{
			// precedence note: (!threadIdx.x - 1) is 0 for thread 0 and ~0
			// otherwise, so thread 0 starts from 0, making the scan exclusive
			*tIt = tIt[-1] & !threadIdx.x - 1;
			scanWarp<1>(tIt);
			hIt[threadIdx.x] = 0;
		}

		__syncthreads();

		// add global bucket base offsets to the per-warp running positions
		if(laneIdx < 16)
			hIt[laneIdx] += *tIt;

		// split indices: scatter each key to its rank within its bucket
		for(int32_t i = startIndex; i < endIndex; i += 32)
		{
			int32_t key = i < n ? srcKeys[i] >> p : 15;
			uint32_t ballot1 = __ballot(key & 1);
			uint32_t ballot2 = __ballot(key & 2);
			uint32_t ballot4 = __ballot(key & 4);
			uint32_t ballot8 = __ballot(key & 8);
			// bits has a 1 for every lane holding the same digit as this lane
			uint32_t bits = ((key & 1) - 1 ^ ballot1) & (!!(key & 2) - 1 ^ ballot2) & (!!(key & 4) - 1 ^ ballot4) &
			                (!!(key & 8) - 1 ^ ballot8);
			int32_t index = hIt[key & 15] + __popc(bits & laneMask);

			if(i < n)
				dstKeys[index] = srcKeys[i];

			// advance this warp's write cursor for each bucket
			if(laneIdx < 16)
				hIt[laneIdx] += __popc((mask1 ^ ballot1) & (mask2 ^ ballot2) & (mask4 ^ ballot4) & (mask8 ^ ballot8));
		}

		__syncthreads();

		::swap(srcKeys, dstKeys);
	}

#ifndef NDEBUG
	// verify the result is sorted by the upper 16 bits
	for(int32_t i = threadIdx.x; i < n; i += blockDim.x)
		assert(!i || keys[i - 1] >> 16 <= keys[i] >> 16);
#endif
}
+}
+
namespace
{
// Per-block particle self-collision using a 2D cell grid over the two longest
// bounding-box axes (the shortest axis is ignored for hashing).
struct CuSelfCollision
{
	// entry point: sets up the grid transform, then builds the acceleration
	// structure and resolves particle-particle collisions
	template <typename CurrentT>
	__device__ void operator()(CurrentT& current);

  private:
	// compute cell keys, sort (key,index) pairs, fill the cell start table
	template <typename CurrentT>
	__device__ void buildAcceleration(const CurrentT& current);
	// resolve collisions between particles in neighboring cells
	template <bool useRestPositions, typename CurrentT>
	__device__ void collideParticles(CurrentT& current) const;

  public:
	// grid transform per axis: cellCoord = pos * mPosScale + mPosBias
	float mPosBias[3];
	float mPosScale[3];
	// per-axis particle coordinate arrays, already axis-rotated (see operator())
	const float* mPosPtr[3];
};
}
+
+__shared__ uninitialized<CuSelfCollision> gSelfCollideParticles;
+
// Self-collision entry point: skipped unless both collision distance and
// stiffness are positive. Threads 0..2 cooperatively derive the grid
// transform (one axis each), then all threads build the grid and collide.
template <typename CurrentT>
__device__ void CuSelfCollision::operator()(CurrentT& current)
{
	// min(d, s) <= 0 iff either the distance or the stiffness is not positive
	if(min(gClothData.mSelfCollisionDistance, gFrameData.mSelfCollisionStiffness) <= 0.0f)
		return;

	if(threadIdx.x < 3)
	{
		// bounds are stored as (upper, -lower) pairs,
		// so upper + negativeLower is the box edge length
		float upper = gFrameData.mParticleBounds[threadIdx.x * 2];
		float negativeLower = gFrameData.mParticleBounds[threadIdx.x * 2 + 1];

		// expand bounds
		float eps = (upper + negativeLower) * 1e-4f;
		float expandedUpper = upper + eps;
		float expandedNegativeLower = negativeLower + eps;
		float expandedEdgeLength = expandedUpper + expandedNegativeLower;

		float* edgeLength = mPosBias; // use as temp
		edgeLength[threadIdx.x] = expandedEdgeLength;

		// make the temp edge lengths visible to the other two threads
		__threadfence_block();

		// calculate shortest axis
		int32_t shortestAxis = edgeLength[0] > edgeLength[1];
		if(edgeLength[shortestAxis] > edgeLength[2])
			shortestAxis = 2;

		// rotate axes so the shortest one lands in slot 0; only slots 1 and 2
		// (row/column) are used for the grid hash
		uint32_t writeAxis = threadIdx.x - shortestAxis;
		writeAxis += writeAxis >> 30; // wrap a negative difference back into 0..2

		// cell size: 127 cells must span the box, and a cell must be at least
		// one collision distance wide
		float maxInvCellSize = __fdividef(127.0f, expandedEdgeLength);
		float invCollisionDistance = __fdividef(1.0f, gClothData.mSelfCollisionDistance);
		float invCellSize = min(maxInvCellSize, invCollisionDistance);

		mPosScale[writeAxis] = invCellSize;
		mPosBias[writeAxis] = invCellSize * expandedNegativeLower;
		mPosPtr[writeAxis] = generic(current[threadIdx.x]);
	}

	__syncthreads();

	buildAcceleration(current);

	if(gFrameData.mRestPositions)
		collideParticles<true>(current);
	else
		collideParticles<false>(current);
}
+
// Build the grid acceleration structure: compute a 16-bit cell key for every
// (indexed) particle, pack (key, particle index) into one int32, radix-sort by
// key, and record where each cell's run of particles starts in cellStart.
template <typename CurrentT>
__device__ void CuSelfCollision::buildAcceleration(const CurrentT& current)
{
	int32_t numIndices = gClothData.mNumSelfCollisionIndices;
	const int32_t* indices = reinterpret_cast<const int32_t*>(gClothData.mSelfCollisionIndices);
	int32_t* sortedKeys = reinterpret_cast<int32_t*>(gClothData.mSelfCollisionKeys);
	int16_t* cellStart = reinterpret_cast<int16_t*>(gClothData.mSelfCollisionCellStart);

	typedef typename CurrentT::ConstPointerType ConstPointerType;
	ConstPointerType rowPtr = ConstPointerType(mPosPtr[1]);
	ConstPointerType colPtr = ConstPointerType(mPosPtr[2]);

	float rowScale = mPosScale[1], rowBias = mPosBias[1];
	float colScale = mPosScale[2], colBias = mPosBias[2];

	// calculate keys: 7-bit row index in the low bits, 7-bit column index above
	for(int32_t i = threadIdx.x; i < numIndices; i += blockDim.x)
	{
		// when no index subset is given, every particle participates
		int32_t index = indices ? indices[i] : i;
		assert(index < gClothData.mNumParticles);

		int32_t rowIndex = int32_t(max(0.0f, min(rowPtr[index] * rowScale + rowBias, 127.5f)));
		int32_t colIndex = int32_t(max(0.0f, min(colPtr[index] * colScale + colBias, 127.5f)));
		assert(rowIndex >= 0 && rowIndex < 128 && colIndex >= 0 && colIndex < 128);

		int32_t key = (colIndex << 7 | rowIndex) + 129; // + row and column sentinel
		assert(key <= 0x4080);

		sortedKeys[i] = key << 16 | index; // (key, index) pair in a single int32_t
	}
	__syncthreads();

	// get scratch shared mem buffer used for radix sort(histogram)
	Pointer<Shared, int32_t> buffer =
	    reinterpret_cast<Pointer<Shared, int32_t> const&>(gCollideParticles.get().mCurData.mSphereX);

	// sort keys (__synchthreads inside radix sort)
	radixSort(sortedKeys, numIndices, buffer);

	// mark cell start if keys are different between neighboring threads;
	// cellStart[key] marks the start of a run, cellStart[prevKey+1] its end
	for(int32_t i = threadIdx.x; i < numIndices; i += blockDim.x)
	{
		int32_t key = sortedKeys[i] >> 16;
		int32_t prevKey = i ? sortedKeys[i - 1] >> 16 : key - 1;
		if(key != prevKey)
		{
			cellStart[key] = i;
			cellStart[prevKey + 1] = i;
		}
	}
	__syncthreads();
}
+
// Resolve collisions between nearby particles using the cell grid built in
// buildAcceleration(). When useRestPositions is set, pairs that are already
// close in the rest configuration are excluded so that neighboring cloth
// vertices do not repel each other.
template <bool useRestPositions, typename CurrentT>
__device__ void CuSelfCollision::collideParticles(CurrentT& current) const
{
	const int32_t* sortedKeys = reinterpret_cast<const int32_t*>(gClothData.mSelfCollisionKeys);
	float* sortedParticles = gClothData.mSelfCollisionParticles;
	int16_t* cellStart = reinterpret_cast<int16_t*>(gClothData.mSelfCollisionCellStart);

	const float cdist = gClothData.mSelfCollisionDistance;
	const float cdistSq = cdist * cdist;

	const int32_t numIndices = gClothData.mNumSelfCollisionIndices;
	const int32_t numParticles = gClothData.mNumParticles;

	// point to particle copied in device memory that is being updated
	float* xPtr = sortedParticles;
	float* yPtr = sortedParticles + numParticles;
	float* zPtr = sortedParticles + 2 * numParticles;
	float* wPtr = sortedParticles + 3 * numParticles;

	// copy current particles to temporary array
	for(int32_t i = threadIdx.x; i < numParticles; i += blockDim.x)
	{
		xPtr[i] = current(i, 0);
		yPtr[i] = current(i, 1);
		zPtr[i] = current(i, 2);
		wPtr[i] = current(i, 3);
	}
	__syncthreads();

	// copy only sorted (indexed) particles to shared mem, so current(j, ...)
	// below reads particles in cell-sorted order
	for(int32_t i = threadIdx.x; i < numIndices; i += blockDim.x)
	{
		// low 16 bits of a sorted key hold the original particle index
		int32_t index = sortedKeys[i] & UINT16_MAX;
		current(i, 0) = xPtr[index];
		current(i, 1) = yPtr[index];
		current(i, 2) = zPtr[index];
		current(i, 3) = wPtr[index];
	}
	__syncthreads();

	typedef typename CurrentT::ConstPointerType ConstPointerType;
	ConstPointerType rowPtr = ConstPointerType(mPosPtr[1]);
	ConstPointerType colPtr = ConstPointerType(mPosPtr[2]);

	float rowScale = mPosScale[1], rowBias = mPosBias[1];
	float colScale = mPosScale[2], colBias = mPosBias[2];

	for(int32_t i = threadIdx.x; i < numIndices; i += blockDim.x)
	{
		const int32_t index = sortedKeys[i] & UINT16_MAX;
		assert(index < gClothData.mNumParticles);

		float restX, restY, restZ;
		if(useRestPositions)
		{
			const float* restIt = gFrameData.mRestPositions + index * 4;
			restX = restIt[0];
			restY = restIt[1];
			restZ = restIt[2];
		}

		float posX = current(i, 0);
		float posY = current(i, 1);
		float posZ = current(i, 2);
		float posW = current(i, 3);

		// accumulated correction; deltaW seeded to avoid division by zero below
		float deltaX = 0.0f;
		float deltaY = 0.0f;
		float deltaZ = 0.0f;
		float deltaW = FLT_EPSILON;

		// get cell index for this particle
		int32_t rowIndex = int32_t(max(0.0f, min(rowPtr[i] * rowScale + rowBias, 127.5f)));
		int32_t colIndex = int32_t(max(0.0f, min(colPtr[i] * colScale + colBias, 127.5f)));
		assert(rowIndex >= 0 && rowIndex < 128 && colIndex >= 0 && colIndex < 128);

		int32_t key = colIndex << 7 | rowIndex;
		assert(key <= 0x4080);

		// check cells in 3 columns (key, key+128, key+256)
		for(int32_t keyEnd = key + 256; key <= keyEnd; key += 128)
		{
			// cellStart keys of unoccupied cells have a value of -1
			uint32_t startIndex; // min<unsigned>(cellStart[key+0..2])
			uint32_t endIndex;   // max<signed>(0, cellStart[key+1..3])

			// fused min/max over four adjacent cellStart entries (3 rows of
			// this column plus the end marker), done in PTX to control the
			// signed/unsigned treatment of the -1 sentinels
			asm volatile("{\n\t"
			             " .reg .u32 start1, start2;\n\t"
			             " ld.global.s16 %1, [%2+6];\n\t"
			             " ld.global.s16 %0, [%2+0];\n\t"
			             " ld.global.s16 start1, [%2+2];\n\t"
			             " ld.global.s16 start2, [%2+4];\n\t"
			             " max.s32 %1, %1, 0;\n\t"
			             " min.u32 %0, %0, start1;\n\t"
			             " max.s32 %1, %1, start1;\n\t"
			             " min.u32 %0, %0, start2;\n\t"
			             " max.s32 %1, %1, start2;\n\t"
			             "}\n\t"
			             : "=r"(startIndex), "=r"(endIndex)
			             : POINTER_CONSTRAINT(cellStart + key));

			// comparison must be unsigned to skip cells with negative startIndex
			for(uint32_t j = startIndex; j < endIndex; ++j)
			{
				if(j != i) // avoid same particle
				{
					float dx = posX - current(j, 0);
					float dy = posY - current(j, 1);
					float dz = posZ - current(j, 2);

					float distSqr = dx * dx + dy * dy + dz * dz;
					if(distSqr > cdistSq)
						continue;

					if(useRestPositions)
					{
						const int32_t jndex = sortedKeys[j] & UINT16_MAX;
						assert(jndex < gClothData.mNumParticles);

						// calculate distance in rest configuration
						const float* restJt = gFrameData.mRestPositions + jndex * 4;
						float rx = restX - restJt[0];
						float ry = restY - restJt[1];
						float rz = restZ - restJt[2];

						// skip pairs that are close at rest (mesh neighbors)
						if(rx * rx + ry * ry + rz * rz <= cdistSq)
							continue;
					}

					// premultiply ratio for weighted average
					float ratio = fmaxf(0.0f, cdist * rsqrtf(FLT_EPSILON + distSqr) - 1.0f);
					float scale = __fdividef(ratio * ratio, FLT_EPSILON + posW + current(j, 3));

					deltaX += scale * dx;
					deltaY += scale * dy;
					deltaZ += scale * dz;
					deltaW += ratio;
				}
			}
		}

		const float stiffness = gFrameData.mSelfCollisionStiffness * posW;
		float scale = __fdividef(stiffness, deltaW);

		// apply collision impulse
		xPtr[index] += deltaX * scale;
		yPtr[index] += deltaY * scale;
		zPtr[index] += deltaZ * scale;

		assert(!isnan(xPtr[index] + yPtr[index] + zPtr[index]));
	}
	__syncthreads();

	// copy temporary particle array back to shared mem
	// (need to copy whole array)
	for(int32_t i = threadIdx.x; i < numParticles; i += blockDim.x)
	{
		current(i, 0) = xPtr[i];
		current(i, 1) = yPtr[i];
		current(i, 2) = zPtr[i];
		current(i, 3) = wPtr[i];
	}

	// unmark occupied cells to empty again (faster than clearing all the cells)
	for(int32_t i = threadIdx.x; i < numIndices; i += blockDim.x)
	{
		int32_t key = sortedKeys[i] >> 16;
		cellStart[key] = 0xffff;
		cellStart[key + 1] = 0xffff;
	}
	__syncthreads();
}
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/windows/CuSolver.cpp b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuSolver.cpp
new file mode 100644
index 00000000..68238664
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuSolver.cpp
@@ -0,0 +1,556 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#include "foundation/PxProfiler.h"
+#include "CuSolver.h"
+#include "CuCloth.h"
+#include "ClothImpl.h"
+#include "CuFabric.h"
+#include "CuFactory.h"
+#include "CuSolverKernel.h"
+#include "CuContextLock.h"
+#include "CuCheckSuccess.h"
+#include "IterationState.h"
+#include "CudaKernelWrangler.h"
+#include "PsUtilities.h"
+#include "PsSort.h"
+#include "PsFoundation.h"
+
+#if PX_NVTX
+#include "nvToolsExt.h"
+#endif
+
+//#define ENABLE_CUDA_PRINTF PX_DEBUG // warning: not thread safe
+#define ENABLE_CUDA_PRINTF 0
+
+#if ENABLE_CUDA_PRINTF
+extern "C" cudaError_t cudaPrintfInit(CUmodule hmod, size_t bufferLen = 1048576);
+extern "C" void cudaPrintfEnd();
+extern "C" cudaError_t cudaPrintfDisplay(CUmodule hmod, void* outputFP = NULL, bool showThreadID = false);
+#endif
+
+using namespace physx;
+
namespace
{
// name of the solver kernel entry point, queried once for the KernelWrangler interface
const char* gKernelName = cloth::getKernelFunctionName();
}
+
namespace
{
// Minimal allocator for raw device (GPU) memory, backed by the PhysX CUDA
// memory manager. Allocates storage only — no construction/destruction.
template <typename T>
struct CuDeviceAllocator
{
	CuDeviceAllocator(physx::PxCudaContextManager* ctx) : mManager(ctx->getMemoryManager())
	{
	}

	// allocate space for n elements of T in device memory
	T* allocate(size_t n)
	{
		return reinterpret_cast<T*>(mManager->alloc(physx::PxCudaBufferMemorySpace::T_GPU, n * sizeof(T)));
	}

	// return a block obtained from allocate() to the device pool
	void deallocate(T* ptr)
	{
		mManager->free(physx::PxCudaBufferMemorySpace::T_GPU, reinterpret_cast<physx::PxCudaBufferPtr>(ptr));
	}

	physx::PxCudaMemoryManager* mManager;
};
}
+
// Construct a solver bound to the factory's CUDA context: loads the solver
// kernel module, locates its global data symbol, and allocates the device-side
// cloth index counter. On any CUDA error the solver is flagged via mCudaError
// and left inert (simulate() then falls through to the continuation task).
cloth::CuSolver::CuSolver(CuFactory& factory)
: CuContextLock(factory)
, mFactory(factory)
, mClothData(mFactory.mContextManager)
, mClothDataHostCopy(CuHostAllocator(mFactory.mContextManager, cudaHostAllocWriteCombined))
, mClothDataDirty(false)
, mFrameData(getMappedAllocator<CuFrameData>(mFactory.mContextManager))
, mIterationData(getMappedAllocator<CuIterationData>(mFactory.mContextManager))
, mIterationDataBegin(0)
, mFrameDt(0.0f)
, mSharedMemorySize(0)
, mSharedMemoryLimit(0)
, mStartSimulationTask(&CuSolver::beginFrame, "cloth.CuSolver.startSimulation")
, mKernelSimulationTask(&CuSolver::executeKernel, "cloth.CuSolver.kernelSimulation")
, mEndSimulationTask(&CuSolver::endFrame, "cloth.CuSolver.endSimulation")
, mStream(0)
, mKernelModule(0)
, mKernelFunction(0)
, mKernelSharedMemorySize(0)
, mClothIndex(CuDeviceAllocator<uint32_t>(mFactory.mContextManager).allocate(1))
, mInterCollisionDistance(0.0f)
, mInterCollisionStiffness(1.0f)
, mInterCollisionIterations(1)
, mInterCollisionScratchMem(NULL)
, mInterCollisionScratchMemSize(0)
, mKernelWrangler(getDispatcher(), physx::shdfnd::getFoundation().getErrorCallback(), &gKernelName, 1)
, mSimulateNvtxRangeId(0)
, mCudaError(mKernelWrangler.hadError())
{
	if(mCudaError)
	{
		// kernel failed to load: drop the context lock taken by the base class
		CuContextLock::release();
		return;
	}

	// back-pointers so the tasks can dispatch into this solver instance
	mStartSimulationTask.mSolver = this;
	mKernelSimulationTask.mSolver = this;
	mEndSimulationTask.mSolver = this;

	// use a dedicated stream when the context manager allows concurrent streams
	if(mFactory.mContextManager->getUsingConcurrentStreams())
		checkSuccess(cuStreamCreate(&mStream, 0));

	if(1)
	{
		mKernelModule = mKernelWrangler.getCuModule(0);
		mKernelFunction = mKernelWrangler.getCuFunction(0);
	}
	else
	{
		// load from ptx instead of embedded SASS, for iterating without recompile
		// (dead development branch; flip the if(1) above to enable it)
		checkSuccess(cuModuleLoad(&mKernelModule, "CuSolverKernel.ptx"));
		checkSuccess(cuModuleGetFunction(&mKernelFunction, mKernelModule, getKernelFunctionName()));
		shdfnd::getFoundation().error(PX_INFO, "Cloth kernel code loaded from CuSolverKernel.ptx");
	}

	// get amount of statically allocated shared memory
	checkSuccess(cuFuncGetAttribute(&mKernelSharedMemorySize, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, mKernelFunction));

	// extract CuKernelData device pointer
	size_t size = 0;
	CUdeviceptr ptr = 0;
	checkSuccess(cuModuleGetGlobal(&ptr, &size, mKernelModule, getKernelDataName()));
	mKernelData = CuDevicePointer<CuKernelData>(reinterpret_cast<CuKernelData*>(ptr));

	// initialize cloth index
	checkSuccess(cuMemsetD32(mClothIndex.dev(), 0, 1));

	CuContextLock::release();
}
+
cloth::CuSolver::~CuSolver()
{
	// all cloths must have been removed before the solver is destroyed
	PX_ASSERT(mCloths.empty());

	CuContextLock::acquire();

	// zero the device-side kernel data so it holds no stale pointers
	CuKernelData kernelData = {};
	*mKernelData = kernelData;

	CuDeviceAllocator<uint32_t>(mFactory.mContextManager).deallocate(mClothIndex.get());

	if(mStream)
		checkSuccess(cuStreamDestroy(mStream));

	if(mInterCollisionScratchMem)
		PX_FREE(mInterCollisionScratchMem);

	// NOTE(review): no explicit release() after acquire() — presumably the
	// CuContextLock base destructor releases the context; confirm
}
+
// Refresh the kernel's global CuKernelData symbol with the current device
// pointers; must be called whenever any of the backing arrays may have moved.
void cloth::CuSolver::updateKernelData()
{
	CuKernelData kernelData;

	kernelData.mClothIndex = mClothIndex.get();
	kernelData.mClothData = mClothData.begin().get();
	kernelData.mFrameData = getDevicePointer(mFrameData);

	// writes through the CuDevicePointer into device memory
	*mKernelData = kernelData;
}
+
+physx::PxGpuDispatcher& cloth::CuSolver::getDispatcher() const
+{
+ return *mFactory.mContextManager->getGpuDispatcher();
+}
+
+namespace
+{
+struct ClothSimCostGreater
+{
+ bool operator()(const cloth::CuCloth* left, const cloth::CuCloth* right) const
+ {
+ return left->mNumParticles * left->mSolverFrequency > right->mNumParticles * right->mSolverFrequency;
+ }
+};
+}
+
// Register a cloth instance with this solver; the instance must not already
// be registered. Resizes the shared per-cloth containers and refreshes the
// kernel's device pointers.
void cloth::CuSolver::addCloth(Cloth* cloth)
{
	CuCloth& cuCloth = static_cast<CuClothImpl&>(*cloth).mCloth;

	PX_ASSERT(mCloths.find(&cuCloth) == mCloths.end());

	mCloths.pushBack(&cuCloth);
	// trigger update of mClothData array
	cuCloth.notifyChanged();

	// sort cloth instances by size (most expensive first)
	shdfnd::sort(mCloths.begin(), mCloths.size(), ClothSimCostGreater());

	// the resizes below may (re)allocate device memory, so hold the context
	CuContextLock contextLock(mFactory);

	// resize containers and update kernel data
	mClothDataHostCopy.resize(mCloths.size());
	mClothData.resize(mCloths.size());
	mFrameData.resize(mCloths.size());
	updateKernelData();
}
+
// Unregister a cloth instance; silently ignores instances that are not
// registered. The device-side cloth data is not compacted here — setting
// mClothDataDirty triggers a re-upload before the next kernel launch.
void cloth::CuSolver::removeCloth(Cloth* cloth)
{
	CuCloth& cuCloth = static_cast<CuClothImpl&>(*cloth).mCloth;

	ClothVector::Iterator begin = mCloths.begin(), end = mCloths.end();
	ClothVector::Iterator it = mCloths.find(&cuCloth);

	if(it == end)
		return; // not found

	uint32_t index = uint32_t(it - begin);

	mCloths.remove(index);
	mClothDataHostCopy.remove(index);
	mClothData.resize(mCloths.size());
	mClothDataDirty = true;
}
+
// Schedule one simulation frame as a chain of three tasks through the GPU
// dispatcher: beginFrame (pre-launch) -> executeKernel (launch) -> endFrame
// (post-launch) -> continuation. Returns the task the caller should run.
physx::PxBaseTask& cloth::CuSolver::simulate(float dt, physx::PxBaseTask& continuation)
{
	mFrameDt = dt;

	// nothing to simulate, or CUDA is in an error state: complete immediately
	if(mCloths.empty() || mCudaError)
	{
		continuation.addReference();
		return continuation;
	}

	// wire the chain back to front through the dispatcher's pre/post launch tasks
	physx::PxGpuDispatcher& disp = getDispatcher();
	mEndSimulationTask.setContinuation(&continuation);
	disp.addPostLaunchDependent(mEndSimulationTask);
	mKernelSimulationTask.setContinuation(&disp.getPostLaunchTask());
	disp.getPostLaunchTask().removeReference();
	disp.addPreLaunchDependent(mKernelSimulationTask);
	mStartSimulationTask.setContinuation(&disp.getPreLaunchTask());
	disp.getPreLaunchTask().removeReference();

	// drop the references added by addPre/PostLaunchDependent
	mEndSimulationTask.removeReference();
	mKernelSimulationTask.removeReference();

	return mStartSimulationTask;
}
+
// Pre-launch phase: refresh per-cloth data, (re)tune the kernel launch
// configuration when cloth data changed, and build the per-frame and
// per-iteration arrays the kernel consumes. Opens the cross-thread profiler
// zone that endFrame() closes.
void cloth::CuSolver::beginFrame()
{
	CuContextLock contextLock(mFactory);

	PX_PROFILE_START_CROSSTHREAD("cloth.CuSolver.simulate", 0);

	// remember the old buffer start so a reallocation can be detected below
	CuIterationData* iterationDataBegin = mIterationData.empty() ? 0 : &mIterationData.front();

	mFrameData.resize(0);
	mIterationData.resize(0);

	// update cloth data
	ClothVector::Iterator cIt, cEnd = mCloths.end();
	CuPinnedVector<CuClothData>::Type::Iterator dIt = mClothDataHostCopy.begin();
	for(cIt = mCloths.begin(); cIt != cEnd; ++cIt, ++dIt)
		mClothDataDirty |= (*cIt)->updateClothData(*dIt);

	if(mClothDataDirty)
	{
		/* find optimal number of cloths per SM */

		// at least 192 threads per block (e.g. CuCollision::buildAcceleration)
		uint32_t numSMs = (uint32_t)mFactory.mContextManager->getMultiprocessorCount();
		uint32_t maxClothsPerSM = PxMin(mFactory.mMaxThreadsPerBlock / 192, (mCloths.size() + numSMs - 1) / numSMs);

		// tuning parameters: relative performance per numSharedPositions
		float weights[3] = { 0.4f, 0.8f, 1.0f };

		// try all possible number of cloths per SM and estimate performance
		float maxWeightSum = 0.0f;
		uint32_t numClothsPerSM = 0;
		for(uint32_t i = 1; i <= maxClothsPerSM; ++i)
		{
			// dynamic shared memory available per cloth when i cloths share an SM
			uint32_t sharedMemoryLimit = (mFactory.mContextManager->getSharedMemPerBlock() / i) - mKernelSharedMemorySize;

			float weightSum = 0.0f;
			for(cIt = mCloths.begin(); cIt != cEnd; ++cIt)
			{
				uint32_t sharedMemorySize = (*cIt)->mSharedMemorySize;
				uint32_t positionsSize = (*cIt)->mNumParticles * sizeof(PxVec4);

				// configuration doesn't fit this cloth; reject it (cIt != cEnd below)
				if(sharedMemorySize > sharedMemoryLimit)
					break;

				// how many particle position arrays (0..2) fit in shared memory
				uint32_t numSharedPositions = PxMin(2u, (sharedMemoryLimit - sharedMemorySize) / positionsSize);

				weightSum += weights[numSharedPositions] * positionsSize;
			}
			// tuning parameter: inverse performance for running i cloths per SM
			weightSum *= 2.0f + i;

			if(cIt == cEnd && weightSum > maxWeightSum)
			{
				maxWeightSum = weightSum;
				numClothsPerSM = i;
			}
		}
		PX_ASSERT(numClothsPerSM);

		// update block size (rounded down to a multiple of the warp size)
		uint32_t numThreadsPerBlock = mFactory.mMaxThreadsPerBlock / numClothsPerSM & ~31;

		// Workaround for nvbug 1709919: theoretically, register usage should allow us to launch at least
		// mFactory.mMaxThreadsPerBlock threads, because that value corresponds to __launch_bounds__(maxThreadsPerBlock).
		CUdevice device = 0;
		checkSuccess(cuCtxGetDevice(&device));
		int registersPerBlock = 0, kernelRegisterCount = 0;
		checkSuccess(cuDeviceGetAttribute(&registersPerBlock, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, device));
		checkSuccess(cuFuncGetAttribute(&kernelRegisterCount, CU_FUNC_ATTRIBUTE_NUM_REGS, mKernelFunction));
		numThreadsPerBlock = PxMin(numThreadsPerBlock, uint32_t(registersPerBlock / kernelRegisterCount));
		PX_ASSERT(numThreadsPerBlock >= 192);

		if(mFactory.mNumThreadsPerBlock != numThreadsPerBlock)
		{
			checkSuccess(
			    cuFuncSetBlockShape(mKernelFunction, int(mFactory.mNumThreadsPerBlock = numThreadsPerBlock), 1, 1));
		}

		// remember num cloths per SM in terms of max shared memory per block
		mSharedMemoryLimit =
		    (mFactory.mContextManager->getSharedMemPerBlock() / numClothsPerSM) - mKernelSharedMemorySize;
	}

	// build per-frame and per-iteration data for each cloth
	uint32_t maxSharedMemorySize = 0;
	for(cIt = mCloths.begin(); cIt != cEnd; ++cIt)
	{
		CuCloth& cloth = **cIt;

		uint32_t sharedMemorySize = cloth.mSharedMemorySize;
		uint32_t positionsSize = cloth.mNumParticles * sizeof(PxVec4);

		uint32_t numSharedPositions = PxMin(2u, (mSharedMemoryLimit - sharedMemorySize) / positionsSize);

		maxSharedMemorySize = PxMax(maxSharedMemorySize, sharedMemorySize + numSharedPositions * positionsSize);

		IterationStateFactory factory(cloth, mFrameDt);
		IterationState<Simd4f> state = factory.create<Simd4f>(cloth);

		mFrameData.pushBack(CuFrameData(cloth, numSharedPositions, state, mIterationDataBegin + mIterationData.size()));

		// one CuIterationData entry per solver iteration of this frame
		while(state.mRemainingIterations)
		{
			mIterationData.pushBack(CuIterationData(state));
			state.update();
		}
	}
	mSharedMemorySize = maxSharedMemorySize;

	// add dummy element because we read past the end
	mIterationData.pushBack(CuIterationData());

	if(&mIterationData.front() != iterationDataBegin)
	{
		// mIterationData grew, update pointers
		iterationDataBegin = getDevicePointer(mIterationData);

		// patch each frame's iteration pointer by the relocation offset
		ptrdiff_t diff = (char*)iterationDataBegin - (char*)mIterationDataBegin;
		CuPinnedVector<CuFrameData>::Type::Iterator fIt = mFrameData.begin(), fEnd;
		for(fEnd = mFrameData.end(); fIt != fEnd; ++fIt)
			reinterpret_cast<const char*&>(fIt->mIterationData) += diff;

		mIterationDataBegin = iterationDataBegin;
	}
}
+
// Launch phase: upload dirty cloth data and launch the solver kernel with
// one block per cloth, using the block size and dynamic shared memory amount
// computed in beginFrame(). On launch failure the solver enters an error
// state so cloth instances can be migrated to the software solver.
void cloth::CuSolver::executeKernel()
{
	CuContextLock contextLock(mFactory);

#if ENABLE_CUDA_PRINTF
	if(cudaError result = cudaPrintfInit(mKernelModule))
	{
		shdfnd::getFoundation().error(PxErrorCode::eINTERNAL_ERROR, __FILE__, __LINE__, "cudaPrintfInit() returned %u.",
		                              result);
	}
#endif

	// upload cloth data if any instance changed since the last upload
	if(mClothDataDirty)
	{
		PX_ASSERT(mClothDataHostCopy.size() == mClothData.size());
		size_t numBytes = mClothData.size() * sizeof(CuClothData);
		checkSuccess(cuMemcpyHtoDAsync(mClothData.begin().dev(), mClothDataHostCopy.begin(), numBytes, mStream));
		mClothDataDirty = false;
	}

#if 0
	static int frame = 0;
	if(++frame == 100)
		record(*this);
#endif

	// launch kernel: gridDim.x = number of cloths, one block each
	CUresult result = cuLaunchKernel(mKernelFunction, mCloths.size(), 1, 1, mFactory.mNumThreadsPerBlock, 1, 1,
	                                 mSharedMemorySize, mStream, 0, 0);

#if ENABLE_CUDA_PRINTF
	cudaPrintfDisplay(mKernelModule);
	cudaPrintfEnd();
#endif

#if PX_DEBUG
	// in debug builds check kernel result
	checkSuccess(result);
	checkSuccess(cuStreamSynchronize(mStream));
#endif

	// mark the solver as being in an error state
	// all cloth instances will be migrated to software
	if(result != CUDA_SUCCESS)
		mCudaError = true;
}
+
+void cloth::CuSolver::endFrame()
+{
+ CuPinnedVector<CuFrameData>::Type::ConstIterator fIt = mFrameData.begin();
+ ClothVector::Iterator cIt, cEnd = mCloths.end();
+ for(cIt = mCloths.begin(); cIt != cEnd; ++cIt, ++fIt)
+ {
+ CuCloth& cloth = **cIt;
+
+ cloth.mHostParticlesDirty = false;
+ cloth.mDeviceParticlesDirty = false;
+
+ cloth.mMotionConstraints.pop();
+ cloth.mMotionConstraints.mHostCopy.resize(0);
+
+ cloth.mSeparationConstraints.pop();
+ cloth.mSeparationConstraints.mHostCopy.resize(0);
+
+ if(!cloth.mTargetCollisionSpheres.empty())
+ {
+ shdfnd::swap(cloth.mStartCollisionSpheres, cloth.mTargetCollisionSpheres);
+ cloth.mTargetCollisionSpheres.resize(0);
+ }
+
+ if(!cloth.mTargetCollisionPlanes.empty())
+ {
+ shdfnd::swap(cloth.mStartCollisionPlanes, cloth.mTargetCollisionPlanes);
+ cloth.mTargetCollisionPlanes.resize(0);
+ }
+
+ if(!cloth.mTargetCollisionTriangles.empty())
+ {
+ shdfnd::swap(cloth.mStartCollisionTriangles, cloth.mTargetCollisionTriangles);
+ cloth.mTargetCollisionTriangles.resize(0);
+ }
+
+ for(uint32_t i = 0; i < 3; ++i)
+ {
+ float upper = fIt->mParticleBounds[i * 2 + 0];
+ float negativeLower = fIt->mParticleBounds[i * 2 + 1];
+ cloth.mParticleBoundsCenter[i] = (upper - negativeLower) * 0.5f;
+ cloth.mParticleBoundsHalfExtent[i] = (upper + negativeLower) * 0.5f;
+ }
+
+ cloth.mSleepPassCounter = fIt->mSleepPassCounter;
+ cloth.mSleepTestCounter = fIt->mSleepTestCounter;
+ }
+
+ interCollision();
+
+ PX_PROFILE_STOP_CROSSTHREAD("cloth::CuSolver::simulate", 0);
+}
+
// Run particle collision between different cloth instances on the CPU, using
// the host copies of the particle data (marks device copies dirty afterwards).
// No-op unless an inter-collision distance and iteration count are configured.
void cloth::CuSolver::interCollision()
{
	if(!mInterCollisionIterations || mInterCollisionDistance == 0.0f)
		return;

	typedef SwInterCollision<Simd4f> SwInterCollision;

	// rebuild cloth instance array
	mInterCollisionInstances.resize(0);
	for(uint32_t i = 0, n = mCloths.size(); i < n; ++i)
	{
		CuCloth& cloth = *mCloths[i];

		// per-iteration response scale derived from this frame's iteration count
		float elasticity = 1.0f / mFrameData[i].mNumIterations;
		PX_ASSERT(!cloth.mHostParticlesDirty);
		PxVec4* particles = cloth.mParticlesHostCopy.begin();
		uint32_t* indices = NULL, numIndices = cloth.mNumParticles;
		if(!cloth.mSelfCollisionIndices.empty())
		{
			indices = cloth.mSelfCollisionIndicesHost.begin();
			numIndices = uint32_t(cloth.mSelfCollisionIndices.size());
		}

		mInterCollisionInstances.pushBack(SwInterCollisionData(
		    particles, particles + cloth.mNumParticles, numIndices, indices, cloth.mTargetMotion,
		    cloth.mParticleBoundsCenter, cloth.mParticleBoundsHalfExtent, elasticity, cloth.mUserData));

		// particles get modified on the host below; device copy must be refreshed
		cloth.mDeviceParticlesDirty = true;
	}

	uint32_t requiredTempMemorySize = uint32_t(
	    SwInterCollision::estimateTemporaryMemory(&mInterCollisionInstances[0], mInterCollisionInstances.size()));

	// realloc temp memory if necessary
	if(mInterCollisionScratchMemSize < requiredTempMemorySize)
	{
		if(mInterCollisionScratchMem)
			PX_FREE(mInterCollisionScratchMem);

		// NOTE(review): allocation tag says SwSolver — likely copied from the
		// CPU solver; harmless at runtime, but consider renaming to CuSolver
		mInterCollisionScratchMem = PX_ALLOC(requiredTempMemorySize, "cloth::SwSolver::mInterCollisionScratchMem");
		mInterCollisionScratchMemSize = requiredTempMemorySize;
	}

	SwKernelAllocator allocator(mInterCollisionScratchMem, mInterCollisionScratchMemSize);

	// run inter-collision
	SwInterCollision(mInterCollisionInstances.begin(), mInterCollisionInstances.size(), mInterCollisionDistance,
	                 mInterCollisionStiffness, mInterCollisionIterations, mInterCollisionFilter, allocator)();
}
+
// Task that binds a CuSolver member function to a named PxBaseTask;
// mSolver is filled in by the CuSolver constructor.
cloth::CuSolver::ClothSolverTask::ClothSolverTask(FunctionPtr functionPtr, const char* name)
: mSolver(0), mFunctionPtr(functionPtr), mName(name)
{
}

// invoke the bound solver member function
void cloth::CuSolver::ClothSolverTask::runInternal()
{
	(mSolver->*mFunctionPtr)();
}

const char* cloth::CuSolver::ClothSolverTask::getName() const
{
	return mName;
}
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/windows/CuSolver.h b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuSolver.h
new file mode 100644
index 00000000..ff98d975
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuSolver.h
@@ -0,0 +1,180 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#include "Solver.h"
+#include "CuClothData.h"
+#include "CuPinnedAllocator.h"
+#include "CuContextLock.h"
+#include "CuDeviceVector.h"
+#include "CudaKernelWrangler.h"
+#include "CmTask.h"
+
+#include "SwInterCollision.h"
+
+namespace physx
+{
+
+namespace cloth
+{
+
+class CuCloth;
+class CuFabric;
+struct PhaseConfig;
+struct CuKernelData;
+
+// CUDA implementation of the cloth Solver interface. Holds the managed cloth
+// list, pinned-host/device data mirrors, CUDA stream/module/kernel handles,
+// the per-stage scheduler tasks, and the inter-cloth collision parameters
+// exposed through the accessors below.
+class CuSolver : public UserAllocated, private CuContextLock, public Solver
+{
+#if PX_VC
+#pragma warning(push)
+#pragma warning(disable : 4371) // layout of class may have changed from a previous version of the compiler due to
+ // better packing of member
+#endif
+ // Scheduler task wrapper: runInternal() calls (mSolver->*mFunctionPtr)().
+ // The constructor initializes mSolver to 0; it is assigned afterwards.
+ struct ClothSolverTask : public Cm::Task
+ {
+ typedef void (CuSolver::*FunctionPtr)();
+
+ ClothSolverTask(FunctionPtr, const char*);
+ virtual void runInternal();
+ virtual const char* getName() const;
+
+ CuSolver* mSolver; // target solver; 0 until assigned
+ FunctionPtr mFunctionPtr; // member function to invoke
+ const char* mName; // name reported by getName()
+ };
+#if PX_VC
+#pragma warning(pop)
+#endif
+
+ PX_NOCOPY(CuSolver)
+ public:
+ CuSolver(CuFactory&);
+ ~CuSolver();
+
+ virtual void addCloth(Cloth*);
+ virtual void removeCloth(Cloth*);
+
+ // Schedules one frame of simulation of length dt, chained to the given
+ // continuation task (see the Solver interface for the exact contract).
+ virtual physx::PxBaseTask& simulate(float dt, physx::PxBaseTask&);
+
+ // Reports whether a CUDA error has been flagged (mCudaError).
+ virtual bool hasError() const
+ {
+ return mCudaError;
+ }
+
+ // Plain accessors for the inter-cloth collision parameters stored below.
+ virtual void setInterCollisionDistance(float distance)
+ {
+ mInterCollisionDistance = distance;
+ }
+ virtual float getInterCollisionDistance() const
+ {
+ return mInterCollisionDistance;
+ }
+ virtual void setInterCollisionStiffness(float stiffness)
+ {
+ mInterCollisionStiffness = stiffness;
+ }
+ virtual float getInterCollisionStiffness() const
+ {
+ return mInterCollisionStiffness;
+ }
+ virtual void setInterCollisionNbIterations(uint32_t nbIterations)
+ {
+ mInterCollisionIterations = nbIterations;
+ }
+ virtual uint32_t getInterCollisionNbIterations() const
+ {
+ return mInterCollisionIterations;
+ }
+ virtual void setInterCollisionFilter(InterCollisionFilter filter)
+ {
+ mInterCollisionFilter = filter;
+ }
+
+ private:
+ void updateKernelData(); // context needs to be acquired
+
+ // simulate helper functions
+ void beginFrame();
+ void executeKernel();
+ void endFrame();
+
+ // CPU-side inter-cloth collision pass (runs SwInterCollision over the
+ // mInterCollision* members below).
+ void interCollision();
+
+ physx::PxGpuDispatcher& getDispatcher() const;
+
+ private:
+ CuFactory& mFactory; // factory passed at construction
+
+ typedef Vector<CuCloth*>::Type ClothVector;
+ ClothVector mCloths; // cloths managed by this solver -- presumably filled by addCloth(); confirm in .cpp
+
+ CuDeviceVector<CuClothData> mClothData; // per-cloth data on the device
+ CuPinnedVector<CuClothData>::Type mClothDataHostCopy; // pinned host mirror of mClothData
+ bool mClothDataDirty; // NOTE(review): looks like an upload-needed flag -- confirm direction in .cpp
+
+ CuPinnedVector<CuFrameData>::Type mFrameData; // per-cloth frame data (pinned host memory)
+
+ CuPinnedVector<CuIterationData>::Type mIterationData; // per-iteration data (pinned host memory)
+ CuIterationData* mIterationDataBegin; // corresponding device ptr
+
+ float mFrameDt; // delta time of the frame being simulated
+
+ uint32_t mSharedMemorySize;
+ uint32_t mSharedMemoryLimit;
+
+ // one task per simulation stage
+ ClothSolverTask mStartSimulationTask;
+ ClothSolverTask mKernelSimulationTask;
+ ClothSolverTask mEndSimulationTask;
+
+ // CUDA driver handles for the solver kernel
+ CUstream mStream;
+ CUmodule mKernelModule;
+ CUfunction mKernelFunction;
+ int mKernelSharedMemorySize;
+ CuDevicePointer<CuKernelData> mKernelData; // device copy of the kernel argument block
+ CuDevicePointer<uint32_t> mClothIndex; // device-side counter referenced by CuKernelData::mClothIndex
+
+ // inter-cloth collision configuration (see public accessors above)
+ float mInterCollisionDistance;
+ float mInterCollisionStiffness;
+ uint32_t mInterCollisionIterations;
+ InterCollisionFilter mInterCollisionFilter;
+ void* mInterCollisionScratchMem; // scratch buffer reused across interCollision() calls
+ uint32_t mInterCollisionScratchMemSize;
+ shdfnd::Array<SwInterCollisionData> mInterCollisionInstances; // per-cloth inputs fed to SwInterCollision
+
+ physx::KernelWrangler mKernelWrangler;
+
+ uint64_t mSimulateNvtxRangeId; // presumably an NVTX profiling range id for the simulate span -- confirm
+
+ bool mCudaError; // error flag returned by hasError()
+
+ friend void record(const CuSolver&);
+};
+}
+}
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/windows/CuSolverKernel.h b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuSolverKernel.h
new file mode 100644
index 00000000..d6ca350f
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuSolverKernel.h
@@ -0,0 +1,57 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#include "Types.h"
+
+namespace physx
+{
+namespace cloth
+{
+struct CuClothData;
+struct CuFrameData;
+
+// Kernel argument block describing all cloth instances; one CUDA block
+// processes one instance.
+struct CuKernelData
+{
+ // pointer to an atomic variable in device memory -- presumably used by
+ // blocks to claim the next cloth to process; confirm against the kernel
+ uint32_t* mClothIndex;
+
+ // array of cloths (length determined by grid dim)
+ const CuClothData* mClothData;
+
+ // frame data per cloth
+ CuFrameData* mFrameData;
+};
+
+const char* getKernelDataName();
+const char* getKernelFunctionName();
+}
+}