aboutsummaryrefslogtreecommitdiff
path: root/NvCloth/src
diff options
context:
space:
mode:
Diffstat (limited to 'NvCloth/src')
-rw-r--r--NvCloth/src/BoundingBox.h40
-rw-r--r--NvCloth/src/ClothClone.h10
-rw-r--r--NvCloth/src/ClothImpl.h2
-rw-r--r--NvCloth/src/IterationState.h122
-rw-r--r--NvCloth/src/NvSimd/NvSimdTypes.h6
-rw-r--r--NvCloth/src/PointInterpolator.h32
-rw-r--r--NvCloth/src/SwCollision.cpp85
-rw-r--r--NvCloth/src/SwInterCollision.cpp1
-rw-r--r--NvCloth/src/SwSelfCollision.cpp78
-rw-r--r--NvCloth/src/SwSolver.cpp34
-rw-r--r--NvCloth/src/SwSolver.h5
-rw-r--r--NvCloth/src/SwSolverKernel.cpp8
-rw-r--r--NvCloth/src/cuda/CuCloth.cpp2
-rw-r--r--NvCloth/src/cuda/CuClothData.h2
-rw-r--r--NvCloth/src/cuda/CuFabric.cpp1
-rw-r--r--NvCloth/src/cuda/CuPinnedAllocator.h18
-rw-r--r--NvCloth/src/cuda/CuSolver.cpp52
-rw-r--r--NvCloth/src/cuda/CuSolver.h5
-rw-r--r--NvCloth/src/dx/DxBatchedVector.h2
-rw-r--r--NvCloth/src/dx/DxClothData.h2
-rw-r--r--NvCloth/src/dx/DxFactory.cpp42
-rw-r--r--NvCloth/src/dx/DxSolver.cpp68
-rw-r--r--NvCloth/src/dx/DxSolver.h5
-rw-r--r--NvCloth/src/neon/NeonSolverKernel.cpp12
-rw-r--r--NvCloth/src/scalar/SwCollisionHelpers.h3
-rw-r--r--NvCloth/src/sse2/SwCollisionHelpers.h3
26 files changed, 404 insertions, 236 deletions
diff --git a/NvCloth/src/BoundingBox.h b/NvCloth/src/BoundingBox.h
index 74bc0ff..ea84d52 100644
--- a/NvCloth/src/BoundingBox.h
+++ b/NvCloth/src/BoundingBox.h
@@ -37,26 +37,26 @@ namespace nv
namespace cloth
{
-template <typename Simd4f>
+template <typename T4f>
struct BoundingBox
{
- Simd4f mLower;
- Simd4f mUpper;
+ T4f mLower;
+ T4f mUpper;
};
-template <typename Simd4f>
-inline BoundingBox<Simd4f> loadBounds(const float* ptr)
+template <typename T4f>
+inline BoundingBox<T4f> loadBounds(const float* ptr)
{
- BoundingBox<Simd4f> result;
+ BoundingBox<T4f> result;
result.mLower = load(ptr);
result.mUpper = load(ptr + 3);
return result;
}
-template <typename Simd4f>
-inline BoundingBox<Simd4f> emptyBounds()
+template <typename T4f>
+inline BoundingBox<T4f> emptyBounds()
{
- BoundingBox<Simd4f> result;
+ BoundingBox<T4f> result;
result.mLower = gSimd4fFloatMax;
result.mUpper = -result.mLower;
@@ -64,10 +64,10 @@ inline BoundingBox<Simd4f> emptyBounds()
return result;
}
-template <typename Simd4f>
-inline BoundingBox<Simd4f> expandBounds(const BoundingBox<Simd4f>& bounds, const Simd4f* pIt, const Simd4f* pEnd)
+template <typename T4f>
+inline BoundingBox<T4f> expandBounds(const BoundingBox<T4f>& bounds, const T4f* pIt, const T4f* pEnd)
{
- BoundingBox<Simd4f> result = bounds;
+ BoundingBox<T4f> result = bounds;
for (; pIt != pEnd; ++pIt)
{
result.mLower = min(result.mLower, *pIt);
@@ -76,26 +76,26 @@ inline BoundingBox<Simd4f> expandBounds(const BoundingBox<Simd4f>& bounds, const
return result;
}
-template <typename Simd4f>
-inline BoundingBox<Simd4f> expandBounds(const BoundingBox<Simd4f>& a, const BoundingBox<Simd4f>& b)
+template <typename T4f>
+inline BoundingBox<T4f> expandBounds(const BoundingBox<T4f>& a, const BoundingBox<T4f>& b)
{
- BoundingBox<Simd4f> result;
+ BoundingBox<T4f> result;
result.mLower = min(a.mLower, b.mLower);
result.mUpper = max(a.mUpper, b.mUpper);
return result;
}
-template <typename Simd4f>
-inline BoundingBox<Simd4f> intersectBounds(const BoundingBox<Simd4f>& a, const BoundingBox<Simd4f>& b)
+template <typename T4f>
+inline BoundingBox<T4f> intersectBounds(const BoundingBox<T4f>& a, const BoundingBox<T4f>& b)
{
- BoundingBox<Simd4f> result;
+ BoundingBox<T4f> result;
result.mLower = max(a.mLower, b.mLower);
result.mUpper = min(a.mUpper, b.mUpper);
return result;
}
-template <typename Simd4f>
-inline bool isEmptyBounds(const BoundingBox<Simd4f>& a)
+template <typename T4f>
+inline bool isEmptyBounds(const BoundingBox<T4f>& a)
{
return anyGreater(a.mLower, a.mUpper) != 0;
}
diff --git a/NvCloth/src/ClothClone.h b/NvCloth/src/ClothClone.h
index 386fee6..7145da5 100644
--- a/NvCloth/src/ClothClone.h
+++ b/NvCloth/src/ClothClone.h
@@ -29,12 +29,12 @@
#pragma once
-#include "../SwFactory.h"
-#include "../SwFabric.h"
-#include "../SwCloth.h"
+#include "SwFactory.h"
+#include "SwFabric.h"
+#include "SwCloth.h"
-#include "../ClothImpl.h"
-#include "../ClothBase.h"
+#include "ClothImpl.h"
+#include "ClothBase.h"
#include "NvCloth/Allocator.h"
namespace nv
diff --git a/NvCloth/src/ClothImpl.h b/NvCloth/src/ClothImpl.h
index 4d7b28d..24f7732 100644
--- a/NvCloth/src/ClothImpl.h
+++ b/NvCloth/src/ClothImpl.h
@@ -1220,7 +1220,7 @@ inline float ClothImpl<T>::getLiftCoefficient() const
template <typename T>
inline void ClothImpl<T>::setFluidDensity(float fluidDensity)
{
- NV_CLOTH_ASSERT(fluidDensity < 0.f);
+ NV_CLOTH_ASSERT(fluidDensity > 0.f);
if (fluidDensity == mFluidDensity)
return;
diff --git a/NvCloth/src/IterationState.h b/NvCloth/src/IterationState.h
index 224e87e..e18b636 100644
--- a/NvCloth/src/IterationState.h
+++ b/NvCloth/src/IterationState.h
@@ -72,21 +72,21 @@ inline physx::PxQuat exp(const physx::PxVec3& v)
return physx::PxQuat(v.x * scale, v.y * scale, v.z * scale, physx::PxCos(theta));
}
-template <typename Simd4f, uint32_t N>
-inline void assign(Simd4f (&columns)[N], const physx::PxMat44& matrix)
+template <typename T4f, uint32_t N>
+inline void assign(T4f (&columns)[N], const physx::PxMat44& matrix)
{
for (uint32_t i = 0; i < N; ++i)
columns[i] = load(nv::cloth::array(matrix[i]));
}
-template <typename Simd4f>
-inline Simd4f transform(const Simd4f (&columns)[3], const Simd4f& vec)
+template <typename T4f>
+inline T4f transform(const T4f (&columns)[3], const T4f& vec)
{
return splat<0>(vec) * columns[0] + splat<1>(vec) * columns[1] + splat<2>(vec) * columns[2];
}
-template <typename Simd4f>
-inline Simd4f transform(const Simd4f (&columns)[3], const Simd4f& translate, const Simd4f& vec)
+template <typename T4f>
+inline T4f transform(const T4f (&columns)[3], const T4f& translate, const T4f& vec)
{
return translate + splat<0>(vec) * columns[0] + splat<1>(vec) * columns[1] + splat<2>(vec) * columns[2];
}
@@ -99,17 +99,17 @@ struct IterationStateFactory
template <typename MyCloth>
IterationStateFactory(MyCloth& cloth, float frameDt);
- template <typename Simd4f, typename MyCloth>
- IterationState<Simd4f> create(MyCloth const& cloth) const;
+ template <typename T4f, typename MyCloth>
+ IterationState<T4f> create(MyCloth const& cloth) const;
- template <typename Simd4f>
- static Simd4f lengthSqr(Simd4f const& v)
+ template <typename T4f>
+ static T4f lengthSqr(T4f const& v)
{
return dot3(v, v);
}
- template <typename Simd4f>
- static physx::PxVec3 castToPxVec3(const Simd4f& v)
+ template <typename T4f>
+ static physx::PxVec3 castToPxVec3(const T4f& v)
{
return *reinterpret_cast<const physx::PxVec3*>(reinterpret_cast<const char*>(&v));
}
@@ -123,7 +123,7 @@ struct IterationStateFactory
};
/* solver iterations helper functor */
-template <typename Simd4f>
+template <typename T4f>
struct IterationState
{
// call after each iteration
@@ -133,15 +133,15 @@ struct IterationState
inline float getPreviousAlpha() const;
public:
- Simd4f mRotationMatrix[3]; // should rename to 'mRotation'
+ T4f mRotationMatrix[3]; // should rename to 'mRotation'
- Simd4f mCurBias; // in local space
- Simd4f mPrevBias; // in local space
- Simd4f mWind; // delta position per iteration (wind velocity * mIterDt)
+ T4f mCurBias; // in local space
+ T4f mPrevBias; // in local space
+ T4f mWind; // delta position per iteration (wind velocity * mIterDt)
- Simd4f mPrevMatrix[3];
- Simd4f mCurMatrix[3];
- Simd4f mDampScaleUpdate;
+ T4f mPrevMatrix[3];
+ T4f mCurMatrix[3];
+ T4f mDampScaleUpdate;
// iteration counter
uint32_t mRemainingIterations;
@@ -157,14 +157,14 @@ struct IterationState
} // namespace cloth
-template <typename Simd4f>
-inline float cloth::IterationState<Simd4f>::getCurrentAlpha() const
+template <typename T4f>
+inline float cloth::IterationState<T4f>::getCurrentAlpha() const
{
return getPreviousAlpha() + mInvNumIterations;
}
-template <typename Simd4f>
-inline float cloth::IterationState<Simd4f>::getPreviousAlpha() const
+template <typename T4f>
+inline float cloth::IterationState<T4f>::getPreviousAlpha() const
{
return 1.0f - mRemainingIterations * mInvNumIterations;
}
@@ -232,36 +232,36 @@ If you change anything in this function, make sure that ClothCustomFloating and
ClothInertia haven't regressed for any choice of solver frequency.
*/
-template <typename Simd4f, typename MyCloth>
-cloth::IterationState<Simd4f> cloth::IterationStateFactory::create(MyCloth const& cloth) const
+template <typename T4f, typename MyCloth>
+cloth::IterationState<T4f> cloth::IterationStateFactory::create(MyCloth const& cloth) const
{
- IterationState<Simd4f> result;
+ IterationState<T4f> result;
result.mRemainingIterations = static_cast<uint32_t>(mNumIterations);
result.mInvNumIterations = mInvNumIterations;
result.mIterDt = mIterDt;
- Simd4f curLinearVelocity = load(array(cloth.mLinearVelocity));
- Simd4f prevLinearVelocity = load(array(mPrevLinearVelocity));
+ T4f curLinearVelocity = load(array(cloth.mLinearVelocity));
+ T4f prevLinearVelocity = load(array(mPrevLinearVelocity));
- Simd4f iterDt = simd4f(mIterDt);
- Simd4f dampExponent = simd4f(cloth.mStiffnessFrequency) * iterDt;
+ T4f iterDt = simd4f(mIterDt);
+ T4f dampExponent = simd4f(cloth.mStiffnessFrequency) * iterDt;
- Simd4f translation = iterDt * curLinearVelocity;
+ T4f translation = iterDt * curLinearVelocity;
// gravity delta per iteration
- Simd4f gravity = load(array(cloth.mGravity)) * static_cast<Simd4f>(simd4f(sqr(mIterDtAverage)));
+ T4f gravity = load(array(cloth.mGravity)) * static_cast<T4f>(simd4f(sqr(mIterDtAverage)));
// scale of local particle velocity per iteration
- Simd4f dampScale = exp2(load(array(cloth.mLogDamping)) * dampExponent);
+ T4f dampScale = exp2(load(array(cloth.mLogDamping)) * dampExponent);
// adjust for the change in time step during the first iteration
- Simd4f firstDampScale = dampScale * simd4f(mIterDtRatio);
+ T4f firstDampScale = dampScale * simd4f(mIterDtRatio);
// portion of negative frame velocity to transfer to particle
- Simd4f linearDrag = (gSimd4fOne - exp2(load(array(cloth.mLinearLogDrag)) * dampExponent)) * translation;
+ T4f linearDrag = (gSimd4fOne - exp2(load(array(cloth.mLinearLogDrag)) * dampExponent)) * translation;
// portion of frame acceleration to transfer to particle
- Simd4f linearInertia = load(array(cloth.mLinearInertia)) * iterDt * (prevLinearVelocity - curLinearVelocity);
+ T4f linearInertia = load(array(cloth.mLinearInertia)) * iterDt * (prevLinearVelocity - curLinearVelocity);
// for inertia, we want to violate newton physics to
// match velocity and position as given by the user, which means:
@@ -271,13 +271,13 @@ cloth::IterationState<Simd4f> cloth::IterationStateFactory::create(MyCloth const
// specifically, the portion is alpha=(n+1)/2n and 1-alpha.
float linearAlpha = (mNumIterations + 1) * 0.5f * mInvNumIterations;
- Simd4f curLinearInertia = linearInertia * simd4f(linearAlpha);
+ T4f curLinearInertia = linearInertia * simd4f(linearAlpha);
// rotate to local space (use mRotationMatrix temporarily to hold matrix)
physx::PxMat44 invRotation = physx::PxMat44(mCurrentRotation.getConjugate());
assign(result.mRotationMatrix, invRotation);
- Simd4f maskXYZ = simd4f(simd4i(~0, ~0, ~0, 0));
+ T4f maskXYZ = simd4f(simd4i(~0, ~0, ~0, 0));
// Previously, we split the bias between previous and current position to
// get correct disretized position and velocity. However, this made a
@@ -286,23 +286,23 @@ cloth::IterationState<Simd4f> cloth::IterationStateFactory::create(MyCloth const
// timesteps. Instead, we now apply the entire bias to current position
// and accept a less noticeable error for a free falling cloth.
- Simd4f bias = gravity - linearDrag;
+ T4f bias = gravity - linearDrag;
result.mCurBias = transform(result.mRotationMatrix, curLinearInertia + bias) & maskXYZ;
result.mPrevBias = transform(result.mRotationMatrix, linearInertia - curLinearInertia) & maskXYZ;
- Simd4f wind = load(array(cloth.mWind)) * iterDt; // multiply with delta time here already so we don't have to do it inside the solver
+ T4f wind = load(array(cloth.mWind)) * iterDt; // multiply with delta time here already so we don't have to do it inside the solver
result.mWind = transform(result.mRotationMatrix, translation - wind) & maskXYZ;
result.mIsTurning = mPrevAngularVelocity.magnitudeSquared() + cloth.mAngularVelocity.magnitudeSquared() > 0.0f;
if (result.mIsTurning)
{
- Simd4f curAngularVelocity = load(array(invRotation.rotate(cloth.mAngularVelocity)));
- Simd4f prevAngularVelocity = load(array(invRotation.rotate(mPrevAngularVelocity)));
+ T4f curAngularVelocity = load(array(invRotation.rotate(cloth.mAngularVelocity)));
+ T4f prevAngularVelocity = load(array(invRotation.rotate(mPrevAngularVelocity)));
// rotation for one iteration in local space
- Simd4f curInvAngle = -iterDt * curAngularVelocity;
- Simd4f prevInvAngle = -iterDt * prevAngularVelocity;
+ T4f curInvAngle = -iterDt * curAngularVelocity;
+ T4f prevInvAngle = -iterDt * prevAngularVelocity;
physx::PxQuat curInvRotation = exp(castToPxVec3(curInvAngle));
physx::PxQuat prevInvRotation = exp(castToPxVec3(prevInvAngle));
@@ -312,17 +312,17 @@ cloth::IterationState<Simd4f> cloth::IterationStateFactory::create(MyCloth const
assign(result.mRotationMatrix, curMatrix);
- Simd4f angularDrag = gSimd4fOne - exp2(load(array(cloth.mAngularLogDrag)) * dampExponent);
- Simd4f centrifugalInertia = load(array(cloth.mCentrifugalInertia));
- Simd4f angularInertia = load(array(cloth.mAngularInertia));
- Simd4f angularAcceleration = curAngularVelocity - prevAngularVelocity;
+ T4f angularDrag = gSimd4fOne - exp2(load(array(cloth.mAngularLogDrag)) * dampExponent);
+ T4f centrifugalInertia = load(array(cloth.mCentrifugalInertia));
+ T4f angularInertia = load(array(cloth.mAngularInertia));
+ T4f angularAcceleration = curAngularVelocity - prevAngularVelocity;
- Simd4f epsilon = simd4f(sqrtf(FLT_MIN)); // requirement: sqr(epsilon) > 0
- Simd4f velocityLengthSqr = lengthSqr(curAngularVelocity) + epsilon;
- Simd4f dragLengthSqr = lengthSqr(Simd4f(curAngularVelocity * angularDrag)) + epsilon;
- Simd4f centrifugalLengthSqr = lengthSqr(Simd4f(curAngularVelocity * centrifugalInertia)) + epsilon;
- Simd4f accelerationLengthSqr = lengthSqr(angularAcceleration) + epsilon;
- Simd4f inertiaLengthSqr = lengthSqr(Simd4f(angularAcceleration * angularInertia)) + epsilon;
+ T4f epsilon = simd4f(sqrtf(FLT_MIN)); // requirement: sqr(epsilon) > 0
+ T4f velocityLengthSqr = lengthSqr(curAngularVelocity) + epsilon;
+ T4f dragLengthSqr = lengthSqr(T4f(curAngularVelocity * angularDrag)) + epsilon;
+ T4f centrifugalLengthSqr = lengthSqr(T4f(curAngularVelocity * centrifugalInertia)) + epsilon;
+ T4f accelerationLengthSqr = lengthSqr(angularAcceleration) + epsilon;
+ T4f inertiaLengthSqr = lengthSqr(T4f(angularAcceleration * angularInertia)) + epsilon;
float dragScale = array(rsqrt(velocityLengthSqr * dragLengthSqr) * dragLengthSqr)[0];
float inertiaScale =
@@ -337,11 +337,11 @@ cloth::IterationState<Simd4f> cloth::IterationStateFactory::create(MyCloth const
inertiaScale;
// slightly better in ClothCustomFloating than curInvAngle alone
- Simd4f centrifugalVelocity = (prevInvAngle + curInvAngle) * simd4f(0.5f);
- const Simd4f data = lengthSqr(centrifugalVelocity);
+ T4f centrifugalVelocity = (prevInvAngle + curInvAngle) * simd4f(0.5f);
+ const T4f data = lengthSqr(centrifugalVelocity);
float centrifugalSqrLength = array(data)[0] * centrifugalScale;
- Simd4f coriolisVelocity = centrifugalVelocity * simd4f(centrifugalScale);
+ T4f coriolisVelocity = centrifugalVelocity * simd4f(centrifugalScale);
physx::PxMat33 coriolisMatrix = physx::shdfnd::star(castToPxVec3(coriolisVelocity));
const float* dampScalePtr = array(firstDampScale);
@@ -369,7 +369,7 @@ cloth::IterationState<Simd4f> cloth::IterationStateFactory::create(MyCloth const
}
else
{
- Simd4f minusOne = -static_cast<Simd4f>(gSimd4fOne);
+ T4f minusOne = -static_cast<T4f>(gSimd4fOne);
result.mRotationMatrix[0] = minusOne;
result.mPrevMatrix[0] = select(maskXYZ, firstDampScale, minusOne);
}
@@ -380,8 +380,8 @@ cloth::IterationState<Simd4f> cloth::IterationStateFactory::create(MyCloth const
return result;
}
-template <typename Simd4f>
-void cloth::IterationState<Simd4f>::update()
+template <typename T4f>
+void cloth::IterationState<T4f>::update()
{
if (mIsTurning)
{
diff --git a/NvCloth/src/NvSimd/NvSimdTypes.h b/NvCloth/src/NvSimd/NvSimdTypes.h
index 0625332..dd94b40 100644
--- a/NvCloth/src/NvSimd/NvSimdTypes.h
+++ b/NvCloth/src/NvSimd/NvSimdTypes.h
@@ -104,7 +104,7 @@ void foo(const float* ptr)
#define NV_SIMD_INLINE_ASSEMBLER 1
#endif
-/*! \def NV_SIMD_USE_NAMESPACE
+/*! \def NV_CLOTH_NO_SIMD_NAMESPACE
* \brief Set to 1 to define the SIMD library types and functions inside the nvidia::simd namespace.
* By default, the types and functions defined in this header live in the global namespace.
* This is because MSVC (prior to version 12, Visual Studio 2013) does an inferior job at optimizing
@@ -116,11 +116,11 @@ void foo(const float* ptr)
* __m128i are wrapped into structs. Arguments need to be passed by reference in this mode.
* \see NV_SIMD_VECTORCALL, Simd4fArg */
-#if defined NV_SIMD_USE_NAMESPACE&& NV_SIMD_USE_NAMESPACE
+#ifndef NV_CLOTH_NO_SIMD_NAMESPACE
#define NV_SIMD_NAMESPACE_BEGIN \
namespace nv \
{ \
- namespace simd \
+ namespace cloth \
{
#define NV_SIMD_NAMESPACE_END \
} \
diff --git a/NvCloth/src/PointInterpolator.h b/NvCloth/src/PointInterpolator.h
index b9db131..75e1dcf 100644
--- a/NvCloth/src/PointInterpolator.h
+++ b/NvCloth/src/PointInterpolator.h
@@ -37,7 +37,7 @@ namespace cloth
{
// acts as a poor mans random access iterator
-template <typename Simd4f, typename BaseIterator>
+template <typename T4f, typename BaseIterator>
class LerpIterator
{
@@ -50,12 +50,12 @@ class LerpIterator
}
// return the interpolated point at a given index
- inline Simd4f operator[](size_t index) const
+ inline T4f operator[](size_t index) const
{
return mStart[index] + (mTarget[index] - mStart[index]) * mAlpha;
}
- inline Simd4f operator*() const
+ inline T4f operator*() const
{
return (*this)[0];
}
@@ -70,13 +70,13 @@ class LerpIterator
private:
// interpolation parameter
- const Simd4f mAlpha;
+ const T4f mAlpha;
BaseIterator mStart;
BaseIterator mTarget;
};
-template <typename Simd4f, size_t Stride>
+template <typename T4f, size_t Stride>
class UnalignedIterator
{
@@ -87,12 +87,12 @@ class UnalignedIterator
{
}
- inline Simd4f operator[](size_t index) const
+ inline T4f operator[](size_t index) const
{
return load(mPointer + index * Stride);
}
- inline Simd4f operator*() const
+ inline T4f operator*() const
{
return (*this)[0];
}
@@ -109,15 +109,15 @@ class UnalignedIterator
};
// acts as an iterator but returns a constant
-template <typename Simd4f>
+template <typename T4f>
class ConstantIterator
{
public:
- ConstantIterator(const Simd4f& value) : mValue(value)
+ ConstantIterator(const T4f& value) : mValue(value)
{
}
- inline Simd4f operator*() const
+ inline T4f operator*() const
{
return mValue;
}
@@ -129,20 +129,20 @@ class ConstantIterator
private:
ConstantIterator& operator = (const ConstantIterator&);
- const Simd4f mValue;
+ const T4f mValue;
};
// wraps an iterator with constant scale and bias
-template <typename Simd4f, typename BaseIterator>
+template <typename T4f, typename BaseIterator>
class ScaleBiasIterator
{
public:
- ScaleBiasIterator(BaseIterator base, const Simd4f& scale, const Simd4f& bias)
+ ScaleBiasIterator(BaseIterator base, const T4f& scale, const T4f& bias)
: mScale(scale), mBias(bias), mBaseIterator(base)
{
}
- inline Simd4f operator*() const
+ inline T4f operator*() const
{
return (*mBaseIterator) * mScale + mBias;
}
@@ -156,8 +156,8 @@ class ScaleBiasIterator
private:
ScaleBiasIterator& operator = (const ScaleBiasIterator&);
- const Simd4f mScale;
- const Simd4f mBias;
+ const T4f mScale;
+ const T4f mBias;
BaseIterator mBaseIterator;
};
diff --git a/NvCloth/src/SwCollision.cpp b/NvCloth/src/SwCollision.cpp
index 89df8a5..0aa196d 100644
--- a/NvCloth/src/SwCollision.cpp
+++ b/NvCloth/src/SwCollision.cpp
@@ -40,6 +40,7 @@
using namespace nv;
using namespace physx;
+using namespace cloth;
// the particle trajectory needs to penetrate more than 0.2 * radius to trigger continuous collision
template <typename T4f>
@@ -160,31 +161,41 @@ void generateCones(cloth::ConeData* dst, const cloth::SphereData* sourceSpheres,
cloth::ConeData* cIt = dst;
for (const cloth::IndexPair* iIt = capsuleIndices, *iEnd = iIt + numCones; iIt != iEnd; ++iIt, ++cIt)
{
+ // w element contains sphere radii
PxVec4 first = reinterpret_cast<const PxVec4&>(sourceSpheres[iIt->first]);
PxVec4 second = reinterpret_cast<const PxVec4&>(sourceSpheres[iIt->second]);
PxVec4 center = (second + first) * 0.5f;
- PxVec4 axis = (second - first) * 0.5f;
+ PxVec4 axis = (second - first) * 0.5f; //half axis
+	//axis.w = half of radii difference
- float sqrAxisLength = axis.x * axis.x + axis.y * axis.y + axis.z * axis.z;
- float sqrConeLength = sqrAxisLength - cloth::sqr(axis.w);
+ // |Axis|^2
+ float sqrAxisHalfLength = axis.x * axis.x + axis.y * axis.y + axis.z * axis.z;
- float invAxisLength = 1 / sqrtf(sqrAxisLength);
- float invConeLength = 1 / sqrtf(sqrConeLength);
+ // http://jwilson.coe.uga.edu/emt669/Student.Folders/Kertscher.Jeff/Essay.3/Tangents.html
+ // |Axis|^2 = |Cone|^2 + (sphere2Radius-sphere1Radius)^2
+ float sqrConeHalfLength = sqrAxisHalfLength - cloth::sqr(axis.w);
- if (sqrConeLength <= 0.0f)
- invAxisLength = invConeLength = 0.0f;
+ float invAxisHalfLength = 1 / sqrtf(sqrAxisHalfLength);
+ float invConeHalfLength = 1 / sqrtf(sqrConeHalfLength);
- float axisLength = sqrAxisLength * invAxisLength;
- float slope = axis.w * invConeLength;
+ if (sqrConeHalfLength <= 0.0f)
+ invAxisHalfLength = invConeHalfLength = 0.0f;
+
+ float axisHalfLength = sqrAxisHalfLength * invAxisHalfLength;
+ float slope = axis.w * invConeHalfLength;
cIt->center = PxVec3(center.x, center.y, center.z );
- cIt->radius = (axis.w + first.w) * invConeLength * axisLength;
- cIt->axis = PxVec3(axis.x, axis.y, axis.z) * invAxisLength;
+ cIt->radius = (axis.w + first.w) * invConeHalfLength * axisHalfLength; //cone radius in the center
+ cIt->axis = PxVec3(axis.x, axis.y, axis.z) * invAxisHalfLength;
cIt->slope = slope;
- cIt->sqrCosine = 1.0f - cloth::sqr(axis.w * invAxisLength);
- cIt->halfLength = axisLength;
+ // cos()^2 = 1.0 - (radius difference / axis length)^2
+ // cos()^2 = 1.0 - (opposite/hypotenuse)^2
+ // cos()^2 = 1.0 - sin(angle between c2c1 and c2t1)^2
+ // cos()^2 = 1.0 - sin(angle between axis and c2t1)^2
+ cIt->sqrCosine = 1.0f - cloth::sqr(axis.w * invAxisHalfLength);
+ cIt->halfLength = axisHalfLength;
uint32_t firstMask = 0x1u << iIt->first;
cIt->firstMask = firstMask;
@@ -407,12 +418,14 @@ void cloth::SwCollision<T4f>::buildSphereAcceleration(const SphereData* sIt)
{
static const int maxIndex = sGridSize - 1;
+ uint32_t mask = 0x1; //single bit mask for current sphere
const SphereData* sEnd = sIt + mClothData.mNumSpheres;
- for (uint32_t mask = 0x1; sIt != sEnd; ++sIt, mask <<= 1)
+ for (; sIt != sEnd; ++sIt, mask <<= 1)
{
T4f sphere = loadAligned(array(sIt->center));
T4f radius = splat<3>(sphere);
+ //calculate the first and last cell index, for each axis, that contains the sphere
T4i first = intFloor(max((sphere - radius) * mGridScale + mGridBias, gSimd4fZero));
T4i last = intFloor(min((sphere + radius) * mGridScale + mGridBias, sGridLength));
@@ -422,11 +435,14 @@ void cloth::SwCollision<T4f>::buildSphereAcceleration(const SphereData* sIt)
uint32_t* firstIt = reinterpret_cast<uint32_t*>(mSphereGrid);
uint32_t* lastIt = firstIt + 3 * sGridSize;
+ //loop through the 3 axes
for (uint32_t i = 0; i < 3; ++i, firstIt += sGridSize, lastIt += sGridSize)
{
+ //mark the sphere and everything to the right
for (int j = firstIdx[i]; j <= maxIndex; ++j)
firstIt[j] |= mask;
+ //mark the sphere and everything to the left
for (int j = lastIdx[i]; j >= 0; --j)
lastIt[j] |= mask;
}
@@ -469,17 +485,23 @@ void cloth::SwCollision<T4f>::mergeAcceleration(uint32_t* firstIt)
template <typename T4f>
bool cloth::SwCollision<T4f>::buildAcceleration()
{
- // determine sphere bbox
+ // determine single bounding box around all spheres
BoundingBox<T4f> sphereBounds =
expandBounds(emptyBounds<T4f>(), mCurData.mSpheres, mCurData.mSpheres + mClothData.mNumSpheres);
+
+ // determine single bounding box around all particles
BoundingBox<T4f> particleBounds = loadBounds<T4f>(mClothData.mCurBounds);
+
if (mClothData.mEnableContinuousCollision)
{
+ // extend bounds to include movement from previous frame
sphereBounds = expandBounds(sphereBounds, mPrevData.mSpheres, mPrevData.mSpheres + mClothData.mNumSpheres);
particleBounds = expandBounds(particleBounds, loadBounds<T4f>(mClothData.mPrevBounds));
}
BoundingBox<T4f> bounds = intersectBounds(sphereBounds, particleBounds);
+
+ // no collision checks needed if the intersection between particle bounds and sphere bounds is empty
T4f edgeLength = (bounds.mUpper - bounds.mLower) & ~static_cast<T4f>(sMaskW);
if (!allGreaterEqual(edgeLength, gSimd4fZero))
return false;
@@ -490,6 +512,7 @@ bool cloth::SwCollision<T4f>::buildAcceleration()
const T4f expandedEdgeLength = max(expandedUpper - expandedLower, gSimd4fEpsilon);
// make grid minimal thickness and strict upper bound of spheres
+ // grid maps bounds to 0-7 space (sGridLength =~= 8)
mGridScale = sGridLength * recip<1>(expandedEdgeLength);
mGridBias = -expandedLower * mGridScale;
array(mGridBias)[3] = 1.0f; // needed for collideVirtualParticles()
@@ -655,8 +678,8 @@ struct cloth::SwCollision<T4f>::ImpulseAccumulator
mNumCollisions = mNumCollisions + (gSimd4fOne & mask);
}
- T4f mDeltaX, mDeltaY, mDeltaZ;
- T4f mVelX, mVelY, mVelZ;
+ T4f mDeltaX, mDeltaY, mDeltaZ; //depenetration delta
+ T4f mVelX, mVelY, mVelZ; //frame offset of the collision shape (velocity * dt)
T4f mNumCollisions;
};
@@ -684,12 +707,15 @@ FORCE_INLINE void cloth::SwCollision<T4f>::collideSpheres(const T4i& sphereMask,
T4f sqrDistance = gSimd4fEpsilon + deltaX * deltaX + deltaY * deltaY + deltaZ * deltaZ;
T4f negativeScale = gSimd4fOne - rsqrt(sqrDistance) * splat<3>(sphere);
+ // negativeScale = 1 - radius/|position-sphere|
T4f contactMask;
if (!anyGreater(gSimd4fZero, negativeScale, contactMask))
continue;
accum.subtract(deltaX, deltaY, deltaZ, negativeScale, contactMask);
+ // -= delta * negativeScale
+ // = delta - delta * radius/|position-sphere|
if (frictionEnabled)
{
@@ -730,10 +756,13 @@ cloth::SwCollision<T4f>::collideCones(const T4f* __restrict positions, ImpulseAc
T4f center = loadAligned(centerPtr, offset);
+ // offset from center of cone to particle
+ // delta = pos - center
T4f deltaX = positions[0] - splat<0>(center);
T4f deltaY = positions[1] - splat<1>(center);
T4f deltaZ = positions[2] - splat<2>(center);
+ //axis of the cone
T4f axis = loadAligned(axisPtr, offset);
T4f axisX = splat<0>(axis);
@@ -741,12 +770,16 @@ cloth::SwCollision<T4f>::collideCones(const T4f* __restrict positions, ImpulseAc
T4f axisZ = splat<2>(axis);
T4f slope = splat<3>(axis);
+ // project delta onto axis
T4f dot = deltaX * axisX + deltaY * axisY + deltaZ * axisZ;
+ // interpolate radius
T4f radius = dot * slope + splat<3>(center);
// set radius to zero if cone is culled
radius = max(radius, gSimd4fZero) & ~culled;
+ // distance to axis
+ // sqrDistance = |delta|^2 - |dot|^2
T4f sqrDistance = deltaX * deltaX + deltaY * deltaY + deltaZ * deltaZ - dot * dot;
T4i auxiliary = loadAligned(auxiliaryPtr, offset);
@@ -765,6 +798,8 @@ cloth::SwCollision<T4f>::collideCones(const T4f* __restrict positions, ImpulseAc
sqrDistance = max(sqrDistance, gSimd4fEpsilon);
T4f invDistance = rsqrt(sqrDistance);
+
+		//offset base to take slope into account
T4f base = dot + slope * sqrDistance * invDistance;
// force left/rightMask to false if not inside cone
@@ -780,6 +815,7 @@ cloth::SwCollision<T4f>::collideCones(const T4f* __restrict positions, ImpulseAc
shapeMask.mSpheres = shapeMask.mSpheres & ~(firstMask & ~leftMask);
shapeMask.mSpheres = shapeMask.mSpheres & ~(secondMask & ~rightMask);
+ //contact normal direction
deltaX = deltaX - base * axisX;
deltaY = deltaY - base * axisY;
deltaZ = deltaZ - base * axisZ;
@@ -1173,7 +1209,9 @@ PX_INLINE void calculateFrictionImpulse(const T4f& deltaX, const T4f& deltaY, co
T4f ny = deltaY * rcpDelta;
T4f nz = deltaZ * rcpDelta;
- // calculate relative velocity scaled by number of collisions
+ // calculate relative velocity
+ // velXYZ is scaled by one over the number of collisions since all collisions accumulate into
+ // that variable during collision detection
T4f rvx = curPos[0] - prevPos[0] - velX * scale;
T4f rvy = curPos[1] - prevPos[1] - velY * scale;
T4f rvz = curPos[2] - prevPos[2] - velZ * scale;
@@ -1186,7 +1224,7 @@ PX_INLINE void calculateFrictionImpulse(const T4f& deltaX, const T4f& deltaY, co
T4f rvty = rvy - rvn * ny;
T4f rvtz = rvz - rvn * nz;
- // calculate magnitude of vt
+ // calculate magnitude of relative tangential velocity
T4f rcpVt = rsqrt(rvtx * rvtx + rvty * rvty + rvtz * rvtz + gSimd4fEpsilon);
// magnitude of friction impulse (cannot be greater than -vt)
@@ -1206,7 +1244,7 @@ void cloth::SwCollision<T4f>::collideParticles()
const T4f massScale = simd4f(mClothData.mCollisionMassScale);
const bool frictionEnabled = mClothData.mFrictionScale > 0.0f;
- const T4f frictionScale = simd4f(mClothData.mFrictionScale);
+	const T4f frictionScale = simd4f(mClothData.mFrictionScale); //parameter set by user
T4f curPos[4];
T4f prevPos[4];
@@ -1214,16 +1252,20 @@ void cloth::SwCollision<T4f>::collideParticles()
float* __restrict prevIt = mClothData.mPrevParticles;
float* __restrict pIt = mClothData.mCurParticles;
float* __restrict pEnd = pIt + mClothData.mNumParticles * 4;
+ //loop over particles 4 at a time
for (; pIt < pEnd; pIt += 16, prevIt += 16)
{
curPos[0] = loadAligned(pIt, 0);
curPos[1] = loadAligned(pIt, 16);
curPos[2] = loadAligned(pIt, 32);
curPos[3] = loadAligned(pIt, 48);
- transpose(curPos[0], curPos[1], curPos[2], curPos[3]);
+ transpose(curPos[0], curPos[1], curPos[2], curPos[3]); //group values by axis in simd structure
ImpulseAccumulator accum;
+
+ //first collide cones
T4i sphereMask = collideCones(curPos, accum);
+ //pass on hit mask to ignore sphere parts that are inside the cones
collideSpheres(sphereMask, curPos, accum);
T4f mask;
@@ -1267,6 +1309,7 @@ void cloth::SwCollision<T4f>::collideParticles()
curPos[3] = select(mask, curPos[3] * scale, curPos[3]);
}
+ //apply average de-penetration delta
curPos[0] = curPos[0] + accum.mDeltaX * invNumCollisions;
curPos[1] = curPos[1] + accum.mDeltaY * invNumCollisions;
curPos[2] = curPos[2] + accum.mDeltaZ * invNumCollisions;
diff --git a/NvCloth/src/SwInterCollision.cpp b/NvCloth/src/SwInterCollision.cpp
index b9b494f..50be414 100644
--- a/NvCloth/src/SwInterCollision.cpp
+++ b/NvCloth/src/SwInterCollision.cpp
@@ -39,6 +39,7 @@
using namespace nv;
using namespace physx;
+using namespace cloth;
namespace
{
diff --git a/NvCloth/src/SwSelfCollision.cpp b/NvCloth/src/SwSelfCollision.cpp
index 095943d..ec5a166 100644
--- a/NvCloth/src/SwSelfCollision.cpp
+++ b/NvCloth/src/SwSelfCollision.cpp
@@ -37,23 +37,24 @@
#endif
using namespace nv;
+using namespace cloth;
namespace
{
-const Simd4fTupleFactory sMaskXYZ = simd4f(simd4i(~0, ~0, ~0, 0));
-
// returns sorted indices, output needs to be at least 2*(last - first) + 1024
void radixSort(const uint32_t* first, const uint32_t* last, uint16_t* out)
{
+ // this sort uses a radix (bin) size of 256, requiring 4 bins to sort the 32 bit keys
uint16_t n = uint16_t(last - first);
uint16_t* buffer = out + 2 * n;
uint16_t* __restrict histograms[] = { buffer, buffer + 256, buffer + 512, buffer + 768 };
+ //zero the buffer memory used for the 4 buckets
memset(buffer, 0, 1024 * sizeof(uint16_t));
- // build 3 histograms in one pass
+ // build 4 histograms in one pass
for (const uint32_t* __restrict it = first; it != last; ++it)
{
uint32_t key = *it;
@@ -64,7 +65,7 @@ void radixSort(const uint32_t* first, const uint32_t* last, uint16_t* out)
}
// convert histograms to offset tables in-place
- uint16_t sums[4] = {};
+ uint16_t sums[4] = {0, 0, 0, 0};
for (uint32_t i = 0; i < 256; ++i)
{
uint16_t temp0 = uint16_t(histograms[0][i] + sums[0]);
@@ -133,6 +134,7 @@ bool isSelfCollisionEnabled(const cloth::SwCloth& cloth)
return std::min(cloth.mSelfCollisionDistance, -cloth.mSelfCollisionLogStiffness) > 0.0f;
}
+// round x up to the next multiple of 2
inline uint32_t align2(uint32_t x)
{
return (x + 1) & ~1;
@@ -146,7 +148,7 @@ cloth::SwSelfCollision<T4f>::SwSelfCollision(cloth::SwClothData& clothData, clot
{
mCollisionDistance = simd4f(mClothData.mSelfCollisionDistance);
mCollisionSquareDistance = mCollisionDistance * mCollisionDistance;
- mStiffness = sMaskXYZ & static_cast<T4f>(simd4f(mClothData.mSelfCollisionStiffness));
+ mStiffness = gSimd4fMaskXYZ & static_cast<T4f>(simd4f(mClothData.mSelfCollisionStiffness));
}
template <typename T4f>
@@ -170,11 +172,12 @@ void cloth::SwSelfCollision<T4f>::operator()()
uint32_t hashAxis0 = (sweepAxis + 1) % 3;
uint32_t hashAxis1 = (sweepAxis + 2) % 3;
- // reserve 0, 127, and 65535 for sentinel
+ // reserve 0, 255, and 65535 as sentinel values
T4f cellSize = max(mCollisionDistance, simd4f(1.0f / 253) * edgeLength);
array(cellSize)[sweepAxis] = array(edgeLength)[sweepAxis] / 65533;
T4f one = gSimd4fOne;
+ // +1 for sentinel 0 offset
T4f gridSize = simd4f(254.0f);
array(gridSize)[sweepAxis] = 65534.0f;
@@ -194,6 +197,7 @@ void cloth::SwSelfCollision<T4f>::operator()()
// create keys
for (uint32_t i = 0; i < numIndices; ++i)
{
+ // use all particles when no self collision indices are set
uint32_t index = indices ? indices[i] : i;
// grid coordinate
@@ -207,28 +211,32 @@ void cloth::SwSelfCollision<T4f>::operator()()
keys[i] = uint32_t(ptr[sweepAxis] | (ptr[hashAxis0] << 16) | (ptr[hashAxis1] << 24));
}
- // compute sorted keys indices
+ // compute sorted key indices
radixSort(keys, keys + numIndices, sortedIndices);
// snoop histogram: offset of first index with 8 msb > 1 (0 is sentinel)
- uint16_t firstColumnSize = sortedIndices[2 * numIndices + 769];
+ // sortedIndices[2 * numIndices + 768 + 1] is actually histograms[3]+1 from radixSort
+ uint16_t firstColumnSize = sortedIndices[2 * numIndices + 768 + 1];
- // sort keys
+ // sort keys using the sortedIndices
for (uint32_t i = 0; i < numIndices; ++i)
sortedKeys[i] = keys[sortedIndices[i]];
sortedKeys[numIndices] = uint32_t(-1); // sentinel
+ // do user provided index array indirection here if we have one
+ // so we don't need to keep branching for this condition later
if (indices)
{
// sort indices (into no-longer-needed keys array)
- const uint16_t* __restrict permutation = sortedIndices;
+ // the keys array is no longer used so we can reuse it to store indices[sortedIndices[i]]
+ const uint16_t* __restrict oldSortedIndices = sortedIndices;
sortedIndices = reinterpret_cast<uint16_t*>(keys);
for (uint32_t i = 0; i < numIndices; ++i)
- sortedIndices[i] = uint16_t(indices[permutation[i]]);
+ sortedIndices[i] = uint16_t(indices[oldSortedIndices[i]]);
}
// calculate the number of buckets we need to search forward
- const Simd4i data = intFloor(gridScale * mCollisionDistance);
+ const Simd4i data = intFloor(gridScale * mCollisionDistance); //equal to or larger than floor(mCollisionDistance)
uint32_t collisionDistance = 2 + static_cast<uint32_t>(array(data)[sweepAxis]);
// collide particles
@@ -310,7 +318,7 @@ void cloth::SwSelfCollision<T4f>::collideParticles(T4f& pos0, T4f& pos1, const T
T4f ratio = mCollisionDistance * rsqrt(distSqr);
T4f scale = mStiffness * recip(gSimd4fEpsilon + w0 + w1);
- T4f delta = (scale * (diff - diff * ratio)) & sMaskXYZ;
+ T4f delta = (scale * (diff - diff * ratio)) & gSimd4fMaskXYZ;
pos0 = pos0 + delta * w0;
pos1 = pos1 - delta * w1;
@@ -325,42 +333,71 @@ template <bool useRestParticles>
void cloth::SwSelfCollision<T4f>::collideParticles(const uint32_t* keys, uint16_t firstColumnSize,
const uint16_t* indices, uint32_t collisionDistance)
{
+ //keys is an array of bucket keys for the particles
+ //indices is an array of particle indices
+ //collisionDistance is the number of buckets along the sweep axis we need to search after the current one
+
T4f* __restrict particles = reinterpret_cast<T4f*>(mClothData.mCurParticles);
T4f* __restrict restParticles =
useRestParticles ? reinterpret_cast<T4f*>(mClothData.mRestPositions) : particles;
- const uint32_t bucketMask = uint16_t(-1);
+ // the 16 LSBs hold the bucket (sweep-axis cell)
+ const uint32_t bucketMask = 0x0000ffff;
+ // offsets for cells (not along the sweep axis)
+ // [1] [3]-[1] [3] [1]+[3]
const uint32_t keyOffsets[] = { 0, 0x00010000, 0x00ff0000, 0x01000000, 0x01010000 };
const uint32_t* __restrict kFirst[5];
const uint32_t* __restrict kLast[5];
+ /*
+ We use 5 first/last pairs to search the following cells
+ =====================
+ | | | | | |
+ =====================
+ | | | 0 | 1 | |
+ =====================
+ | | 2 | 3 | 4 | |
+ =====================
+ | | | | | |
+ =====================
+ With 0 as the origin.
+ This way collisions won't be double reported.
+ */
+
{
// optimization: scan forward iterator starting points once instead of 9 times
const uint32_t* __restrict kIt = keys;
uint32_t key = *kIt;
+ // clamp firstKey/lastKey to the current bucket's range along the sweep axis
uint32_t firstKey = key - std::min(collisionDistance, key & bucketMask);
uint32_t lastKey = std::min(key + collisionDistance, key | bucketMask);
+ //sweep 0
kFirst[0] = kIt;
+ // advance to the first key in keys that is at or past lastKey
while (*kIt < lastKey)
++kIt;
kLast[0] = kIt;
+ //sweep 1...4
for (uint32_t k = 1; k < 5; ++k)
{
+ // scan forward start point
for (uint32_t n = firstKey + keyOffsets[k]; *kIt < n;)
++kIt;
kFirst[k] = kIt;
+ // scan forward end point
for (uint32_t n = lastKey + keyOffsets[k]; *kIt < n;)
++kIt;
kLast[k] = kIt;
- // jump forward once to second column
- kIt = keys + firstColumnSize;
+ // jump forward once to second column to go from cell offset 1 to 2 quickly
+ if(firstColumnSize)
+ kIt = keys + firstColumnSize;
firstColumnSize = 0;
}
}
@@ -371,7 +408,8 @@ void cloth::SwSelfCollision<T4f>::collideParticles(const uint32_t* keys, uint16_
const uint16_t* __restrict jIt;
const uint16_t* __restrict jEnd;
- for (; iIt != iEnd; ++iIt, ++kFirst[0])
+ //loop through all indices
+ for (; iIt < iEnd; ++iIt, ++kFirst[0])
{
NV_CLOTH_ASSERT(*iIt < mClothData.mNumParticles);
@@ -390,8 +428,8 @@ void cloth::SwSelfCollision<T4f>::collideParticles(const uint32_t* keys, uint16_
++kLast[0];
// process potential colliders of same cell
- jEnd = indices + (kLast[0] - keys);
- for (jIt = iIt + 1; jIt != jEnd; ++jIt)
+ jEnd = indices + (kLast[0] - keys); //calculate index from key pointer
+ for (jIt = iIt + 1; jIt < jEnd; ++jIt)
collideParticles<useRestParticles>(particle, particles[*jIt], restParticle, restParticles[*jIt]);
// process neighbor cells
@@ -407,7 +445,7 @@ void cloth::SwSelfCollision<T4f>::collideParticles(const uint32_t* keys, uint16_
// process potential colliders
jEnd = indices + (kLast[k] - keys);
- for (jIt = indices + (kFirst[k] - keys); jIt != jEnd; ++jIt)
+ for (jIt = indices + (kFirst[k] - keys); jIt < jEnd; ++jIt)
collideParticles<useRestParticles>(particle, particles[*jIt], restParticle, restParticles[*jIt]);
}
diff --git a/NvCloth/src/SwSolver.cpp b/NvCloth/src/SwSolver.cpp
index c7437e1..f0f9152 100644
--- a/NvCloth/src/SwSolver.cpp
+++ b/NvCloth/src/SwSolver.cpp
@@ -50,7 +50,7 @@ bool neonSolverKernel(SwCloth const&, SwClothData&, SwKernelAllocator&, Iteratio
}
using namespace nv;
-
+using namespace cloth;
#if NV_SIMD_SIMD
typedef Simd4f Simd4fType;
#else
@@ -93,12 +93,17 @@ void sortTasks(shdfnd::Array<T, cloth::NonTrackingAllocator>& tasks)
void cloth::SwSolver::addCloth(Cloth* cloth)
{
- SwCloth& swCloth = *static_cast<SwCloth*>(cloth);
-
- mSimulatedCloths.pushBack(SimulatedCloth(swCloth, this));
+ addClothAppend(cloth);
sortTasks(mSimulatedCloths);
+}
- mCloths.pushBack(&swCloth);
+void cloth::SwSolver::addCloths(Range<Cloth*> cloths)
+{
+ for (uint32_t i = 0; i < cloths.size(); ++i)
+ {
+ addClothAppend(*(cloths.begin() + i));
+ }
+ sortTasks(mSimulatedCloths);
}
void cloth::SwSolver::removeCloth(Cloth* cloth)
@@ -221,6 +226,16 @@ void cloth::SwSolver::interCollision()
collider();
}
+void cloth::SwSolver::addClothAppend(Cloth* cloth)
+{
+ SwCloth& swCloth = *static_cast<SwCloth*>(cloth);
+ NV_CLOTH_ASSERT(mCloths.find(&swCloth) == mCloths.end());
+
+ mSimulatedCloths.pushBack(SimulatedCloth(swCloth, this));
+
+ mCloths.pushBack(&swCloth);
+}
+
void cloth::SwSolver::beginFrame() const
{
mSimulateProfileEventData = NV_CLOTH_PROFILE_START_CROSSTHREAD("cloth::SwSolver::simulate", 0);
@@ -287,9 +302,14 @@ void cloth::SwSolver::SimulatedCloth::Simulate()
// construct kernel functor and execute
#if NV_ANDROID
- // if (!neonSolverKernel(cloth, data, allocator, factory))
-#endif
+ if (!neonSolverKernel(*mCloth, data, allocator, factory))
+ {
+ //NV_CLOTH_LOG_WARNING("No NEON CPU support detected. Falling back to scalar types.");
+ SwSolverKernel<Scalar4f>(*mCloth, data, allocator, factory)();
+ }
+#else
SwSolverKernel<Simd4fType>(*mCloth, data, allocator, factory)();
+#endif
data.reconcile(*mCloth); // update cloth
}
diff --git a/NvCloth/src/SwSolver.h b/NvCloth/src/SwSolver.h
index c7b177b..ad58a7c 100644
--- a/NvCloth/src/SwSolver.h
+++ b/NvCloth/src/SwSolver.h
@@ -64,6 +64,7 @@ class SwSolver : public Solver
virtual ~SwSolver();
virtual void addCloth(Cloth*) override;
+ virtual void addCloths(Range<Cloth*> cloths) override;
virtual void removeCloth(Cloth*) override;
virtual int getNumCloths() const override;
virtual Cloth * const * getClothList() const override;
@@ -112,6 +113,10 @@ class SwSolver : public Solver
}
private:
+ // add cloth helper functions
+ void addClothAppend(Cloth* cloth);
+
+ // simulate helper functions
void beginFrame() const;
void endFrame() const;
diff --git a/NvCloth/src/SwSolverKernel.cpp b/NvCloth/src/SwSolverKernel.cpp
index eec7956..2181b1e 100644
--- a/NvCloth/src/SwSolverKernel.cpp
+++ b/NvCloth/src/SwSolverKernel.cpp
@@ -103,6 +103,7 @@ const uint32_t sAvxSupport = getAvxSupport(); // 0: no AVX, 1: AVX, 2: AVX+FMA
#endif
using namespace nv;
+using namespace cloth;
namespace
{
@@ -209,11 +210,14 @@ void constrainMotion(T4f* __restrict curIt, const T4f* __restrict curEnd, const
T4f isPositive;
if (anyGreater(slack, gSimd4fZero, isPositive))
{
- // set invMass to zero if radius is zero
+ // set invMass to zero if radius is zero (xyz will be unchanged)
+ // curPos.w = radius > 0 ? curPos.w : 0
+ // the first three components are compared against -FLT_MAX which is always true
curPos0 = curPos0 & (splat<0>(radius) > sMinusFloatMaxXYZ);
curPos1 = curPos1 & (splat<1>(radius) > sMinusFloatMaxXYZ);
curPos2 = curPos2 & (splat<2>(radius) > sMinusFloatMaxXYZ);
curPos3 = curPos3 & ((radius) > sMinusFloatMaxXYZ);
+ // we don't have to splat the last one as the 4th element is already in the right place
slack = slack * stiffness & isPositive;
@@ -367,7 +371,7 @@ void solveConstraints(float* __restrict posIt, const float* __restrict rIt, cons
}
}
-#if PX_WINDOWS_FAMILY
+#if PX_WINDOWS_FAMILY && NV_SIMD_SSE2
#include "sse2/SwSolveConstraints.h"
#endif
diff --git a/NvCloth/src/cuda/CuCloth.cpp b/NvCloth/src/cuda/CuCloth.cpp
index 3e6175b..4131b04 100644
--- a/NvCloth/src/cuda/CuCloth.cpp
+++ b/NvCloth/src/cuda/CuCloth.cpp
@@ -423,7 +423,7 @@ void CuCloth::clearParticleAccelerations()
{
CuContextLock contextLock(mFactory);
CuDeviceVector<PxVec4>(mFactory.mContext).swap(mParticleAccelerations);
- CuHostVector<PxVec4, CU_MEMHOSTALLOC_DEVICEMAP>::Type().swap(mParticleAccelerationsHostCopy);
+ CuHostVector<PxVec4, CU_MEMHOSTALLOC_DEVICEMAP>::Type(mFactory.mContext).swap(mParticleAccelerationsHostCopy);
wakeUp();
}
diff --git a/NvCloth/src/cuda/CuClothData.h b/NvCloth/src/cuda/CuClothData.h
index dd836fd..5f2d7c6 100644
--- a/NvCloth/src/cuda/CuClothData.h
+++ b/NvCloth/src/cuda/CuClothData.h
@@ -32,7 +32,7 @@
#include <foundation/Px.h>
#ifndef __CUDACC__
-#include "simd.h"
+#include "Simd.h"
#endif
namespace nv
diff --git a/NvCloth/src/cuda/CuFabric.cpp b/NvCloth/src/cuda/CuFabric.cpp
index 9bc20db..957f912 100644
--- a/NvCloth/src/cuda/CuFabric.cpp
+++ b/NvCloth/src/cuda/CuFabric.cpp
@@ -31,6 +31,7 @@
#include "CuContextLock.h"
#include "CuFactory.h"
#include <PsUtilities.h>
+#include <limits.h>
using namespace physx;
diff --git a/NvCloth/src/cuda/CuPinnedAllocator.h b/NvCloth/src/cuda/CuPinnedAllocator.h
index 8b1787b..9939324 100644
--- a/NvCloth/src/cuda/CuPinnedAllocator.h
+++ b/NvCloth/src/cuda/CuPinnedAllocator.h
@@ -29,6 +29,8 @@
#pragma once
+#include <utility>
+
#include "CuCheckSuccess.h"
#include "NvCloth/Allocator.h"
@@ -102,7 +104,7 @@ public:
void destroy(T* ptr)
{
- core::unused(ptr);
+ PX_UNUSED(ptr);
ptr->~T();
}
@@ -122,13 +124,13 @@ bool operator!=(const CuHostAllocator<T1, Flag1>&, const CuHostAllocator<T2, Fla
return false;
}
-//Use CuHostVectorImpl instead of physx::shdfnd::Array<T, typename CuHostAllocator<T, Flags>>
+//Use CuHostVectorImpl instead of physx::shdfnd::Array<T, CuHostAllocator<T, Flags>>
//This entire class is just to make sure that the mDevicePtr from the CuHostAllocator is properly swapped together with mData
template <typename T, unsigned Flags = 0>
-class CuHostVectorImpl : public physx::shdfnd::Array<T, typename CuHostAllocator<T, Flags>>
+class CuHostVectorImpl : public physx::shdfnd::Array<T, CuHostAllocator<T, Flags>>
{
- typedef physx::shdfnd::Array<T, typename CuHostAllocator<T, Flags>> Super;
- typedef typename CuHostAllocator<T, Flags> Alloc;
+ typedef physx::shdfnd::Array<T, CuHostAllocator<T, Flags>> Super;
+ typedef CuHostAllocator<T, Flags> Alloc;
public:
explicit CuHostVectorImpl(const physx::PxEMPTY v):Super(v){}
PX_INLINE explicit CuHostVectorImpl(const Alloc& alloc = Alloc()):Super(alloc){}
@@ -142,10 +144,10 @@ public:
PX_INLINE explicit CuHostVectorImpl(const T* first, const T* last, const Alloc& alloc = Alloc()):Super(first,last,alloc){}
- void swap(physx::shdfnd::Array<T, typename CuHostAllocator<T, Flags>>& other)
+ void swap(CuHostVectorImpl<T, Flags>& other)
{
- PX_ASSERT(mContext == other.mContext);
- physx::shdfnd::swap(mDevicePtr, other.mDevicePtr);
+ NV_CLOTH_ASSERT(this->mContext == other.mContext);
+ physx::shdfnd::swap(this->mDevicePtr, other.mDevicePtr);
Super::swap(other);
}
};
diff --git a/NvCloth/src/cuda/CuSolver.cpp b/NvCloth/src/cuda/CuSolver.cpp
index f0e328f..7ef1d32 100644
--- a/NvCloth/src/cuda/CuSolver.cpp
+++ b/NvCloth/src/cuda/CuSolver.cpp
@@ -302,6 +302,28 @@ cloth::CuSolver::~CuSolver()
mFactory.mSolverCount--;
}
+void cloth::CuSolver::addClothAppend(Cloth* cloth)
+{
+ CuCloth& cuCloth = *static_cast<CuCloth*>(cloth);
+
+ NV_CLOTH_ASSERT(mCloths.find(&cuCloth) == mCloths.end());
+
+ mCloths.pushBack(&cuCloth);
+ // trigger update of mClothData array
+ cuCloth.notifyChanged();
+}
+
+void cloth::CuSolver::addClothUpdateData()
+{
+ CuContextLock contextLock(mFactory);
+
+ // resize containers and update kernel data
+ mClothDataHostCopy.resize(mCloths.size());
+ mClothData.resize(mCloths.size());
+ mFrameData.resize(mCloths.size());
+ updateKernelData();
+}
+
void cloth::CuSolver::updateKernelData()
{
mKernelDataHost.mClothIndex = mClothIndex.get();
@@ -326,24 +348,17 @@ struct ClothSimCostGreater
void cloth::CuSolver::addCloth(Cloth* cloth)
{
- CuCloth& cuCloth = *static_cast<CuCloth*>(cloth);
-
- NV_CLOTH_ASSERT(mCloths.find(&cuCloth) == mCloths.end());
-
- mCloths.pushBack(&cuCloth);
- // trigger update of mClothData array
- cuCloth.notifyChanged();
-
- // sort cloth instances by size
- shdfnd::sort(mCloths.begin(), mCloths.size(), ClothSimCostGreater(), NonTrackingAllocator());
-
- CuContextLock contextLock(mFactory);
+ addClothAppend(cloth);
+ addClothUpdateData();
+}
- // resize containers and update kernel data
- mClothDataHostCopy.resize(mCloths.size());
- mClothData.resize(mCloths.size());
- mFrameData.resize(mCloths.size());
- updateKernelData();
+void cloth::CuSolver::addCloths(Range<Cloth*> cloths)
+{
+ for (uint32_t i = 0; i < cloths.size(); ++i)
+ {
+ addClothAppend(*(cloths.begin() + i));
+ }
+ addClothUpdateData();
}
void cloth::CuSolver::removeCloth(Cloth* cloth)
@@ -401,7 +416,8 @@ void cloth::CuSolver::endSimulation()
int cloth::CuSolver::getSimulationChunkCount() const
{
- return 1;
+ // 0 chunks when no cloth present in the solver, 1 otherwise
+ return getNumCloths() != 0;
}
void cloth::CuSolver::beginFrame()
diff --git a/NvCloth/src/cuda/CuSolver.h b/NvCloth/src/cuda/CuSolver.h
index 0406e00..b4c6d6b 100644
--- a/NvCloth/src/cuda/CuSolver.h
+++ b/NvCloth/src/cuda/CuSolver.h
@@ -58,6 +58,7 @@ public:
~CuSolver();
virtual void addCloth(Cloth*) override;
+ virtual void addCloths(Range<Cloth*> cloths) override;
virtual void removeCloth(Cloth*) override;
virtual int getNumCloths() const override;
virtual Cloth * const * getClothList() const override;
@@ -103,6 +104,10 @@ public:
}
private:
+ // add cloth helper functions
+ void addClothAppend(Cloth* cloth);
+ void addClothUpdateData();
+
void updateKernelData(); // context needs to be acquired
// simulate helper functions
diff --git a/NvCloth/src/dx/DxBatchedVector.h b/NvCloth/src/dx/DxBatchedVector.h
index 2c5e313..76b9b22 100644
--- a/NvCloth/src/dx/DxBatchedVector.h
+++ b/NvCloth/src/dx/DxBatchedVector.h
@@ -297,7 +297,7 @@ class DxBatchedVector
void swap(DxBatchedVector<T>& other)
{
- PX_ASSERT(&mStorage == &other.mStorage);
+ NV_CLOTH_ASSERT(&mStorage == &other.mStorage);
physx::shdfnd::swap(mOffset, other.mOffset);
physx::shdfnd::swap(mSize, other.mSize);
physx::shdfnd::swap(mCapacity, other.mCapacity);
diff --git a/NvCloth/src/dx/DxClothData.h b/NvCloth/src/dx/DxClothData.h
index f91d37d..4da9be2 100644
--- a/NvCloth/src/dx/DxClothData.h
+++ b/NvCloth/src/dx/DxClothData.h
@@ -31,7 +31,7 @@
#pragma once
#include <foundation/Px.h>
-#include "simd.h"
+#include "Simd.h"
namespace nv
{
diff --git a/NvCloth/src/dx/DxFactory.cpp b/NvCloth/src/dx/DxFactory.cpp
index fbf0c51..91f5125 100644
--- a/NvCloth/src/dx/DxFactory.cpp
+++ b/NvCloth/src/dx/DxFactory.cpp
@@ -251,15 +251,15 @@ void cloth::DxFactory::extractFabricData(const Fabric& fabric, Range<uint32_t> p
void cloth::DxFactory::extractCollisionData(const Cloth& cloth, Range<PxVec4> spheres, Range<uint32_t> capsules,
Range<PxVec4> planes, Range<uint32_t> convexes, Range<PxVec3> triangles) const
{
- PX_ASSERT(&cloth.getFactory() == this);
+ NV_CLOTH_ASSERT(&cloth.getFactory() == this);
const DxCloth& dxCloth = static_cast<const DxCloth&>(cloth);
- PX_ASSERT(spheres.empty() || spheres.size() == dxCloth.mStartCollisionSpheres.size());
- PX_ASSERT(capsules.empty() || capsules.size() == dxCloth.mCapsuleIndices.size() * 2);
- PX_ASSERT(planes.empty() || planes.size() == dxCloth.mStartCollisionPlanes.size());
- PX_ASSERT(convexes.empty() || convexes.size() == dxCloth.mConvexMasks.size());
- PX_ASSERT(triangles.empty() || triangles.size() == dxCloth.mStartCollisionTriangles.size());
+ NV_CLOTH_ASSERT(spheres.empty() || spheres.size() == dxCloth.mStartCollisionSpheres.size());
+ NV_CLOTH_ASSERT(capsules.empty() || capsules.size() == dxCloth.mCapsuleIndices.size() * 2);
+ NV_CLOTH_ASSERT(planes.empty() || planes.size() == dxCloth.mStartCollisionPlanes.size());
+ NV_CLOTH_ASSERT(convexes.empty() || convexes.size() == dxCloth.mConvexMasks.size());
+ NV_CLOTH_ASSERT(triangles.empty() || triangles.size() == dxCloth.mStartCollisionTriangles.size());
// collision spheres are in pinned memory, so memcpy directly
if (!dxCloth.mStartCollisionSpheres.empty() && !spheres.empty())
@@ -296,13 +296,13 @@ void cloth::DxFactory::extractFabricData(const Fabric& fabric, Range<uint32_t> p
void cloth::DxFactory::extractMotionConstraints(const Cloth& cloth, Range<PxVec4> destConstraints) const
{
- PX_ASSERT(&cloth.getFactory() == this);
+ NV_CLOTH_ASSERT(&cloth.getFactory() == this);
const DxCloth& dxCloth = static_cast<const DxCloth&>(cloth);
if (dxCloth.mMotionConstraints.mHostCopy.size())
{
- PX_ASSERT(destConstraints.size() == dxCloth.mMotionConstraints.mHostCopy.size());
+ NV_CLOTH_ASSERT(destConstraints.size() == dxCloth.mMotionConstraints.mHostCopy.size());
memcpy(destConstraints.begin(), dxCloth.mMotionConstraints.mHostCopy.begin(),
sizeof(PxVec4) * dxCloth.mMotionConstraints.mHostCopy.size());
@@ -315,20 +315,20 @@ void cloth::DxFactory::extractFabricData(const Fabric& fabric, Range<uint32_t> p
? dxCloth.mMotionConstraints.mTarget
: dxCloth.mMotionConstraints.mStart;
- PX_ASSERT(destConstraints.size() == srcConstraints.size());
+ NV_CLOTH_ASSERT(destConstraints.size() == srcConstraints.size());
copyToHost(destConstraints.begin(), srcConstraints.buffer(), 0, destConstraints.size() * sizeof(PxVec4));
}
}
void cloth::DxFactory::extractSeparationConstraints(const Cloth& cloth, Range<PxVec4> destConstraints) const
{
- PX_ASSERT(&cloth.getFactory() == this);
+ NV_CLOTH_ASSERT(&cloth.getFactory() == this);
const DxCloth& dxCloth = static_cast<const DxCloth&>(cloth);
if (dxCloth.mSeparationConstraints.mHostCopy.size())
{
- PX_ASSERT(destConstraints.size() == dxCloth.mSeparationConstraints.mHostCopy.size());
+ NV_CLOTH_ASSERT(destConstraints.size() == dxCloth.mSeparationConstraints.mHostCopy.size());
memcpy(destConstraints.begin(), dxCloth.mSeparationConstraints.mHostCopy.begin(),
sizeof(PxVec4) * dxCloth.mSeparationConstraints.mHostCopy.size());
@@ -341,7 +341,7 @@ void cloth::DxFactory::extractFabricData(const Fabric& fabric, Range<uint32_t> p
? dxCloth.mSeparationConstraints.mTarget
: dxCloth.mSeparationConstraints.mStart;
- PX_ASSERT(destConstraints.size() == srcConstraints.size());
+ NV_CLOTH_ASSERT(destConstraints.size() == srcConstraints.size());
copyToHost(destConstraints.begin(), srcConstraints.buffer(), 0, destConstraints.size() * sizeof(PxVec4));
}
@@ -350,12 +350,12 @@ void cloth::DxFactory::extractFabricData(const Fabric& fabric, Range<uint32_t> p
void cloth::DxFactory::extractParticleAccelerations(const Cloth& cloth, Range<PxVec4> destAccelerations) const
{
/*
- PX_ASSERT(&cloth.getFactory() == this);
+ NV_CLOTH_ASSERT(&cloth.getFactory() == this);
const DxCloth& dxCloth = static_cast<const DxClothImpl&>(cloth).mCloth;
if (dxCloth.mParticleAccelerationsHostCopy.size())
{
- PX_ASSERT(dxCloth.mParticleAccelerationsHostCopy.size());
+ NV_CLOTH_ASSERT(dxCloth.mParticleAccelerationsHostCopy.size());
memcpy(destAccelerations.begin(), dxCloth.mParticleAccelerationsHostCopy.begin(),
sizeof(PxVec4) * dxCloth.mParticleAccelerationsHostCopy.size());
@@ -366,20 +366,20 @@ void cloth::DxFactory::extractFabricData(const Fabric& fabric, Range<uint32_t> p
DxBatchedVector<PxVec4> const& srcAccelerations = dxCloth.mParticleAccelerations;
- PX_ASSERT(destAccelerations.size() == srcAccelerations.size());
+ NV_CLOTH_ASSERT(destAccelerations.size() == srcAccelerations.size());
copyToHost(destAccelerations.begin(), srcAccelerations.buffer(), 0, destAccelerations.size() * sizeof(PxVec4));
}
*/
PX_UNUSED(&cloth);
PX_UNUSED(&destAccelerations);
- PX_ASSERT(0);
+ NV_CLOTH_ASSERT(0);
}
void cloth::DxFactory::extractVirtualParticles(const Cloth& cloth, Range<uint32_t[4]> destIndices,
Range<PxVec3> destWeights) const
{
- PX_ASSERT(&cloth.getFactory() == this);
+ NV_CLOTH_ASSERT(&cloth.getFactory() == this);
DxContextLock contextLock(*this);
@@ -400,7 +400,7 @@ void cloth::DxFactory::extractFabricData(const Fabric& fabric, Range<uint32_t> p
for (; srcIt != srcEnd; ++srcIt, ++destIt)
*destIt = reinterpret_cast<const PxVec3&>(*srcIt);
- PX_ASSERT(destIt <= destWeights.end());
+ NV_CLOTH_ASSERT(destIt <= destWeights.end());
}
if (destIndices.size() > 0)
@@ -418,14 +418,14 @@ void cloth::DxFactory::extractFabricData(const Fabric& fabric, Range<uint32_t> p
for (; srcIt != srcEnd; ++srcIt, ++destIt)
*destIt = Vec4u(*srcIt);
- PX_ASSERT(&array(*destIt) <= destIndices.end());
+ NV_CLOTH_ASSERT(&array(*destIt) <= destIndices.end());
}
}
void cloth::DxFactory::extractSelfCollisionIndices(const Cloth& cloth, Range<uint32_t> destIndices) const
{
const DxCloth& dxCloth = static_cast<const DxCloth&>(cloth);
- PX_ASSERT(destIndices.size() == dxCloth.mSelfCollisionIndices.size());
+ NV_CLOTH_ASSERT(destIndices.size() == dxCloth.mSelfCollisionIndices.size());
intrinsics::memCopy(destIndices.begin(), dxCloth.mSelfCollisionIndicesHost.begin(),
destIndices.size() * sizeof(uint32_t));
}
@@ -433,7 +433,7 @@ void cloth::DxFactory::extractFabricData(const Fabric& fabric, Range<uint32_t> p
void cloth::DxFactory::extractRestPositions(const Cloth& cloth, Range<PxVec4> destRestPositions) const
{
const DxCloth& dxCloth = static_cast<const DxCloth&>(cloth);
- PX_ASSERT(destRestPositions.size() == dxCloth.mRestPositions.size());
+ NV_CLOTH_ASSERT(destRestPositions.size() == dxCloth.mRestPositions.size());
intrinsics::memCopy(destRestPositions.begin(), DxCloth::MappedVec4fVectorType(const_cast<DxCloth&>(dxCloth).mRestPositions).begin(),
destRestPositions.size() * sizeof(PxVec4));
}
diff --git a/NvCloth/src/dx/DxSolver.cpp b/NvCloth/src/dx/DxSolver.cpp
index ab030d5..66a8d8f 100644
--- a/NvCloth/src/dx/DxSolver.cpp
+++ b/NvCloth/src/dx/DxSolver.cpp
@@ -113,26 +113,9 @@ struct ClothSimCostGreater
void cloth::DxSolver::addCloth(Cloth* cloth)
{
- DxCloth& dxCloth = static_cast<DxCloth&>(*cloth);
-
- NV_CLOTH_ASSERT(mCloths.find(&dxCloth) == mCloths.end());
-
- mCloths.pushBack(&dxCloth);
- // trigger update of mClothData array
- dxCloth.notifyChanged();
-
- // sort cloth instances by size
- shdfnd::sort(mCloths.begin(), mCloths.size(), ClothSimCostGreater(), NonTrackingAllocator());
-
- DxContextLock contextLock(mFactory);
-
- // resize containers and update kernel data
- mClothDataHostCopy.resize(mCloths.size());
- mClothData.resize(mCloths.size());
- mFrameDataHostCopy.resize(mCloths.size());
-
- // lazy compilation of compute shader
- mComputeError |= mFactory.mSolverKernelComputeShader == nullptr;
+ addClothAppend(cloth);
+ addClothUpdateData();
+
#if 0
if (!mSortComputeShader && !mComputeError)
{
@@ -197,7 +180,7 @@ void cloth::DxSolver::addCloth(Cloth* cloth)
{
uint32_t key = sortElems[i] & ~0xffff;
uint32_t keyRef = _SortElemsRef[i] & ~0xffff;
- PX_ASSERT(key == keyRef);
+ NV_CLOTH_ASSERT(key == keyRef);
}
_SortElemsHostCopy.unmap();
}
@@ -208,6 +191,15 @@ void cloth::DxSolver::addCloth(Cloth* cloth)
#endif
}
+void cloth::DxSolver::addCloths(Range<Cloth*> cloths)
+{
+ for (uint32_t i = 0; i < cloths.size(); ++i)
+ {
+ addClothAppend(*(cloths.begin() + i));
+ }
+ addClothUpdateData();
+}
+
void cloth::DxSolver::removeCloth(Cloth* cloth)
{
DxCloth& dxCloth = static_cast<DxCloth&>(*cloth);
@@ -232,10 +224,9 @@ int cloth::DxSolver::getNumCloths() const
}
cloth::Cloth * const * cloth::DxSolver::getClothList() const
{
- if(getNumCloths())
+ if (getNumCloths() != 0)
return reinterpret_cast<Cloth* const*>(&mCloths[0]);
- else
- return nullptr;
+ return nullptr;
}
bool cloth::DxSolver::beginSimulation(float dt)
@@ -260,7 +251,34 @@ void cloth::DxSolver::endSimulation()
}
int cloth::DxSolver::getSimulationChunkCount() const
{
- return 1;
+ // 0 chunks when no cloth present in the solver, 1 otherwise
+ return getNumCloths() != 0;
+}
+
+void cloth::DxSolver::addClothAppend(Cloth* cloth)
+{
+ DxCloth& dxCloth = static_cast<DxCloth&>(*cloth);
+ NV_CLOTH_ASSERT(mCloths.find(&dxCloth) == mCloths.end());
+
+ mCloths.pushBack(&dxCloth);
+ // trigger update of mClothData array
+ dxCloth.notifyChanged();
+}
+
+void cloth::DxSolver::addClothUpdateData()
+{
+ // sort cloth instances by size
+ shdfnd::sort(mCloths.begin(), mCloths.size(), ClothSimCostGreater(), NonTrackingAllocator());
+
+ DxContextLock contextLock(mFactory);
+
+ // resize containers and update kernel data
+ mClothDataHostCopy.resize(mCloths.size());
+ mClothData.resize(mCloths.size());
+ mFrameDataHostCopy.resize(mCloths.size());
+
+ // lazy compilation of compute shader
+ mComputeError |= mFactory.mSolverKernelComputeShader == nullptr;
}
void cloth::DxSolver::beginFrame()
diff --git a/NvCloth/src/dx/DxSolver.h b/NvCloth/src/dx/DxSolver.h
index 09f523a..07d77dc 100644
--- a/NvCloth/src/dx/DxSolver.h
+++ b/NvCloth/src/dx/DxSolver.h
@@ -56,6 +56,7 @@ class DxSolver : private DxContextLock, public Solver
~DxSolver();
virtual void addCloth(Cloth*) override;
+ virtual void addCloths(Range<Cloth*> cloths) override;
virtual void removeCloth(Cloth*) override;
virtual int getNumCloths() const override;
virtual Cloth * const * getClothList() const override;
@@ -101,6 +102,10 @@ class DxSolver : private DxContextLock, public Solver
}
private:
+ // add cloth helper functions
+ void addClothAppend(Cloth* cloth);
+ void addClothUpdateData();
+
// simulate helper functions
void beginFrame();
void executeKernel();
diff --git a/NvCloth/src/neon/NeonSolverKernel.cpp b/NvCloth/src/neon/NeonSolverKernel.cpp
index 4d6de68..3e16b6f 100644
--- a/NvCloth/src/neon/NeonSolverKernel.cpp
+++ b/NvCloth/src/neon/NeonSolverKernel.cpp
@@ -35,15 +35,19 @@
#include <cpu-features.h>
-namespace physx
+namespace
+{
+ const bool sNeonSupport = ANDROID_CPU_ARM_FEATURE_NEON & android_getCpuFeatures();
+}
+
+namespace nv
{
namespace cloth
{
bool neonSolverKernel(SwCloth const& cloth, SwClothData& data, SwKernelAllocator& allocator,
- IterationStateFactory& factory, PxProfileZone* profileZone)
+ IterationStateFactory& factory)
{
- return ANDROID_CPU_ARM_FEATURE_NEON & android_getCpuFeatures() &&
- (SwSolverKernel<Simd4f>(cloth, data, allocator, factory, profileZone)(), true);
+ return sNeonSupport && (SwSolverKernel<Simd4f>(cloth, data, allocator, factory)(), true);
}
}
}
diff --git a/NvCloth/src/scalar/SwCollisionHelpers.h b/NvCloth/src/scalar/SwCollisionHelpers.h
index af21812..3ab756f 100644
--- a/NvCloth/src/scalar/SwCollisionHelpers.h
+++ b/NvCloth/src/scalar/SwCollisionHelpers.h
@@ -29,6 +29,8 @@
#pragma once
+#include "PsMathUtils.h"
+
namespace nv
{
namespace cloth
@@ -46,6 +48,7 @@ uint32_t findBitSet(uint32_t mask)
inline Scalar4i intFloor(const Scalar4f& v)
{
+ using physx::shdfnd::floor;
return Scalar4i(int(floor(v.f4[0])), int(floor(v.f4[1])), int(floor(v.f4[2])), int(floor(v.f4[3])));
}
diff --git a/NvCloth/src/sse2/SwCollisionHelpers.h b/NvCloth/src/sse2/SwCollisionHelpers.h
index c80ba1d..b759868 100644
--- a/NvCloth/src/sse2/SwCollisionHelpers.h
+++ b/NvCloth/src/sse2/SwCollisionHelpers.h
@@ -63,12 +63,15 @@ Simd4i intFloor(const Simd4f& v)
{
Simd4i i = _mm_cvttps_epi32(v);
return _mm_sub_epi32(i, _mm_srli_epi32(simd4i(v), 31));
+ //Simd4i i = truncate(v);
+ //return i - (simd4i(v) >> 31);
}
Simd4i horizontalOr(const Simd4i& mask)
{
Simd4i tmp = mask | _mm_shuffle_epi32(mask, 0xb1); // w z y x -> z w x y
return tmp | _mm_shuffle_epi32(tmp, 0x4e); // w z y x -> y x w z
+// return splat<0>(mask) | splat<1>(mask) | splat<2>(mask) | splat<3>(mask);
}
Gather<Simd4i>::Gather(const Simd4i& index)