aboutsummaryrefslogtreecommitdiff
path: root/NvCloth/src
diff options
context:
space:
mode:
authorMarijn Tamis <[email protected]>2018-05-03 18:22:48 +0200
committerMarijn Tamis <[email protected]>2018-05-03 18:22:48 +0200
commitca32c59a58d37c1822e185a2d5f3d0d3e8943593 (patch)
treeb06b9eec03f34344ef8fc31aa147b2714d3962ee /NvCloth/src
parentForced rename of platform folders in cmake dir. Git didn't pick this up before. (diff)
downloadnvcloth-ca32c59a58d37c1822e185a2d5f3d0d3e8943593.tar.xz
nvcloth-ca32c59a58d37c1822e185a2d5f3d0d3e8943593.zip
NvCloth 1.1.4 Release. (24070740)
Diffstat (limited to 'NvCloth/src')
-rw-r--r--NvCloth/src/ClothBase.h1
-rw-r--r--NvCloth/src/ClothImpl.h21
-rw-r--r--NvCloth/src/IterationState.h14
-rw-r--r--NvCloth/src/PhaseConfig.cpp9
-rw-r--r--NvCloth/src/SwCloth.cpp21
-rw-r--r--NvCloth/src/SwClothData.cpp1
-rw-r--r--NvCloth/src/SwCollision.cpp137
-rw-r--r--NvCloth/src/SwFabric.cpp2
-rw-r--r--NvCloth/src/SwInterCollision.cpp36
-rw-r--r--NvCloth/src/SwSelfCollision.cpp1
-rw-r--r--NvCloth/src/SwSolverKernel.cpp11
-rw-r--r--NvCloth/src/TripletScheduler.cpp353
-rw-r--r--NvCloth/src/TripletScheduler.h27
-rw-r--r--NvCloth/src/cuda/CuCloth.h2
-rw-r--r--NvCloth/src/cuda/CuCollision.h7
-rw-r--r--NvCloth/src/cuda/CuFabric.cpp3
-rw-r--r--NvCloth/src/cuda/CuFactory.cpp7
-rw-r--r--NvCloth/src/cuda/CuSolverKernelBlob.h0
-rw-r--r--NvCloth/src/dx/DxCloth.cpp6
-rw-r--r--NvCloth/src/dx/DxCloth.h10
-rw-r--r--NvCloth/src/dx/DxClothData.cpp2
-rw-r--r--NvCloth/src/dx/DxClothData.h4
-rw-r--r--NvCloth/src/dx/DxFabric.cpp41
-rw-r--r--NvCloth/src/dx/DxFactory.cpp22
-rw-r--r--NvCloth/src/dx/DxFactory.h12
-rw-r--r--NvCloth/src/dx/DxSolver.cpp21
-rw-r--r--NvCloth/src/dx/DxSolverKernel.hlsl331
-rw-r--r--NvCloth/src/scalar/SwCollisionHelpers.h5
-rw-r--r--NvCloth/src/sse2/SwCollisionHelpers.h22
29 files changed, 768 insertions, 361 deletions
diff --git a/NvCloth/src/ClothBase.h b/NvCloth/src/ClothBase.h
index ec1ee40..faca6c9 100644
--- a/NvCloth/src/ClothBase.h
+++ b/NvCloth/src/ClothBase.h
@@ -64,6 +64,7 @@ void initialize(Cloth& cloth, const physx::PxVec4* pIt, const physx::PxVec4* pEn
cloth.mCurrentMotion = physx::PxTransform(physx::PxIdentity);
cloth.mLinearVelocity = physx::PxVec3(0.0f);
cloth.mAngularVelocity = physx::PxVec3(0.0f);
+ cloth.mIgnoreVelocityDiscontinuityNextFrame = false;
cloth.mPrevIterDt = 0.0f;
cloth.mIterDtAvg = MovingAverage(30);
cloth.mTetherConstraintLogStiffness = float(-FLT_MAX_EXP);
diff --git a/NvCloth/src/ClothImpl.h b/NvCloth/src/ClothImpl.h
index 24f7732..9bc4c36 100644
--- a/NvCloth/src/ClothImpl.h
+++ b/NvCloth/src/ClothImpl.h
@@ -33,6 +33,8 @@
#include "NvCloth/Fabric.h"
#include <foundation/PxVec4.h>
#include <foundation/PxVec3.h>
+#include "IndexPair.h"
+#include "MovingAverage.h"
#include <PsMathUtils.h>
#include <cmath>
@@ -77,6 +79,9 @@ class ClothImpl : public Cloth
virtual void clearInertia();
virtual void teleport(const physx::PxVec3& delta);
+ virtual void teleportToLocation(const physx::PxVec3& translation, const physx::PxQuat& rotation);
+
+ virtual void ignoreVelocityDiscontinuity();
virtual float getPreviousIterationDt() const;
virtual void setGravity(const physx::PxVec3& gravity);
@@ -210,6 +215,7 @@ public: //Fields shared between all cloth classes
physx::PxTransform mCurrentMotion;
physx::PxVec3 mLinearVelocity;
physx::PxVec3 mAngularVelocity;
+ bool mIgnoreVelocityDiscontinuityNextFrame;
float mPrevIterDt;
MovingAverage mIterDtAvg;
@@ -292,6 +298,21 @@ inline void ClothImpl<T>::teleport(const physx::PxVec3& delta)
}
template <typename T>
+inline void ClothImpl<T>::teleportToLocation(const physx::PxVec3& translation, const physx::PxQuat& rotation)
+{
+ mCurrentMotion.p = translation;
+ mCurrentMotion.q = rotation;
+ mTargetMotion.p = translation;
+ mTargetMotion.q = rotation;
+}
+
+template <typename T>
+inline void ClothImpl<T>::ignoreVelocityDiscontinuity()
+{
+ mIgnoreVelocityDiscontinuityNextFrame = true;
+}
+
+template <typename T>
inline float ClothImpl<T>::getPreviousIterationDt() const
{
return mPrevIterDt;
diff --git a/NvCloth/src/IterationState.h b/NvCloth/src/IterationState.h
index e18b636..be046b5 100644
--- a/NvCloth/src/IterationState.h
+++ b/NvCloth/src/IterationState.h
@@ -183,11 +183,15 @@ cloth::IterationStateFactory::IterationStateFactory(MyCloth& cloth, float frameD
mPrevLinearVelocity = cloth.mLinearVelocity;
mPrevAngularVelocity = cloth.mAngularVelocity;
- // update cloth
- float invFrameDt = 1.0f / frameDt;
- cloth.mLinearVelocity = invFrameDt * (cloth.mTargetMotion.p - cloth.mCurrentMotion.p);
- physx::PxQuat dq = cloth.mTargetMotion.q * cloth.mCurrentMotion.q.getConjugate();
- cloth.mAngularVelocity = log(dq) * invFrameDt;
+ if(!cloth.mIgnoreVelocityDiscontinuityNextFrame)
+ {
+ // update cloth
+ float invFrameDt = 1.0f / frameDt;
+ cloth.mLinearVelocity = invFrameDt * (cloth.mTargetMotion.p - cloth.mCurrentMotion.p);
+ physx::PxQuat dq = cloth.mTargetMotion.q * cloth.mCurrentMotion.q.getConjugate();
+ cloth.mAngularVelocity = log(dq) * invFrameDt;
+ }
+ cloth.mIgnoreVelocityDiscontinuityNextFrame = false;
cloth.mPrevIterDt = mIterDt;
cloth.mIterDtAvg.push(static_cast<uint32_t>(mNumIterations), mIterDt);
diff --git a/NvCloth/src/PhaseConfig.cpp b/NvCloth/src/PhaseConfig.cpp
index 7ea97f3..b5db3b0 100644
--- a/NvCloth/src/PhaseConfig.cpp
+++ b/NvCloth/src/PhaseConfig.cpp
@@ -30,6 +30,7 @@
#include "NvCloth/PhaseConfig.h"
#include "PsMathUtils.h"
#include <algorithm>
+#include "ClothImpl.h"
using namespace physx;
@@ -43,14 +44,6 @@ PhaseConfig transform(const PhaseConfig&);
using namespace nv;
-namespace
-{
-float safeLog2(float x)
-{
- float saturated = std::max(0.0f, std::min(x, 1.0f));
- return saturated ? shdfnd::log2(saturated) : -FLT_MAX_EXP;
-}
-}
// convert from user input to solver format
cloth::PhaseConfig cloth::transform(const PhaseConfig& config)
diff --git a/NvCloth/src/SwCloth.cpp b/NvCloth/src/SwCloth.cpp
index 6077ada..46a7784 100644
--- a/NvCloth/src/SwCloth.cpp
+++ b/NvCloth/src/SwCloth.cpp
@@ -269,28 +269,13 @@ void SwCloth::setVirtualParticles(Range<const uint32_t[4]> indices, Range<const
// shuffle indices to form independent SIMD sets
uint16_t numParticles = uint16_t(mCurParticles.size());
- TripletScheduler scheduler(indices);
+ TripletScheduler scheduler(indices); //the TripletScheduler makes a copy so indices is not modified
scheduler.simd(numParticles, 4);
- // convert indices to byte offset
- Vec4us dummy(numParticles, uint16_t(numParticles + 1), uint16_t(numParticles + 2), 0);
- Vector<uint32_t>::Type::ConstIterator sIt = scheduler.mSetSizes.begin();
- Vector<uint32_t>::Type::ConstIterator sEnd = scheduler.mSetSizes.end();
- TripletScheduler::ConstTripletIter tIt = scheduler.mTriplets.begin(), tLast;
- mVirtualParticleIndices.resize(0);
- mVirtualParticleIndices.reserve(indices.size() + 3 * uint32_t(sEnd - sIt));
- for (; sIt != sEnd; ++sIt)
- {
- uint32_t setSize = *sIt;
- for (tLast = tIt + setSize; tIt != tLast; ++tIt, ++mNumVirtualParticles)
- mVirtualParticleIndices.pushBack(Vec4us(*tIt));
- mVirtualParticleIndices.resize((mVirtualParticleIndices.size() + 3) & ~3, dummy);
- }
- Vector<Vec4us>::Type(mVirtualParticleIndices.begin(), mVirtualParticleIndices.end())
- .swap(mVirtualParticleIndices);
+ mVirtualParticleIndices.swap(scheduler.mPaddedTriplets);
// precompute 1/dot(w,w)
- Vector<PxVec4>::Type().swap(mVirtualParticleWeights);
+ Vector<PxVec4>::Type().swap(mVirtualParticleWeights); //clear and trim
mVirtualParticleWeights.reserve(weights.size());
for (; !weights.empty(); weights.popFront())
{
diff --git a/NvCloth/src/SwClothData.cpp b/NvCloth/src/SwClothData.cpp
index f102bde..84d85d4 100644
--- a/NvCloth/src/SwClothData.cpp
+++ b/NvCloth/src/SwClothData.cpp
@@ -73,6 +73,7 @@ cloth::SwClothData::SwClothData(SwCloth& cloth, const SwFabric& fabric)
mTethers = fabric.mTethers.begin();
mNumTethers = uint32_t(fabric.mTethers.size());
+ //1-(1 - stiffness)^stiffnessExponent
mTetherConstraintStiffness = 1.0f - expf(stiffnessExponent * cloth.mTetherConstraintLogStiffness);
mTetherConstraintScale = cloth.mTetherConstraintScale * fabric.mTetherLengthScale;
diff --git a/NvCloth/src/SwCollision.cpp b/NvCloth/src/SwCollision.cpp
index 0aa196d..bd6cf7a 100644
--- a/NvCloth/src/SwCollision.cpp
+++ b/NvCloth/src/SwCollision.cpp
@@ -167,7 +167,7 @@ void generateCones(cloth::ConeData* dst, const cloth::SphereData* sourceSpheres,
PxVec4 center = (second + first) * 0.5f;
PxVec4 axis = (second - first) * 0.5f; //half axis
- //axiw.w = half of radii difference
+ //axis.w = half of radii difference
// |Axis|^2
float sqrAxisHalfLength = axis.x * axis.x + axis.y * axis.y + axis.z * axis.z;
@@ -426,8 +426,8 @@ void cloth::SwCollision<T4f>::buildSphereAcceleration(const SphereData* sIt)
T4f radius = splat<3>(sphere);
//calculate the first and last cell index, for each axis, that contains the sphere
- T4i first = intFloor(max((sphere - radius) * mGridScale + mGridBias, gSimd4fZero));
- T4i last = intFloor(min((sphere + radius) * mGridScale + mGridBias, sGridLength));
+ T4i first = intFloor(min(max((sphere - radius) * mGridScale + mGridBias, gSimd4fZero), sGridLength)); //use both min and max to deal with bad grid scales
+ T4i last = intFloor(min(max((sphere + radius) * mGridScale + mGridBias, gSimd4fZero), sGridLength));
const int* firstIdx = array(first);
const int* lastIdx = array(last);
@@ -555,40 +555,47 @@ operator &= (const ShapeMask& right)
return *this;
}
+// get collision shape masks from a single cell from a single axis of the acceleration structure
template <typename T4f>
FORCE_INLINE typename cloth::SwCollision<T4f>::ShapeMask
cloth::SwCollision<T4f>::getShapeMask(const T4f& position, const T4i* __restrict sphereGrid,
const T4i* __restrict coneGrid)
{
+ // position are the grid positions along a single axis (x, y, or z)
Gather<T4i> gather(intFloor(position));
+ // get the bitmask indicating which cones/spheres overlap the grid cell
ShapeMask result;
result.mCones = gather(coneGrid);
result.mSpheres = gather(sphereGrid);
return result;
}
-// lookup acceleration structure and return mask of potential intersectors
+// lookup acceleration structure and return mask of potential intersection collision shapes
template <typename T4f>
FORCE_INLINE typename cloth::SwCollision<T4f>::ShapeMask
cloth::SwCollision<T4f>::getShapeMask(const T4f* __restrict positions) const
{
+ // positions are the particle positions
T4f posX = positions[0] * splat<0>(mGridScale) + splat<0>(mGridBias);
T4f posY = positions[1] * splat<1>(mGridScale) + splat<1>(mGridBias);
T4f posZ = positions[2] * splat<2>(mGridScale) + splat<2>(mGridBias);
- ShapeMask result = getShapeMask(posX, mSphereGrid, mConeGrid);
- result &= getShapeMask(posY, mSphereGrid + 2, mConeGrid + 2);
- result &= getShapeMask(posZ, mSphereGrid + 4, mConeGrid + 4);
+ // AND together the bit masks so only the cones/spheres remain
+ // that overlap with the particle posision on all axis
+ ShapeMask result = getShapeMask(posX, mSphereGrid, mConeGrid); // X
+ result &= getShapeMask(posY, mSphereGrid + 2, mConeGrid + 2); // Y
+ result &= getShapeMask(posZ, mSphereGrid + 4, mConeGrid + 4); // Z
return result;
}
-// lookup acceleration structure and return mask of potential intersectors
+// lookup acceleration structure and return mask of potential intersection collision shapes for CCD
template <typename T4f>
FORCE_INLINE typename cloth::SwCollision<T4f>::ShapeMask
cloth::SwCollision<T4f>::getShapeMask(const T4f* __restrict prevPos, const T4f* __restrict curPos) const
{
+ // same as getShapeMask(const T4f* __restrict positions) but for continuous collision detection
T4f scaleX = splat<0>(mGridScale);
T4f scaleY = splat<1>(mGridScale);
T4f scaleZ = splat<2>(mGridScale);
@@ -605,22 +612,24 @@ cloth::SwCollision<T4f>::getShapeMask(const T4f* __restrict prevPos, const T4f*
T4f curY = curPos[1] * scaleY + biasY;
T4f curZ = curPos[2] * scaleZ + biasZ;
+ // get maximum extent corner of the AABB containing both prevPos and curPos
T4f maxX = min(max(prevX, curX), sGridLength);
T4f maxY = min(max(prevY, curY), sGridLength);
T4f maxZ = min(max(prevZ, curZ), sGridLength);
- ShapeMask result = getShapeMask(maxX, mSphereGrid, mConeGrid);
- result &= getShapeMask(maxY, mSphereGrid + 2, mConeGrid + 2);
- result &= getShapeMask(maxZ, mSphereGrid + 4, mConeGrid + 4);
+ ShapeMask result = getShapeMask(maxX, mSphereGrid, mConeGrid); // X
+ result &= getShapeMask(maxY, mSphereGrid + 2, mConeGrid + 2); // Y
+ result &= getShapeMask(maxZ, mSphereGrid + 4, mConeGrid + 4); // Z
+ // get min extent corner of the AABB containing both prevPos and curPos
T4f zero = gSimd4fZero;
T4f minX = max(min(prevX, curX), zero);
T4f minY = max(min(prevY, curY), zero);
T4f minZ = max(min(prevZ, curZ), zero);
- result &= getShapeMask(minX, mSphereGrid + 6, mConeGrid + 6);
- result &= getShapeMask(minY, mSphereGrid + 8, mConeGrid + 8);
- result &= getShapeMask(minZ, mSphereGrid + 10, mConeGrid + 10);
+ result &= getShapeMask(minX, mSphereGrid + 6, mConeGrid + 6); // X
+ result &= getShapeMask(minY, mSphereGrid + 8, mConeGrid + 8); // Y
+ result &= getShapeMask(minZ, mSphereGrid + 10, mConeGrid + 10); // Z
return result;
}
@@ -770,7 +779,7 @@ cloth::SwCollision<T4f>::collideCones(const T4f* __restrict positions, ImpulseAc
T4f axisZ = splat<2>(axis);
T4f slope = splat<3>(axis);
- // project delta onto axis
+ // distance along cone axis (from center)
T4f dot = deltaX * axisX + deltaY * axisY + deltaZ * axisZ;
// interpolate radius
T4f radius = dot * slope + splat<3>(center);
@@ -901,7 +910,7 @@ FORCE_INLINE void cloth::SwCollision<T4f>::collideSpheres(const T4i& sphereMask,
T4f dotCurCur = sqrDistance - curRadius * curRadius;
T4f discriminant = dotPrevCur * dotPrevCur - dotCurCur * dotPrevPrev;
- T4f sqrtD = sqrt(discriminant);
+ T4f sqrtD = sqrt(discriminant); //we get -nan if there are no roots
T4f halfB = dotPrevCur - dotPrevPrev;
T4f minusA = dotPrevCur - dotCurCur + halfB;
@@ -914,7 +923,7 @@ FORCE_INLINE void cloth::SwCollision<T4f>::collideSpheres(const T4i& sphereMask,
T4f rMin = prevRadius + halfB * minusA * (curRadius - prevRadius);
collisionMask = collisionMask & (discriminant > minusA * rMin * rMin * sSkeletonWidth);
- // a is negative when one sphere is contained in the other,
+ // a is negative when one relative sphere is contained in the other,
// which is already handled by discrete collision.
collisionMask = collisionMask & (minusA < -static_cast<T4f>(gSimd4fEpsilon));
@@ -931,10 +940,15 @@ FORCE_INLINE void cloth::SwCollision<T4f>::collideSpheres(const T4i& sphereMask,
T4f minusK = sqrtD * recip(minusA * oneMinusToi) & (oneMinusToi > gSimd4fEpsilon);
oneMinusToi = oneMinusToi * recip(gSimd4fOne - minusK);
+
+ //Move curXYZ to toi point
curX = curX + deltaX * oneMinusToi;
curY = curY + deltaY * oneMinusToi;
curZ = curZ + deltaZ * oneMinusToi;
+ //curXYZ is now touching the sphere at toi
+ //Note that curXYZ is also relative to the sphere center at toi
+ //We assume that the point sticks to the sphere until the end of the frame
curPos[0] = splat<0>(curSphere) + curX;
curPos[1] = splat<1>(curSphere) + curY;
curPos[2] = splat<2>(curSphere) + curZ;
@@ -1000,7 +1014,7 @@ cloth::SwCollision<T4f>::collideCones(const T4f* __restrict prevPos, T4f* __rest
T4f prevT = prevY * prevAxisZ - prevZ * prevAxisY;
T4f prevU = prevZ * prevAxisX - prevX * prevAxisZ;
T4f prevV = prevX * prevAxisY - prevY * prevAxisX;
- T4f prevDot = prevX * prevAxisX + prevY * prevAxisY + prevZ * prevAxisZ;
+ T4f prevDot = prevX * prevAxisX + prevY * prevAxisY + prevZ * prevAxisZ; //distance along the axis
T4f prevRadius = prevDot * prevSlope + splat<3>(prevCenter);
T4f curCenter = loadAligned(curCenterPtr, offset);
@@ -1014,18 +1028,22 @@ cloth::SwCollision<T4f>::collideCones(const T4f* __restrict prevPos, T4f* __rest
T4f curX = curPos[0] - splat<0>(curCenter);
T4f curY = curPos[1] - splat<1>(curCenter);
T4f curZ = curPos[2] - splat<2>(curCenter);
+ //curTUV = cross(curXYZ, curAxisXYZ)
T4f curT = curY * curAxisZ - curZ * curAxisY;
T4f curU = curZ * curAxisX - curX * curAxisZ;
T4f curV = curX * curAxisY - curY * curAxisX;
T4f curDot = curX * curAxisX + curY * curAxisY + curZ * curAxisZ;
T4f curRadius = curDot * curSlope + splat<3>(curCenter);
+ //Magnitude of cross product gives area of parallelogram |curAxisXYZ|*parallelogramHeight, |curAxisXYZ|=1
+ //parallelogramHeight is distance between the axis and the point
T4f curSqrDistance = gSimd4fEpsilon + curT * curT + curU * curU + curV * curV;
// set radius to zero if cone is culled
prevRadius = max(prevRadius, gSimd4fZero) & ~culled;
curRadius = max(curRadius, gSimd4fZero) & ~culled;
+ //Use quadratic equation to solve for time of impact (against infinite cone)
T4f dotPrevPrev = prevT * prevT + prevU * prevU + prevV * prevV - prevRadius * prevRadius;
T4f dotPrevCur = prevT * curT + prevU * curU + prevV * curV - prevRadius * curRadius;
T4f dotCurCur = curSqrDistance - curRadius * curRadius;
@@ -1060,6 +1078,7 @@ cloth::SwCollision<T4f>::collideCones(const T4f* __restrict prevPos, T4f* __rest
T4f posY = prevY - deltaY * toi;
T4f posZ = prevZ - deltaZ * toi;
+ // axisHalfLength
T4f curScaledAxis = curAxis * splat<1>(simd4f(curAuxiliary));
T4i prevAuxiliary = loadAligned(prevAuxiliaryPtr, offset);
T4f deltaScaledAxis = curScaledAxis - prevAxis * splat<1>(simd4f(prevAuxiliary));
@@ -1074,19 +1093,26 @@ cloth::SwCollision<T4f>::collideCones(const T4f* __restrict prevPos, T4f* __rest
T4f sqrHalfLength = axisX * axisX + axisY * axisY + axisZ * axisZ;
T4f invHalfLength = rsqrt(sqrHalfLength);
+ // distance along toi cone axis (from center)
T4f dot = (posX * axisX + posY * axisY + posZ * axisZ) * invHalfLength;
+ //point line distance
T4f sqrDistance = posX * posX + posY * posY + posZ * posZ - dot * dot;
T4f invDistance = rsqrt(sqrDistance) & (sqrDistance > gSimd4fZero);
+ //offset base to take slope in to account
T4f base = dot + slope * sqrDistance * invDistance;
T4f scale = base * invHalfLength & collisionMask;
+ // use invHalfLength to map base from [-HalfLength, +HalfLength]=inside to [-1, +1] =inside
+ // test if any impact position is in cone section
T4f cullMask = (abs(scale) < gSimd4fOne) & collisionMask;
// test if any impact position is in cone section
if (!allEqual(cullMask, gSimd4fZero))
{
+ //calculate unnormalized normal delta?
+ //delta = prev - cur - (prevAxis - curScaledAxis)*scale
deltaX = deltaX + splat<0>(deltaScaledAxis) * scale;
deltaY = deltaY + splat<1>(deltaScaledAxis) * scale;
deltaZ = deltaZ + splat<2>(deltaScaledAxis) * scale;
@@ -1099,15 +1125,19 @@ cloth::SwCollision<T4f>::collideCones(const T4f* __restrict prevPos, T4f* __rest
T4f minusK = sqrtD * recip(minusA * oneMinusToi) & (oneMinusToi > gSimd4fEpsilon);
oneMinusToi = oneMinusToi * recip(gSimd4fOne - minusK);
- curX = curX + deltaX * oneMinusToi;
+ //curX = cur + (prev - cur - (prevAxis - curScaledAxis)*scale)*(1-toi)
+ //curX = cur + (prev - cur)*(1-toi) - ((prevAxis - curScaledAxis)*scale)*(1-toi)
+ curX = curX + deltaX * oneMinusToi;
curY = curY + deltaY * oneMinusToi;
curZ = curZ + deltaZ * oneMinusToi;
+ //save adjusted values for discrete collision detection below
curDot = curX * curAxisX + curY * curAxisY + curZ * curAxisZ;
curRadius = curDot * curSlope + splat<3>(curCenter);
curRadius = max(curRadius, gSimd4fZero) & ~culled;
curSqrDistance = curX * curX + curY * curY + curZ * curZ - curDot * curDot;
+ //take offset from toi and add it to the cone center at the end
curPos[0] = splat<0>(curCenter) + curX;
curPos[1] = splat<1>(curCenter) + curY;
curPos[2] = splat<2>(curCenter) + curZ;
@@ -1210,7 +1240,7 @@ PX_INLINE void calculateFrictionImpulse(const T4f& deltaX, const T4f& deltaY, co
T4f nz = deltaZ * rcpDelta;
// calculate relative velocity
- // velXYZ is scaled by one over the number of collisions since all collisions accumulate into
+ // velXYZ is scaled by one over the number of collisions (scale) since all collisions accumulate into
// that variable during collision detection
T4f rvx = curPos[0] - prevPos[0] - velX * scale;
T4f rvy = curPos[1] - prevPos[1] - velY * scale;
@@ -1227,7 +1257,7 @@ PX_INLINE void calculateFrictionImpulse(const T4f& deltaX, const T4f& deltaY, co
// calculate magnitude of relative tangential velocity
T4f rcpVt = rsqrt(rvtx * rvtx + rvty * rvty + rvtz * rvtz + gSimd4fEpsilon);
- // magnitude of friction impulse (cannot be greater than -vt)
+ // magnitude of friction impulse (cannot be greater than -rvt)
T4f j = max(-coefficient * deltaSq * rcpDelta * rcpVt, gSimd4fMinusOne) & mask;
impulse[0] = rvtx * j;
@@ -1244,7 +1274,7 @@ void cloth::SwCollision<T4f>::collideParticles()
const T4f massScale = simd4f(mClothData.mCollisionMassScale);
const bool frictionEnabled = mClothData.mFrictionScale > 0.0f;
- const T4f frictionScale = simd4f(mClothData.mFrictionScale); //[arameter set by user
+ const T4f frictionScale = simd4f(mClothData.mFrictionScale); //Parameter set by user
T4f curPos[4];
T4f prevPos[4];
@@ -1439,21 +1469,22 @@ void cloth::SwCollision<T4f>::collideVirtualParticles()
transpose(frictionImpulse[0], frictionImpulse[1], frictionImpulse[2], frictionImpulse[3]);
- q0v0 = q0v0 - (splat<0>(rw0) * frictionImpulse[0]);
- q0v1 = q0v1 - (splat<1>(rw0) * frictionImpulse[0]);
- q0v2 = q0v2 - (splat<2>(rw0) * frictionImpulse[0]);
-
- q1v0 = q1v0 - (splat<0>(rw1) * frictionImpulse[1]);
- q1v1 = q1v1 - (splat<1>(rw1) * frictionImpulse[1]);
- q1v2 = q1v2 - (splat<2>(rw1) * frictionImpulse[1]);
-
- q2v0 = q2v0 - (splat<0>(rw2) * frictionImpulse[2]);
- q2v1 = q2v1 - (splat<1>(rw2) * frictionImpulse[2]);
- q2v2 = q2v2 - (splat<2>(rw2) * frictionImpulse[2]);
-
- q3v0 = q3v0 - (splat<0>(rw3) * frictionImpulse[3]);
- q3v1 = q3v1 - (splat<1>(rw3) * frictionImpulse[3]);
- q3v2 = q3v2 - (splat<2>(rw3) * frictionImpulse[3]);
+ // (weight * frictionImpulse ) & mask away if particle is static
+ q0v0 = q0v0 - ((splat<0>(rw0) * frictionImpulse[0]) & ~(splat<3>(p0v0) == gSimd4fZero));
+ q0v1 = q0v1 - ((splat<1>(rw0) * frictionImpulse[0]) & ~(splat<3>(p0v0) == gSimd4fZero));
+ q0v2 = q0v2 - ((splat<2>(rw0) * frictionImpulse[0]) & ~(splat<3>(p0v0) == gSimd4fZero));
+
+ q1v0 = q1v0 - ((splat<0>(rw1) * frictionImpulse[1]) & ~(splat<3>(p0v0) == gSimd4fZero));
+ q1v1 = q1v1 - ((splat<1>(rw1) * frictionImpulse[1]) & ~(splat<3>(p0v0) == gSimd4fZero));
+ q1v2 = q1v2 - ((splat<2>(rw1) * frictionImpulse[1]) & ~(splat<3>(p0v0) == gSimd4fZero));
+
+ q2v0 = q2v0 - ((splat<0>(rw2) * frictionImpulse[2]) & ~(splat<3>(p0v0) == gSimd4fZero));
+ q2v1 = q2v1 - ((splat<1>(rw2) * frictionImpulse[2]) & ~(splat<3>(p0v0) == gSimd4fZero));
+ q2v2 = q2v2 - ((splat<2>(rw2) * frictionImpulse[2]) & ~(splat<3>(p0v0) == gSimd4fZero));
+
+ q3v0 = q3v0 - ((splat<0>(rw3) * frictionImpulse[3]) & ~(splat<3>(p0v0) == gSimd4fZero));
+ q3v1 = q3v1 - ((splat<1>(rw3) * frictionImpulse[3]) & ~(splat<3>(p0v0) == gSimd4fZero));
+ q3v2 = q3v2 - ((splat<2>(rw3) * frictionImpulse[3]) & ~(splat<3>(p0v0) == gSimd4fZero));
// write back prev particles
storeAligned(prevParticles, vpIt[0] * sizeof(PxVec4), q0v0);
@@ -1487,6 +1518,7 @@ void cloth::SwCollision<T4f>::collideVirtualParticles()
T4f s2 = gSimd4fOne + splat<2>(weightScale) * (w2 & splat<2>(mask));
T4f s3 = gSimd4fOne + splat<3>(weightScale) * (w3 & splat<3>(mask));
+ //we don't care if particles are static here since we are only scaling (zero for static) mass
p0v0 = p0v0 * (gSimd4fOneXYZ | (splat<0>(s0) & sMaskW));
p0v1 = p0v1 * (gSimd4fOneXYZ | (splat<1>(s0) & sMaskW));
p0v2 = p0v2 * (gSimd4fOneXYZ | (splat<2>(s0) & sMaskW));
@@ -1504,21 +1536,22 @@ void cloth::SwCollision<T4f>::collideVirtualParticles()
p3v2 = p3v2 * (gSimd4fOneXYZ | (splat<2>(s3) & sMaskW));
}
- p0v0 = p0v0 + (splat<0>(rw0) * d0);
- p0v1 = p0v1 + (splat<1>(rw0) * d0);
- p0v2 = p0v2 + (splat<2>(rw0) * d0);
-
- p1v0 = p1v0 + (splat<0>(rw1) * d1);
- p1v1 = p1v1 + (splat<1>(rw1) * d1);
- p1v2 = p1v2 + (splat<2>(rw1) * d1);
-
- p2v0 = p2v0 + (splat<0>(rw2) * d2);
- p2v1 = p2v1 + (splat<1>(rw2) * d2);
- p2v2 = p2v2 + (splat<2>(rw2) * d2);
-
- p3v0 = p3v0 + (splat<0>(rw3) * d3);
- p3v1 = p3v1 + (splat<1>(rw3) * d3);
- p3v2 = p3v2 + (splat<2>(rw3) * d3);
+ // (weight * delta) & mask away if particle is static
+ p0v0 = p0v0 + ((splat<0>(rw0) * d0) & ~(splat<3>(p0v0) == gSimd4fZero));
+ p0v1 = p0v1 + ((splat<1>(rw0) * d0) & ~(splat<3>(p0v1) == gSimd4fZero));
+ p0v2 = p0v2 + ((splat<2>(rw0) * d0) & ~(splat<3>(p0v2) == gSimd4fZero));
+
+ p1v0 = p1v0 + ((splat<0>(rw1) * d1) & ~(splat<3>(p1v0) == gSimd4fZero));
+ p1v1 = p1v1 + ((splat<1>(rw1) * d1) & ~(splat<3>(p1v1) == gSimd4fZero));
+ p1v2 = p1v2 + ((splat<2>(rw1) * d1) & ~(splat<3>(p1v2) == gSimd4fZero));
+
+ p2v0 = p2v0 + ((splat<0>(rw2) * d2) & ~(splat<3>(p2v0) == gSimd4fZero));
+ p2v1 = p2v1 + ((splat<1>(rw2) * d2) & ~(splat<3>(p2v1) == gSimd4fZero));
+ p2v2 = p2v2 + ((splat<2>(rw2) * d2) & ~(splat<3>(p2v2) == gSimd4fZero));
+
+ p3v0 = p3v0 + ((splat<0>(rw3) * d3) & ~(splat<3>(p3v0) == gSimd4fZero));
+ p3v1 = p3v1 + ((splat<1>(rw3) * d3) & ~(splat<3>(p3v1) == gSimd4fZero));
+ p3v2 = p3v2 + ((splat<2>(rw3) * d3) & ~(splat<3>(p3v2) == gSimd4fZero));
// write back particles
storeAligned(particles, vpIt[0] * sizeof(PxVec4), p0v0);
diff --git a/NvCloth/src/SwFabric.cpp b/NvCloth/src/SwFabric.cpp
index 9830398..b8d617f 100644
--- a/NvCloth/src/SwFabric.cpp
+++ b/NvCloth/src/SwFabric.cpp
@@ -162,7 +162,7 @@ uint32_t cloth::SwFabric::getNumStiffnessValues() const
uint32_t cloth::SwFabric::getNumSets() const
{
- return uint32_t(mSets.size() - 1);
+ return uint32_t(physx::PxMax(0, static_cast<int>(mSets.size() - 1)));
}
uint32_t cloth::SwFabric::getNumIndices() const
diff --git a/NvCloth/src/SwInterCollision.cpp b/NvCloth/src/SwInterCollision.cpp
index 50be414..bc46ea6 100644
--- a/NvCloth/src/SwInterCollision.cpp
+++ b/NvCloth/src/SwInterCollision.cpp
@@ -49,6 +49,7 @@ const Simd4fTupleFactory sMaskW = simd4f(simd4i(0, 0, 0, ~0));
const Simd4fScalarFactory sEpsilon = simd4f(FLT_EPSILON);
const Simd4fTupleFactory sZeroW = simd4f(-FLT_MAX, -FLT_MAX, -FLT_MAX, 0.0f);
+// Same as radixSort from SwSelfCollision.cpp but with uint32_t instead of uint16_t
// returns sorted indices, output needs to be at least 2*(last - first) + 1024
void radixSort(const uint32_t* first, const uint32_t* last, uint32_t* out)
{
@@ -226,13 +227,15 @@ uint32_t calculatePotentialColliders(const cloth::SwInterCollisionData* cBegin,
uint32_t* sortedIndices = static_cast<uint32_t*>(allocator.allocate(numCloths * sizeof(uint32_t)));
+ // fill clothBounds, sortedIndices, and calculate totalClothBounds in world space
for (uint32_t i = 0; i < numCloths; ++i)
{
const SwInterCollisionData& c = cBegin[i];
- // transform bounds from b local space to local space of a
+ // grow bounds with the collision distance colDist
PxBounds3 lcBounds = PxBounds3::centerExtents(c.mBoundsCenter, c.mBoundsHalfExtent + PxVec3(array(colDist)[0]));
NV_CLOTH_ASSERT(!lcBounds.isEmpty());
+ // transform bounds to world space
PxBounds3 cWorld = PxBounds3::transformFast(c.mGlobalPose,lcBounds);
BoundingBox cBounds = { simd4f(cWorld.minimum.x, cWorld.minimum.y, cWorld.minimum.z, 0.0f),
@@ -244,9 +247,11 @@ uint32_t calculatePotentialColliders(const cloth::SwInterCollisionData* cBegin,
totalClothBounds = expandBounds(totalClothBounds, cBounds);
}
- // sort indices by their minimum extent on the longest axis
+ // The sweep axis is the longest extent of totalClothBounds
+ // 0 = x axis, 1 = y axis, etc. so that vectors can be indexed using v[sweepAxis]
const uint32_t sweepAxis = longestAxis(totalClothBounds.mUpper - totalClothBounds.mLower);
+ // sort indices by their minimum extent on the sweep axis
ClothSorter<T4f> predicate(clothBounds, numCloths, sweepAxis);
shdfnd::sort(sortedIndices, numCloths, predicate, nv::cloth::NonTrackingAllocator());
@@ -270,10 +275,10 @@ uint32_t calculatePotentialColliders(const cloth::SwInterCollisionData* cBegin,
uint32_t overlapMask = 0;
uint32_t numOverlaps = 0;
- // scan back to find first intersecting bounding box
- uint32_t startIndex = i;
- while (startIndex > 0 && array(clothBounds[sortedIndices[startIndex]].mUpper)[sweepAxis] > axisMin)
- --startIndex;
+ // scan forward to skip non intersecting bounds
+ uint32_t startIndex = 0;
+ while(startIndex < numCloths && array(clothBounds[sortedIndices[startIndex]].mUpper)[sweepAxis] < axisMin)
+ startIndex++;
// compute all overlapping bounds
for (uint32_t j = startIndex; j < numCloths; ++j)
@@ -361,7 +366,7 @@ uint32_t calculatePotentialColliders(const cloth::SwInterCollisionData* cBegin,
// output each particle only once
++numParticles;
- break;
+ break; // the particle only has to be inside one of the bounds, it doesn't matter if they are in more than one
}
}
}
@@ -401,7 +406,7 @@ void cloth::SwInterCollision<T4f>::operator()()
// world bounds of particles
BoundingBox<T4f> bounds = emptyBounds<T4f>();
- // calculate potentially colliding set
+ // calculate potentially colliding set (based on bounding boxes)
{
NV_CLOTH_PROFILE_ZONE("cloth::SwInterCollision::BroadPhase", /*ProfileContext::None*/ 0);
@@ -415,6 +420,8 @@ void cloth::SwInterCollision<T4f>::operator()()
{
NV_CLOTH_PROFILE_ZONE("cloth::SwInterCollision::Collide", /*ProfileContext::None*/ 0);
+ //Note: this code is almost the same as cloth::SwSelfCollision<T4f>::operator()
+
T4f lowerBound = bounds.mLower;
T4f edgeLength = max(bounds.mUpper - lowerBound, sEpsilon);
@@ -423,11 +430,12 @@ void cloth::SwInterCollision<T4f>::operator()()
uint32_t hashAxis0 = (sweepAxis + 1) % 3;
uint32_t hashAxis1 = (sweepAxis + 2) % 3;
- // reserve 0, 127, and 65535 for sentinel
+ // reserve 0, 255, and 65535 for sentinel
T4f cellSize = max(mCollisionDistance, simd4f(1.0f / 253) * edgeLength);
array(cellSize)[sweepAxis] = array(edgeLength)[sweepAxis] / 65533;
T4f one = gSimd4fOne;
+ // +1 for sentinel 0 offset
T4f gridSize = simd4f(254.0f);
array(gridSize)[sweepAxis] = 65534.0f;
@@ -564,6 +572,7 @@ size_t cloth::SwInterCollision<T4f>::getBufferSize(uint32_t numParticles)
template <typename T4f>
void cloth::SwInterCollision<T4f>::collideParticle(uint32_t index)
{
+ // The other particle is passed through member variables (mParticle)
uint16_t clothIndex = mClothIndices[index];
if ((1 << clothIndex) & ~mClothMask)
@@ -574,6 +583,8 @@ void cloth::SwInterCollision<T4f>::collideParticle(uint32_t index)
uint32_t particleIndex = mParticleIndices[index];
T4f& particle = reinterpret_cast<T4f&>(instance->mParticles[particleIndex]);
+
+ //very similar to cloth::SwSelfCollision<T4f>::collideParticles
T4f diff = particle - mParticle;
T4f distSqr = dot3(diff, diff);
@@ -609,6 +620,8 @@ void cloth::SwInterCollision<T4f>::collideParticles(const uint32_t* keys, uint32
const uint32_t* indices, uint32_t numParticles,
uint32_t collisionDistance)
{
+ //very similar to cloth::SwSelfCollision<T4f>::collideParticles
+
const uint32_t bucketMask = uint16_t(-1);
const uint32_t keyOffsets[] = { 0, 0x00010000, 0x00ff0000, 0x01000000, 0x01010000 };
@@ -639,8 +652,9 @@ void cloth::SwInterCollision<T4f>::collideParticles(const uint32_t* keys, uint32
++kIt;
kLast[k] = kIt;
- // jump forward once to second column
- kIt = keys + firstColumnSize;
+ // jump forward once to second column to go from cell offset 1 to 2 quickly
+ if(firstColumnSize)
+ kIt = keys + firstColumnSize;
firstColumnSize = 0;
}
}
diff --git a/NvCloth/src/SwSelfCollision.cpp b/NvCloth/src/SwSelfCollision.cpp
index ec5a166..f39217b 100644
--- a/NvCloth/src/SwSelfCollision.cpp
+++ b/NvCloth/src/SwSelfCollision.cpp
@@ -45,6 +45,7 @@ namespace
// returns sorted indices, output needs to be at least 2*(last - first) + 1024
void radixSort(const uint32_t* first, const uint32_t* last, uint16_t* out)
{
+ // Note: This function is almost exactly duplicated in SwInterCollision.cpp
// this sort uses a radix (bin) size of 256, requiring 4 bins to sort the 32 bit keys
uint16_t n = uint16_t(last - first);
diff --git a/NvCloth/src/SwSolverKernel.cpp b/NvCloth/src/SwSolverKernel.cpp
index 2181b1e..ab612e2 100644
--- a/NvCloth/src/SwSolverKernel.cpp
+++ b/NvCloth/src/SwSolverKernel.cpp
@@ -276,6 +276,8 @@ void constrainSeparation(T4f* __restrict curIt, const T4f* __restrict curEnd, co
}
}
+
+
/**
traditional gauss-seidel internal constraint solver
*/
@@ -350,7 +352,7 @@ void solveConstraints(float* __restrict posIt, const float* __restrict rIt, cons
erij = erij - multiplier * max(compressionLimit, min(erij, stretchLimit));
}
- //ex = er * stiffness / sqrt(epsilon + vw)
+ //ex = er * stiffness / (epsilon + inMass sum)
T4f exij = erij * stij * recip(gSimd4fEpsilon + vwij);
//h = h * ex
@@ -375,7 +377,7 @@ void solveConstraints(float* __restrict posIt, const float* __restrict rIt, cons
#include "sse2/SwSolveConstraints.h"
#endif
-// calculates upper bound of all position deltas
+// Calculates upper bound of all position deltas
template <typename T4f>
T4f calculateMaxDelta(const T4f* prevIt, const T4f* curIt, const T4f* curEnd)
{
@@ -391,6 +393,11 @@ void applyWind(T4f* __restrict curIt, const T4f* __restrict prevIt, const uint16
const uint16_t* __restrict tEnd, float itrDtf, float dragCoefficientf, float liftCoefficientf, float fluidDensityf, T4f wind,
const T4f (&rotation)[3])
{
+ // Note: Enabling wind can amplify bad behavior since the impulse scales with area,
+ // and the area of triangles increases when constraints are violated.
+ // Using the initial triangle area based on the rest length is one possible way to
+ // prevent this, but is expensive (and incorrect for intentionally stretchy cloth).
+
const T4f dragCoefficient = simd4f(dragCoefficientf);
const T4f liftCoefficient = simd4f(liftCoefficientf);
const T4f fluidDensity = simd4f(fluidDensityf);
diff --git a/NvCloth/src/TripletScheduler.cpp b/NvCloth/src/TripletScheduler.cpp
index 999be0e..0116200 100644
--- a/NvCloth/src/TripletScheduler.cpp
+++ b/NvCloth/src/TripletScheduler.cpp
@@ -39,45 +39,132 @@ cloth::TripletScheduler::TripletScheduler(Range<const uint32_t[4]> triplets)
{
}
-// SSE version
+// helper functions for TripletScheduler::simd()
+namespace
+{
+ //linear search should be fine for small particlesInBatchSize values
+ bool isIndexInBatch(uint32_t* particlesInBatch, int particlesInBatchSize, uint32_t index1, uint32_t index2, uint32_t index3)
+ {
+ for(int i = 0; i < particlesInBatchSize; i++)
+ {
+ if(particlesInBatch[i] == 0xffffffff)
+ return false;
+ if(index1 == particlesInBatch[i] || index2 == particlesInBatch[i] || index3 == particlesInBatch[i])
+ return true;
+ }
+ return false;
+ }
+
+ void markIndicesInBatch(uint32_t* particlesInBatch, int particlesInBatchSize, uint32_t index1, uint32_t index2, uint32_t index3)
+ {
+ for(int i = 0; i < particlesInBatchSize - 2; i++)
+ {
+ if(particlesInBatch[i] == 0xffffffff)
+ {
+ NV_CLOTH_ASSERT(i + 2 < particlesInBatchSize);
+ particlesInBatch[i] = index1;
+ particlesInBatch[i + 1] = index2;
+ particlesInBatch[i + 2] = index3;
+ break;
+ }
+ }
+ }
+}
+
+/*
+Group triplets in batches of simdWith so that they can be processed in parallel without referencing the same particle.
+Not suitable for simdWidth with large values due to linear search.
+Note that sets can be larger that simdWidth, and that particles may be referenced from the same set multiple times,
+ as long as they are not referenced more than once within a batch of simdWidth. The batch resets at the start of
+ each set.
+*/
void cloth::TripletScheduler::simd(uint32_t numParticles, uint32_t simdWidth)
{
- if (mTriplets.empty())
+ PX_UNUSED(numParticles);
+ if(mTriplets.empty())
return;
- Vector<uint32_t>::Type mark(numParticles, uint32_t(-1));
+ const int particlesInBatchSize = simdWidth * 3;
+ uint32_t* particlesInBatch = new uint32_t[particlesInBatchSize]; //used to mark used indices in current batch
+ int setSize = 0; //number of triplets in current set
+ int amountOfPaddingNeeded = 0;
- uint32_t setIndex = 0, setSize = 0;
- for (TripletIter tIt = mTriplets.begin(), tEnd = mTriplets.end(); tIt != tEnd; ++setIndex)
+ for(TripletIter tIt = mTriplets.begin(), tEnd = mTriplets.end(); tIt != tEnd; tIt++)
{
- TripletIter tLast = tIt + std::min(simdWidth, uint32_t(tEnd - tIt));
- TripletIter tSwap = tEnd;
-
- for (; tIt != tLast && tIt != tSwap; ++tIt, ++setSize)
+ if(setSize%simdWidth == 0)
{
- // swap from tail until independent triplet found
- while ((mark[tIt->x] == setIndex || mark[tIt->y] == setIndex || mark[tIt->z] == setIndex) && tIt != --tSwap)
- std::iter_swap(tIt, tSwap);
+ //reset particlesInBatch if we filled the simd width
+ memset(particlesInBatch, 0xff, sizeof(uint32_t)*particlesInBatchSize);
+ }
- if (tIt == tSwap)
- break; // no independent triplet found
+ TripletIter tSwap = tIt;
- // mark vertices to be used in simdIndex
- mark[tIt->x] = setIndex;
- mark[tIt->y] = setIndex;
- mark[tIt->z] = setIndex;
+ //Look for the next triplet that doesn't share indices with current batch
+ while(isIndexInBatch(particlesInBatch, particlesInBatchSize, tSwap->x, tSwap->y, tSwap->z))
+ {
+ tSwap++;
+ if(tSwap == tEnd)
+ break;
}
- if (tIt == tSwap) // remaining triplets depend on current set
+ if(tSwap == tEnd)
{
- if (setSize > simdWidth) // trim set to multiple of simdWidth
- {
- uint32_t overflow = setSize % simdWidth;
- setSize -= overflow;
- tIt -= overflow;
- }
+ //all remaining triplets are share indices with the current batch
+
+ //This doesn't make sense, as it will just introduce an extra set with setSize<simdWidth
+ //Should double check though
+ // // trim set to multiple of simdWidth with the hope that the trimmed triples
+ // // will fit in the next set with a multiple of simdWidth size
+ // if(setSize > (int)simdWidth)
+ // {
+ // uint32_t overflow = setSize % simdWidth;
+ // setSize -= overflow;
+ // tIt -= overflow;
+ // }
+
+ //finish set
mSetSizes.pushBack(setSize);
+ amountOfPaddingNeeded += (static_cast<int>(simdWidth) - (setSize % static_cast<int>(simdWidth))) % static_cast<int>(simdWidth); //last modulo avoids adding padding when setSize%simdWidth==0
setSize = 0;
+ tIt--; //start next set with current triplet
+ }
+ else
+ {
+ //add triplet to set
+ markIndicesInBatch(particlesInBatch, particlesInBatchSize, tSwap->x, tSwap->y, tSwap->z);
+ std::iter_swap(tIt, tSwap); //Place triplet at the end of the current set, so that they in sequence in mTriplets
+ setSize++;
+ }
+ }
+ if(setSize)
+ {
+ //finish last set, if we have one
+ mSetSizes.pushBack(setSize);
+ amountOfPaddingNeeded += (static_cast<int>(simdWidth) - (setSize % static_cast<int>(simdWidth)))% static_cast<int>(simdWidth);
+ }
+
+ // Padding code used to live in SwCloth::setVirtualParticles, now we do it here directly
+ mPaddedTriplets.reserve(mTriplets.size() + amountOfPaddingNeeded);
+
+ {
+ Vec4us paddingDummy(static_cast<uint16_t>(numParticles), static_cast<uint16_t>(numParticles) + 1, static_cast<uint16_t>(numParticles) + 2, 0);
+
+ TripletIter tIt = mTriplets.begin();
+ //TripletIter tEnd = mTriplets.end();
+ SetIter setIt = mSetSizes.begin();
+ SetIter setEnd = mSetSizes.end();
+ for(; setIt < setEnd; ++setIt)
+ {
+ for(unsigned int i = 0; i < *setIt; i++)
+ {
+ mPaddedTriplets.pushBack(*tIt);
+ ++tIt;
+ }
+ unsigned int padding = (static_cast<int>(simdWidth) - (*setIt % static_cast<int>(simdWidth))) % static_cast<int>(simdWidth);
+ for(unsigned int i = 0; i < padding; i++)
+ {
+ mPaddedTriplets.pushBack(paddingDummy);
+ }
}
}
}
@@ -95,8 +182,10 @@ struct TripletSet
}
uint32_t mMark; // triplet index
- uint8_t mNumReplays[3];
- uint8_t mNumConflicts[3][32];
+
+ // [3] because each particle is fetched on a different instruction.
+ uint8_t mNumReplays[3]; //how many times the instruction needs to be executed to resolve all bank conflicts (= maxElement(mNumConflicts[3][0...31]))
+ uint8_t mNumConflicts[3][32]; //Number of accesses for each of the 32 banks (if it is 1 then there is no conflict on that bank)
};
/*
@@ -132,83 +221,147 @@ void prefixSum(Iter first, Iter last, Iter dest)
*dest = *(dest - 1) + *first;
}
}
-}
-// CUDA version
-void cloth::TripletScheduler::warp(uint32_t numParticles, uint32_t warpWidth)
+class AdjacencyQuerier
{
- // NV_CLOTH_ASSERT(warpWidth == 32 || warpWidth == 16);
+private:
+ typedef nv::cloth::Vector<uint32_t>::Type UintVector;
+ typedef nv::cloth::Vector<uint32_t>::Type::Iterator UintVectorIterator;
+ typedef nv::cloth::Vector<nv::cloth::Vec4u>::Type::ConstIterator ConstTripletIter;
+ UintVector mAdjacencyIndecies;
+ UintVector mAdjacencies;
+ uint32_t mMaxAdjacentCount;
+
+public:
+ AdjacencyQuerier(nv::cloth::Vector<nv::cloth::Vec4u>::Type const& triplets, int particleCount)
+ {
+ {
+ UintVector& adjacencyCount = mAdjacencyIndecies; //use same memory as mAdjacencyIndecies
+ adjacencyCount.resize(particleCount+1, uint32_t(0)); //initialize count to 0. Size+1 used later for prefixSum/indices
- if (mTriplets.empty())
- return;
+ // count the number of triplets per particle
+ for(ConstTripletIter tIt = triplets.begin(); tIt != triplets.end(); ++tIt)
+ {
+ for(int i = 0; i < 3; i++) //for each index in the triplet
+ {
+ const uint32_t particleIndex = (*tIt)[i];
+ adjacencyCount[particleIndex] += 1;
+ }
+ }
+ //adjacentcyCount[i] now holds the number of triplets referring to particle i
- TripletIter tIt, tEnd = mTriplets.end();
- uint32_t tripletIndex;
+ mMaxAdjacentCount = *physx::shdfnd::maxElement(adjacencyCount.begin(), adjacencyCount.end());
- // count number of triplets per particle
- Vector<uint32_t>::Type adjacentCount(numParticles + 1, uint32_t(0));
- for (tIt = mTriplets.begin(); tIt != tEnd; ++tIt)
- for (int i = 0; i < 3; ++i)
- ++adjacentCount[(*tIt)[i]];
+ // compute in place prefix sum (inclusive)
+ prefixSum(adjacencyCount.begin(), adjacencyCount.end(), mAdjacencyIndecies.begin());
+ // we use mAdjacencyIndecies from now on
+ }
- /* neither of those were really improving number of batches:
- // run simd version to pre-sort particles
- simd(numParticles, blockWidth); mSetSizes.resize(0);
- // sort according to triplet degree (estimated by sum of adjacentCount)
- std::sort(mTriplets.begin(), tEnd, GreaterSum(adjacentCount));
- */
+ mAdjacencies.resize(mAdjacencyIndecies.back());
+ uint32_t tripletIndex = 0;
+ for(ConstTripletIter tIt = triplets.begin(); tIt != triplets.end(); ++tIt, ++tripletIndex)
+ {
+ for(int i = 0; i < 3; i++) //for each index in the triplet
+ {
+ const uint32_t particleIndex = (*tIt)[i];
- uint32_t maxTripletCount = *shdfnd::maxElement(adjacentCount.begin(), adjacentCount.end());
+ //Decrement mAdjacencyIndecies here to convert it from inclusive to exclusive prefix sum
+ const uint32_t adjacencyIndex = --mAdjacencyIndecies[particleIndex];
- // compute in place prefix sum (inclusive)
- prefixSum(adjacentCount.begin(), adjacentCount.end(), adjacentCount.begin());
+ mAdjacencies[adjacencyIndex] = tripletIndex;
+ }
+ }
+ //mAdjacencies[mAdjacencyIndecies[i]] now stores the first triplet index referring to particle i
+ //mAdjacencies[mAdjacencyIndecies[i+1]] stores one past the last
+ }
- // initialize adjacencies (for each particle, collect touching triplets)
- // also converts partial sum in adjacentCount from inclusive to exclusive
- Vector<uint32_t>::Type adjacencies(adjacentCount.back());
- for (tIt = mTriplets.begin(), tripletIndex = 0; tIt != tEnd; ++tIt, ++tripletIndex)
- for (int i = 0; i < 3; ++i)
- adjacencies[--adjacentCount[(*tIt)[i]]] = tripletIndex;
+ uint32_t getMaxAdjacentCount() const
+ {
+ return mMaxAdjacentCount;
+ }
+
+ nv::cloth::Range<const uint32_t> getAdjacentTriplets(uint32_t particleIndex) const
+ {
+ //use .begin() + offset to get around bounds check
+ auto begin = &*(mAdjacencies.begin() + mAdjacencyIndecies[particleIndex]);
+ auto end = &*(mAdjacencies.begin() + mAdjacencyIndecies[particleIndex + 1]);
+ return nv::cloth::Range<const uint32_t>(begin, end);
+ }
+};
+
+}
+
+// CUDA version. Doesn't add padding, optimizes for bank conflicts
+void cloth::TripletScheduler::warp(uint32_t numParticles, uint32_t warpWidth)
+{
+ // warpWidth has to be <= 32 and a power of two
+ NV_CLOTH_ASSERT(warpWidth == 32 || warpWidth == 16);
+
+ if (mTriplets.empty())
+ return;
+
+ AdjacencyQuerier adjacencyQuerier(mTriplets, numParticles);
uint32_t warpMask = warpWidth - 1;
- uint32_t numSets = maxTripletCount; // start with minimum number of sets
+ uint32_t numSets = adjacencyQuerier.getMaxAdjacentCount(); // start with minimum number of sets
Vector<TripletSet>::Type sets(numSets);
+
+ // maps triplets to the set index that contains the triplet (setIndex = setIndices[tripletIndex])
Vector<uint32_t>::Type setIndices(mTriplets.size(), uint32_t(-1));
+
mSetSizes.resize(numSets);
+ uint32_t tripletIndex = 0;
+ TripletIter tIt, tEnd = mTriplets.end();
+
// color triplets (assign to sets)
- Vector<uint32_t>::Type::ConstIterator aBegin = adjacencies.begin(), aIt, aEnd;
- for (tIt = mTriplets.begin(), tripletIndex = 0; tIt != tEnd; ++tIt, ++tripletIndex)
+ for (tIt = mTriplets.begin(); tIt != tEnd; ++tIt, ++tripletIndex)
{
- // mark sets of adjacent triplets
+ // mark sets with adjacent triplets
for (int i = 0; i < 3; ++i)
{
uint32_t particleIndex = (*tIt)[i];
- aIt = aBegin + adjacentCount[particleIndex];
- aEnd = aBegin + adjacentCount[particleIndex + 1];
- for (uint32_t setIndex; aIt != aEnd; ++aIt)
- if (numSets > (setIndex = setIndices[*aIt]))
+ Range<const uint32_t> adjacentTriplets = adjacencyQuerier.getAdjacentTriplets(particleIndex);
+
+ //for all triplets adjacent to (sharing) particle 'particleIndex'
+ for (; adjacentTriplets.begin() != adjacentTriplets.end(); adjacentTriplets.popFront())
+ {
+ uint32_t setIndex;
+ if(numSets > (setIndex = setIndices[adjacentTriplets.front()]))
+ {
+ //the set 'setIndex' has an adjacent triplet in it (sharing at least one vertex with tripletIndex)
sets[setIndex].mMark = tripletIndex;
+ }
+ }
}
- // find valid set with smallest number of bank conflicts
+ // find valid set with smallest number of additional replays when adding triplet tIt
uint32_t bestIndex = numSets;
- uint32_t minReplays = 4;
- for (uint32_t setIndex = 0; setIndex < numSets && minReplays; ++setIndex)
+ uint32_t minAdditionalReplays = 4; // one more than the maximum possible (for 3 particles)
+ for (uint32_t setIndex = 0; setIndex < numSets && minAdditionalReplays; ++setIndex)
{
const TripletSet& set = sets[setIndex];
if (set.mMark == tripletIndex)
- continue; // triplet collision
+ continue; // triplet collision (this set contains an adjacent triplet)
+ // count additional replays caused by inserting triplet 'tIt' to in set 'setIndex'
uint32_t numReplays = 0;
- for (uint32_t i = 0; i < 3; ++i)
- numReplays += set.mNumReplays[i] == set.mNumConflicts[i][warpMask & (*tIt)[i]];
+ for(uint32_t i = 0; i < 3; ++i) //for each particle in the triplet
+ {
+ const uint32_t particleIndex = (*tIt)[i];
+ const uint32_t bank = warpMask & particleIndex;
- if (minReplays > numReplays)
+ // an additional replay will only be caused if the additional bank conflict is
+ // on a bank that is the current bottleneck
+ numReplays += set.mNumReplays[i] == set.mNumConflicts[i][bank];
+ }
+
+ // hold on to this set if it has less additional replays compared to our current best set
+ if (minAdditionalReplays > numReplays)
{
- minReplays = numReplays;
+ minAdditionalReplays = numReplays;
bestIndex = setIndex;
}
}
@@ -221,18 +374,26 @@ void cloth::TripletScheduler::warp(uint32_t numParticles, uint32_t warpWidth)
++numSets;
}
- // increment bank conflicts or reset if warp filled
+ // increment mNumReplays or reset if warp filled
TripletSet& set = sets[bestIndex];
if (++mSetSizes[bestIndex] & warpMask)
+ {
for (uint32_t i = 0; i < 3; ++i)
- set.mNumReplays[i] = std::max(set.mNumReplays[i], ++set.mNumConflicts[i][warpMask & (*tIt)[i]]);
+ {
+ const uint32_t particleIndex = (*tIt)[i];
+ const uint32_t bank = warpMask & particleIndex;
+ set.mNumReplays[i] = std::max(set.mNumReplays[i], ++set.mNumConflicts[i][bank]);
+ }
+ }
else
+ {
set = TripletSet();
+ }
setIndices[tripletIndex] = bestIndex;
}
- // reorder triplets
+ // reorder triplets so that the sets are in continuous memory
Vector<uint32_t>::Type setOffsets(mSetSizes.size());
prefixSum(mSetSizes.begin(), mSetSizes.end(), setOffsets.begin());
@@ -243,3 +404,47 @@ void cloth::TripletScheduler::warp(uint32_t numParticles, uint32_t warpWidth)
mTriplets.swap(triplets);
}
+
+
+/* Interface used for unit tests */
+cloth::TripletSchedulerTestInterface::TripletSchedulerTestInterface(Range<const uint32_t[4]> triplets)
+ :
+ mScheduler(new cloth::TripletScheduler(triplets))
+{}
+
+cloth::TripletSchedulerTestInterface::~TripletSchedulerTestInterface()
+{
+ delete mScheduler;
+}
+void cloth::TripletSchedulerTestInterface::simd(uint32_t numParticles, uint32_t simdWidth)
+{
+ mScheduler->simd(numParticles, simdWidth);
+}
+void cloth::TripletSchedulerTestInterface::warp(uint32_t numParticles, uint32_t warpWidth)
+{
+ mScheduler->warp(numParticles, warpWidth);
+}
+
+cloth::Range<const uint32_t> cloth::TripletSchedulerTestInterface::GetTriplets()
+{
+ return Range<const uint32_t>(
+ reinterpret_cast<uint32_t*>(mScheduler->mTriplets.begin()),
+ reinterpret_cast<uint32_t*>(mScheduler->mTriplets.begin() + mScheduler->mTriplets.size())
+ );
+}
+cloth::Range<const uint32_t> cloth::TripletSchedulerTestInterface::GetSetSizes()
+{
+ return Range<const uint32_t>(
+ mScheduler->mSetSizes.begin(),
+ mScheduler->mSetSizes.begin() + mScheduler->mSetSizes.size()
+ );
+}
+
+NV_CLOTH_API(nv::cloth::TripletSchedulerTestInterface*) NvClothCreateTripletScheduler(nv::cloth::Range<const uint32_t[4]> triplets)
+{
+ return new nv::cloth::TripletSchedulerTestInterface(triplets);
+}
+NV_CLOTH_API(void) NvClothDestroyTripletScheduler(nv::cloth::TripletSchedulerTestInterface* interface)
+{
+ delete interface;
+}
diff --git a/NvCloth/src/TripletScheduler.h b/NvCloth/src/TripletScheduler.h
index 26119cf..81dc889 100644
--- a/NvCloth/src/TripletScheduler.h
+++ b/NvCloth/src/TripletScheduler.h
@@ -43,6 +43,7 @@ struct TripletScheduler
{
typedef Vector<Vec4u>::Type::ConstIterator ConstTripletIter;
typedef Vector<Vec4u>::Type::Iterator TripletIter;
+ typedef Vector<uint32_t>::Type::Iterator SetIter;
TripletScheduler(Range<const uint32_t[4]>);
void simd(uint32_t numParticles, uint32_t simdWidth);
@@ -50,6 +51,32 @@ struct TripletScheduler
Vector<Vec4u>::Type mTriplets;
Vector<uint32_t>::Type mSetSizes;
+ Vector<Vec4us>::Type mPaddedTriplets;
};
}
}
+
+//Make TripletScheduler available for the unit tests.
+namespace nv
+{
+namespace cloth
+{
+struct NV_CLOTH_IMPORT TripletSchedulerTestInterface
+{
+private:
+ TripletScheduler* mScheduler;
+
+public:
+ TripletSchedulerTestInterface(Range<const uint32_t[4]> triplets);
+ ~TripletSchedulerTestInterface();
+ void simd(uint32_t numParticles, uint32_t simdWidth);
+ void warp(uint32_t numParticles, uint32_t warpWidth);
+
+ Range<const uint32_t> GetTriplets();
+ Range<const uint32_t> GetSetSizes();
+};
+}
+}
+
+NV_CLOTH_API(nv::cloth::TripletSchedulerTestInterface*) NvClothCreateTripletScheduler(nv::cloth::Range<const uint32_t[4]>);
+NV_CLOTH_API(void) NvClothDestroyTripletScheduler(nv::cloth::TripletSchedulerTestInterface*);
diff --git a/NvCloth/src/cuda/CuCloth.h b/NvCloth/src/cuda/CuCloth.h
index 8dc3082..f7d8268 100644
--- a/NvCloth/src/cuda/CuCloth.h
+++ b/NvCloth/src/cuda/CuCloth.h
@@ -131,7 +131,7 @@ class CuCloth : protected CuContextLock, public ClothImpl<CuCloth>
uint32_t getSharedMemorySize() const; // without particle data
// expects transformed configs, doesn't call notifyChanged()
- void setPhaseConfigInternal(Range<const PhaseConfig>);
+ void setPhaseConfigInternal(Range<const PhaseConfig> configs);
Range<physx::PxVec4> push(CuConstraints&);
void clear(CuConstraints&);
diff --git a/NvCloth/src/cuda/CuCollision.h b/NvCloth/src/cuda/CuCollision.h
index aeb2bda..f9b69f7 100644
--- a/NvCloth/src/cuda/CuCollision.h
+++ b/NvCloth/src/cuda/CuCollision.h
@@ -1160,9 +1160,10 @@ __device__ void apply(ParticleDataT& particleData, const int4& indices, const fl
posX.y += delta.y * weights.x; posY.y += delta.y * weights.y; posZ.y += delta.y * weights.z;
posX.z += delta.z * weights.x; posY.z += delta.z * weights.y; posZ.z += delta.z * weights.z;
- particleData(indices.x) = posX;
- particleData(indices.y) = posY;
- particleData(indices.z) = posZ;
+ //if(particle is not static) store new position
+ if(particleData(indices.x,3)!= 0) particleData(indices.x) = posX;
+ if(particleData(indices.y,3)!= 0) particleData(indices.y) = posY;
+ if(particleData(indices.z,3)!= 0) particleData(indices.z) = posZ;
}
#else
template <typename PointerT>
diff --git a/NvCloth/src/cuda/CuFabric.cpp b/NvCloth/src/cuda/CuFabric.cpp
index 957f912..6794fa5 100644
--- a/NvCloth/src/cuda/CuFabric.cpp
+++ b/NvCloth/src/cuda/CuFabric.cpp
@@ -165,7 +165,8 @@ uint32_t cloth::CuFabric::getNumStiffnessValues() const
uint32_t cloth::CuFabric::getNumSets() const
{
- return uint32_t(mSets.size() - 1);
+ return uint32_t(physx::PxMax(0, (int)mSets.size() - 1));
+
}
uint32_t cloth::CuFabric::getNumIndices() const
diff --git a/NvCloth/src/cuda/CuFactory.cpp b/NvCloth/src/cuda/CuFactory.cpp
index 397262c..df52dcd 100644
--- a/NvCloth/src/cuda/CuFactory.cpp
+++ b/NvCloth/src/cuda/CuFactory.cpp
@@ -97,6 +97,7 @@ void cloth::checkSuccessImpl(CUresult err, const char* file, const int line)
ADD_CASE(CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES);
ADD_CASE(CUDA_ERROR_LAUNCH_TIMEOUT);
ADD_CASE(CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING);
+ ADD_CASE(CUDA_ERROR_ILLEGAL_ADDRESS);
default:
ADD_CASE(CUDA_ERROR_UNKNOWN);
#undef ADD_CASE
@@ -115,8 +116,9 @@ uint32_t getMaxThreadsPerBlock(CUcontext context)
CUdevice device;
checkSuccess(cuCtxGetDevice(&device));
- int major = 0;
+ int major = 0, minor = 0;
checkSuccess(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device));
+ checkSuccess(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device));
checkSuccess(cuCtxPopCurrent(nullptr));
@@ -149,7 +151,7 @@ cloth::Fabric* cloth::CuFactory::createFabric(uint32_t numParticles, Range<const
Range<const float> tetherLengths, Range<const uint32_t> triangles)
{
return NV_CLOTH_NEW(CuFabric)(*this, numParticles, phaseIndices, sets, restvalues, stiffnessValues, indices, anchors, tetherLengths, triangles,
- getNextFabricId());
+ getNextFabricId());
}
cloth::Cloth* cloth::CuFactory::createCloth(Range<const PxVec4> particles, Fabric& fabric)
@@ -210,6 +212,7 @@ void cloth::CuFactory::extractFabricData(const Fabric& fabric, Range<uint32_t> p
if (!sets.empty())
{
+ // need to skip copying the first element
NV_CLOTH_ASSERT(sets.size() == cuFabric.mSets.size() - 1);
const uint32_t* deviceSets = cuFabric.mSets.begin().get();
copyToHost(deviceSets + 1, deviceSets + cuFabric.mSets.size(), sets.begin());
diff --git a/NvCloth/src/cuda/CuSolverKernelBlob.h b/NvCloth/src/cuda/CuSolverKernelBlob.h
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/NvCloth/src/cuda/CuSolverKernelBlob.h
diff --git a/NvCloth/src/dx/DxCloth.cpp b/NvCloth/src/dx/DxCloth.cpp
index 68c3289..0b1521b 100644
--- a/NvCloth/src/dx/DxCloth.cpp
+++ b/NvCloth/src/dx/DxCloth.cpp
@@ -84,9 +84,9 @@ cloth::DxCloth::DxCloth(DxFactory& factory, DxFabric& fabric, Range<const PxVec4
, mTargetCollisionPlanes(mFactory.mCollisionPlanes)
, mStartCollisionTriangles(mFactory.mCollisionTriangles)
, mTargetCollisionTriangles(mFactory.mCollisionTriangles)
-, mVirtualParticleSetSizes(mFactory.mContextManager)
-, mVirtualParticleIndices(mFactory.mContextManager)
-, mVirtualParticleWeights(mFactory.mContextManager)
+, mVirtualParticleSetSizes(mFactory.mVirtualParticleSetSizes)
+, mVirtualParticleIndices(mFactory.mVirtualParticleIndices)
+, mVirtualParticleWeights(mFactory.mVirtualParticleWeights)
, mRestPositions(mFactory.mRestPositions)
, mSelfCollisionIndices(mFactory.mSelfCollisionIndices)
, mSelfCollisionParticles(mFactory.mSelfCollisionParticles)
diff --git a/NvCloth/src/dx/DxCloth.h b/NvCloth/src/dx/DxCloth.h
index 5e783fa..3a04734 100644
--- a/NvCloth/src/dx/DxCloth.h
+++ b/NvCloth/src/dx/DxCloth.h
@@ -91,11 +91,9 @@ class DxCloth : protected DxContextLock, public ClothImpl<DxCloth>
typedef DxFabric FabricType;
typedef DxContextLock ContextLockType;
- template <typename>
- struct MapTraits;
-
typedef DxVectorMap<DxBatchedVector<physx::PxVec3> > MappedVec3fVectorType;
typedef DxVectorMap<DxBatchedVector<physx::PxVec4> > MappedVec4fVectorType;
+ typedef DxVectorMap<DxBatchedVector<Vec4us> > MappedVec4usVectorType;
typedef DxVectorMap<DxBatchedVector<IndexPair> > MappedIndexVectorType;
typedef DxVectorMap<DxBatchedVector<uint32_t> > MappedMaskVectorType;
@@ -190,9 +188,9 @@ class DxCloth : protected DxContextLock, public ClothImpl<DxCloth>
float mFriction;
// virtual particles
- DxDeviceVector<uint32_t> mVirtualParticleSetSizes;
- DxDeviceVector<Vec4us> mVirtualParticleIndices;
- DxDeviceVector<physx::PxVec4> mVirtualParticleWeights;
+ DxBatchedVector<uint32_t> mVirtualParticleSetSizes;
+ DxBatchedVector<Vec4us> mVirtualParticleIndices;
+ DxBatchedVector<physx::PxVec4> mVirtualParticleWeights;
// self collision
float mSelfCollisionDistance;
diff --git a/NvCloth/src/dx/DxClothData.cpp b/NvCloth/src/dx/DxClothData.cpp
index 57049bf..562ce7a 100644
--- a/NvCloth/src/dx/DxClothData.cpp
+++ b/NvCloth/src/dx/DxClothData.cpp
@@ -71,6 +71,8 @@ cloth::DxClothData::DxClothData(DxCloth& cloth)
mNumCollisionTriangles = uint32_t(cloth.mStartCollisionTriangles.size()) / 3;
+ mNumVirtualParticleSetSizes = cloth.mVirtualParticleSetSizes.size();
+
mEnableContinuousCollision = cloth.mEnableContinuousCollision;
mCollisionMassScale = cloth.mCollisionMassScale;
mFrictionScale = cloth.mFriction;
diff --git a/NvCloth/src/dx/DxClothData.h b/NvCloth/src/dx/DxClothData.h
index 4da9be2..855a3c5 100644
--- a/NvCloth/src/dx/DxClothData.h
+++ b/NvCloth/src/dx/DxClothData.h
@@ -46,7 +46,7 @@ typedef unsigned int uint32_t;
typedef int int32_t;
#endif
-static const uint32_t MaxParticlesInSharedMem = 1971;
+static const uint32_t MaxParticlesInSharedMem = 1969;
struct DxPhaseConfig
@@ -124,6 +124,8 @@ struct DxClothData
uint32_t mNumCollisionTriangles;
+ uint32_t mNumVirtualParticleSetSizes;
+
uint32_t mEnableContinuousCollision; //bool stored in uint32_t for dx alignment
float mCollisionMassScale;
float mFrictionScale;
diff --git a/NvCloth/src/dx/DxFabric.cpp b/NvCloth/src/dx/DxFabric.cpp
index 56b2b0a..cf6865a 100644
--- a/NvCloth/src/dx/DxFabric.cpp
+++ b/NvCloth/src/dx/DxFabric.cpp
@@ -62,30 +62,33 @@ cloth::DxFabric::DxFabric(DxFactory& factory, uint32_t numParticles, Range<const
NV_CLOTH_ASSERT(restvalues.size() == stiffnessValues.size() || stiffnessValues.size() == 0);
NV_CLOTH_ASSERT(mNumParticles > *shdfnd::maxElement(indices.begin(), indices.end()));
- // manually convert uint32_t indices to uint16_t in temp memory
- Vector<DxConstraint>::Type hostConstraints;
- hostConstraints.resizeUninitialized(restvalues.size());
- Vector<DxConstraint>::Type::Iterator cIt = hostConstraints.begin();
- Vector<DxConstraint>::Type::Iterator cEnd = hostConstraints.end();
-
- const uint32_t* iIt = indices.begin();
- const float* rIt = restvalues.begin();
- for (; cIt != cEnd; ++cIt)
+ //constraints
{
- cIt->mRestvalue = *rIt++;
- cIt->mFirstIndex = uint16_t(*iIt++);
- cIt->mSecondIndex = uint16_t(*iIt++);
- }
-
- // copy to device vector in one go
+ // manually convert uint32_t indices to uint16_t in temp memory
+ Vector<DxConstraint>::Type hostConstraints;
+ hostConstraints.resizeUninitialized(restvalues.size());
+ Vector<DxConstraint>::Type::Iterator cIt = hostConstraints.begin();
+ Vector<DxConstraint>::Type::Iterator cEnd = hostConstraints.end();
+
+ const uint32_t* iIt = indices.begin();
+ const float* rIt = restvalues.begin();
+ for(; cIt != cEnd; ++cIt)
+ {
+ cIt->mRestvalue = *rIt++;
+ cIt->mFirstIndex = uint16_t(*iIt++);
+ cIt->mSecondIndex = uint16_t(*iIt++);
+ }
+
+ // copy to device vector in one go
#if 0
// Workaround for NvAPI SCG device updateSubresource size limit
- mConstraintsHostCopy.assign(hostConstraints.begin(), hostConstraints.end());
- mConstraints.resize(mConstraintsHostCopy.size());
- mConstraints = mConstraintsHostCopy;
+ mConstraintsHostCopy.assign(hostConstraints.begin(), hostConstraints.end());
+ mConstraints.resize(mConstraintsHostCopy.size());
+ mConstraints = mConstraintsHostCopy;
#else
- mConstraints.assign(hostConstraints.begin(), hostConstraints.end());
+ mConstraints.assign(hostConstraints.begin(), hostConstraints.end());
#endif
+ }
mStiffnessValues.assign(stiffnessValues.begin(), stiffnessValues.end());
diff --git a/NvCloth/src/dx/DxFactory.cpp b/NvCloth/src/dx/DxFactory.cpp
index 91f5125..7298ffe 100644
--- a/NvCloth/src/dx/DxFactory.cpp
+++ b/NvCloth/src/dx/DxFactory.cpp
@@ -96,6 +96,12 @@ typedef Vec4T<uint32_t> Vec4u;
, mCollisionPlanesDeviceCopy(mContextManager)
, mCollisionTriangles(mContextManager, DxStagingBufferPolicy())
, mCollisionTrianglesDeviceCopy(mContextManager)
+ , mVirtualParticleSetSizes(mContextManager, DxStagingBufferPolicy())
+ , mVirtualParticleSetSizesDeviceCopy(mContextManager)
+ , mVirtualParticleIndices(mContextManager, DxStagingBufferPolicy())
+ , mVirtualParticleIndicesDeviceCopy(mContextManager)
+ , mVirtualParticleWeights(mContextManager, DxStagingBufferPolicy())
+ , mVirtualParticleWeightsDeviceCopy(mContextManager)
, mMotionConstraints(mContextManager)
, mSeparationConstraints(mContextManager)
, mRestPositions(mContextManager, DxStagingBufferPolicy())
@@ -205,7 +211,7 @@ void cloth::DxFactory::extractFabricData(const Fabric& fabric, Range<uint32_t> p
if (!sets.empty())
{
- // need to skip copying the first element
+ // we don't skip first element here
NV_CLOTH_ASSERT(sets.size() == dxFabric.mSets.size());
memcpy(sets.begin(), dxFabric.mSets.begin(), sets.size() * sizeof(uint32_t));
}
@@ -390,8 +396,11 @@ void cloth::DxFactory::extractFabricData(const Fabric& fabric, Range<uint32_t> p
uint32_t numWeights = cloth.getNumVirtualParticleWeights();
Vector<PxVec4>::Type hostWeights(numWeights, PxVec4(0.0f));
- copyToHost(hostWeights.begin(), dxCloth.mVirtualParticleWeights.mBuffer.mBuffer, 0,
- hostWeights.size() * sizeof(PxVec4));
+ //copyToHost(hostWeights.begin(), dxCloth.mVirtualParticleWeights.mBuffer.mBuffer, 0,
+ // hostWeights.size() * sizeof(PxVec4));
+ NV_CLOTH_ASSERT(hostWeights.size() == dxCloth.mVirtualParticleWeights.size());
+ intrinsics::memCopy(hostWeights.begin(), DxCloth::MappedVec4fVectorType(const_cast<DxCloth&>(dxCloth).mVirtualParticleWeights).begin(),
+ destIndices.size() * sizeof(uint32_t));
// convert weights to Vec3f
PxVec3* destIt = reinterpret_cast<PxVec3*>(destWeights.begin());
@@ -408,8 +417,11 @@ void cloth::DxFactory::extractFabricData(const Fabric& fabric, Range<uint32_t> p
uint32_t numIndices = cloth.getNumVirtualParticles();
Vector<Vec4us>::Type hostIndices(numIndices);
- copyToHost(hostIndices.begin(), dxCloth.mVirtualParticleIndices.mBuffer.mBuffer, 0,
- hostIndices.size() * sizeof(Vec4us));
+ //copyToHost(hostIndices.begin(), dxCloth.mVirtualParticleIndices.mBuffer.mBuffer, 0,
+ // hostIndices.size() * sizeof(Vec4us));
+ NV_CLOTH_ASSERT(hostIndices.size() == dxCloth.mVirtualParticleIndices.size());
+ intrinsics::memCopy(hostIndices.begin(), DxCloth::MappedVec4usVectorType(const_cast<DxCloth&>(dxCloth).mVirtualParticleIndices).begin(),
+ destIndices.size() * sizeof(uint32_t));
// convert indices to 32 bit
Vec4u* destIt = reinterpret_cast<Vec4u*>(destIndices.begin());
diff --git a/NvCloth/src/dx/DxFactory.h b/NvCloth/src/dx/DxFactory.h
index 23d5382..d5f985f 100644
--- a/NvCloth/src/dx/DxFactory.h
+++ b/NvCloth/src/dx/DxFactory.h
@@ -35,6 +35,7 @@
#include "../IndexPair.h"
#include <foundation/PxVec4.h>
#include <foundation/PxVec3.h>
+#include "Vec4T.h"
#if _MSC_VER >= 1700
#pragma warning(disable : 4471)
@@ -81,8 +82,8 @@ protected:
virtual Cloth* clone(const Cloth& cloth);
- virtual void extractFabricData(const Fabric& fabric, Range<uint32_t> phaseIndices, Range<uint32_t> sets,
- Range<float> restvalues, Range<float> stiffnessValues, Range<uint32_t> indices, Range<uint32_t> anchors,
+ virtual void extractFabricData( const Fabric& fabric, Range<uint32_t> phaseIndices, Range<uint32_t> sets,
+ Range<float> restvalues, Range<float> stiffnessValues, Range<uint32_t> indices, Range<uint32_t> anchors,
Range<float> tetherLengths, Range<uint32_t> triangles) const;
virtual void extractCollisionData(const Cloth& cloth, Range<physx::PxVec4> spheres, Range<uint32_t> capsules,
@@ -145,6 +146,12 @@ protected:
DxBatchedStorage<physx::PxVec3> mCollisionTriangles;
DxBuffer<physx::PxVec3> mCollisionTrianglesDeviceCopy;
+ DxBatchedStorage<uint32_t> mVirtualParticleSetSizes;
+ DxBuffer<uint32_t> mVirtualParticleSetSizesDeviceCopy;
+ DxBatchedStorage<Vec4us> mVirtualParticleIndices;
+ DxBuffer<Vec4us> mVirtualParticleIndicesDeviceCopy;
+ DxBatchedStorage<physx::PxVec4> mVirtualParticleWeights;
+ DxBuffer<physx::PxVec4> mVirtualParticleWeightsDeviceCopy;
DxBatchedStorage<physx::PxVec4> mMotionConstraints;
DxBatchedStorage<physx::PxVec4> mSeparationConstraints;
@@ -157,6 +164,7 @@ protected:
DxBatchedStorage<uint32_t> mSelfCollisionData;
DxBatchedStorage<uint32_t> mTriangles;
+
};
}
}
diff --git a/NvCloth/src/dx/DxSolver.cpp b/NvCloth/src/dx/DxSolver.cpp
index 66a8d8f..21caa0b 100644
--- a/NvCloth/src/dx/DxSolver.cpp
+++ b/NvCloth/src/dx/DxSolver.cpp
@@ -350,6 +350,9 @@ void cloth::DxSolver::beginFrame()
mFactory.mConvexMasksDeviceCopy = mFactory.mConvexMasks.mBuffer;
mFactory.mCollisionPlanesDeviceCopy = mFactory.mCollisionPlanes.mBuffer;
mFactory.mCollisionTrianglesDeviceCopy = mFactory.mCollisionTriangles.mBuffer;
+ mFactory.mVirtualParticleSetSizesDeviceCopy = mFactory.mVirtualParticleSetSizes.mBuffer;
+ mFactory.mVirtualParticleIndicesDeviceCopy = mFactory.mVirtualParticleIndices.mBuffer;
+ mFactory.mVirtualParticleWeightsDeviceCopy = mFactory.mVirtualParticleWeights.mBuffer;
// mFactory.mParticleAccelerations = mFactory.mParticleAccelerationsHostCopy;
mFactory.mRestPositionsDeviceCopy = mFactory.mRestPositions.mBuffer;
}
@@ -369,7 +372,8 @@ void cloth::DxSolver::executeKernel()
{
context->CSSetShader(mFactory.mSolverKernelComputeShader, NULL, 0);
- ID3D11ShaderResourceView* resourceViews[18] = {
+ // Set shader StructuredBuffer registers t0-t20
+ ID3D11ShaderResourceView* resourceViews[21] = {
mClothData.mBuffer.resourceView(), /*mFrameData.mBuffer.resourceView()*/NULL,
mIterationData.mBuffer.resourceView(), mFactory.mPhaseConfigs.mBuffer.resourceView(),
mFactory.mConstraints.mBuffer.resourceView(), mFactory.mTethers.mBuffer.resourceView(),
@@ -382,10 +386,15 @@ void cloth::DxSolver::executeKernel()
mFactory.mRestPositionsDeviceCopy.resourceView(),
mFactory.mSelfCollisionIndices.mBuffer.resourceView(),
mFactory.mStiffnessValues.mBuffer.resourceView(),
- mFactory.mTriangles.mBuffer.resourceView()
+ mFactory.mTriangles.mBuffer.resourceView(),
+
+ mFactory.mVirtualParticleSetSizesDeviceCopy.resourceView(),
+ mFactory.mVirtualParticleIndicesDeviceCopy.resourceView(),
+ mFactory.mVirtualParticleWeightsDeviceCopy.resourceView()
};
- context->CSSetShaderResources(0, 18, resourceViews);
+ context->CSSetShaderResources(0, 21, resourceViews);
+ // Set shader RWStructuredBuffer registers u0-u4
ID3D11UnorderedAccessView* accessViews[4] = {
mFactory.mParticles.mBuffer.accessViewRaw(),
mFactory.mSelfCollisionParticles.mBuffer.accessView(),
@@ -398,10 +407,10 @@ void cloth::DxSolver::executeKernel()
context->CSSetShader(NULL, NULL, 0);
- ID3D11ShaderResourceView* resourceViewsNULL[17] = {
- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL
+ ID3D11ShaderResourceView* resourceViewsNULL[21] = {
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL
};
- context->CSSetShaderResources(0, 17, resourceViewsNULL);
+ context->CSSetShaderResources(0, 21, resourceViewsNULL);
ID3D11UnorderedAccessView* accessViewsNULL[4] = { NULL, NULL, NULL, NULL };
context->CSSetUnorderedAccessViews(0, 4, accessViewsNULL, NULL);
#if 0
diff --git a/NvCloth/src/dx/DxSolverKernel.hlsl b/NvCloth/src/dx/DxSolverKernel.hlsl
index 4d91f4b..9c02468 100644
--- a/NvCloth/src/dx/DxSolverKernel.hlsl
+++ b/NvCloth/src/dx/DxSolverKernel.hlsl
@@ -77,6 +77,10 @@ StructuredBuffer<float> bPerConstraintStiffness : register(t16);
//Note that the buffer actually contains uint16_t values
StructuredBuffer<uint32_t> bTriangles : register(t17);
+StructuredBuffer<int32_t> bVirtualParticleSetSizes : register(t18);
+StructuredBuffer<uint4> bVirtualParticleIndices : register(t19);
+StructuredBuffer<float4> bVirtualParticleWeights : register(t20);
+
groupshared DxClothData gClothData;
groupshared DxFrameData gFrameData;
groupshared DxIterationData gIterData;
@@ -864,7 +868,7 @@ void collideCapsules(IParticles curParticles, IParticles prevParticles, uint32_t
static const float gSkeletonWidth = (1.f - 0.2f) * (1.f - 0.2f) - 1.f;
-uint32_t collideCapsules(float3 curPos, float3 prevPos, float alpha, float prevAlpha, out float3 outDelta, out float3 outVelocity)
+uint32_t collideCapsules(float3 curPosAbsolute, float3 prevPosAbsolute, float alpha, float prevAlpha, out float3 outDelta, out float3 outVelocity)
{
outDelta = float3(0.0f, 0.0f, 0.0f);
outVelocity = float3(0.0f, 0.0f, 0.0f);
@@ -881,87 +885,87 @@ uint32_t collideCapsules(float3 curPos, float3 prevPos, float alpha, float prevA
{
IndexPair indices = bCapsuleIndices[capsuleOffset + i];
- // current
+ // load sphere data
float4 startSphere0 = bCollisionSpheres[startSphereOffset + indices.first];
float4 targetSphere0 = bCollisionSpheres[targetSphereOffset + indices.first];
float4 startSphere1 = bCollisionSpheres[startSphereOffset + indices.second];
float4 targetSphere1 = bCollisionSpheres[targetSphereOffset + indices.second];
- // prev
+ // interpolate sphere data to end of previous iteration
float4 prevSphere0 = lerp(startSphere0, targetSphere0, prevAlpha);
float4 prevSphere1 = lerp(startSphere1, targetSphere1, prevAlpha);
prevSphere0.w = max(prevSphere0.w, 0.0f);
prevSphere1.w = max(prevSphere1.w, 0.0f);
- float4 prevAxis = (prevSphere1 - prevSphere0) * 0.5f;
float3 prevConeCenter = (prevSphere1.xyz + prevSphere0.xyz) * 0.5f;
+ float4 prevHalfAxis = (prevSphere1 - prevSphere0) * 0.5f; //half axis
+ //axis.w = half of radii difference
- float3 prevDelta = prevPos - prevConeCenter;
+ float3 prevPos = prevPosAbsolute - prevConeCenter;
- float prevSqrAxisLength = dot(prevAxis.xyz, prevAxis.xyz);
- float prevSqrConeLength = prevSqrAxisLength - prevAxis.w * prevAxis.w;
+ float prevSqrAxisHalfLength = dot(prevHalfAxis.xyz, prevHalfAxis.xyz);
+ float prevSqrConeHalfLength = prevSqrAxisHalfLength - prevHalfAxis.w * prevHalfAxis.w;
- if (prevSqrAxisLength <= 0.0f)
+ if (prevSqrConeHalfLength <= 0.0f)
continue;
- float prevInvAxisLength = rsqrt(prevSqrAxisLength);
- float prevInvConeLength = rsqrt(prevSqrConeLength);
+ float prevInvAxisHalfLength = rsqrt(prevSqrAxisHalfLength);
+ float prevInvConeHalfLength = rsqrt(prevSqrConeHalfLength);
- float prevAxisLength = prevSqrAxisLength * prevInvAxisLength;
+ float prevAxisHalfLength = prevSqrAxisHalfLength * prevInvAxisHalfLength;
- float prevConeRadius = (prevAxis.w + prevSphere0.w) * prevInvConeLength * prevAxisLength;
+ float prevConeRadius = (prevHalfAxis.w + prevSphere0.w) * prevInvConeHalfLength * prevAxisHalfLength;
- float3 prevConeAxis = prevAxis.xyz * prevInvAxisLength;
- float prevConeSlope = prevAxis.w * prevInvConeLength;
+ float3 prevConeAxis = prevHalfAxis.xyz * prevInvAxisHalfLength;
+ float prevConeSlope = prevHalfAxis.w * prevInvConeHalfLength;
- float3 prevCross = cross(prevDelta, prevConeAxis);
+ float3 prevCross = cross(prevPos, prevConeAxis);
float prevDot = dot(prevPos, prevConeAxis);
- // current
- float4 sphere0 = lerp(startSphere0, targetSphere0, alpha);
- float4 sphere1 = lerp(startSphere1, targetSphere1, alpha);
+ // interpolate sphere data to end of current iteration
+ float4 curSphere0 = lerp(startSphere0, targetSphere0, alpha);
+ float4 curSphere1 = lerp(startSphere1, targetSphere1, alpha);
- sphere0.w = max(sphere0.w, 0.0f);
- sphere1.w = max(sphere1.w, 0.0f);
+ curSphere0.w = max(curSphere0.w, 0.0f);
+ curSphere1.w = max(curSphere1.w, 0.0f);
- float4 curAxis = (sphere1 - sphere0) * 0.5f;
- float3 curConeCenter = (sphere1.xyz + sphere0.xyz) * 0.5f;
+ float3 curConeCenter = (curSphere1.xyz + curSphere0.xyz) * 0.5f;
+ float4 curHalfAxis = (curSphere1 - curSphere0) * 0.5f; //half axis
+ //axis.w = half of radii difference
- float3 curDelta = curPos - curConeCenter;
+ float3 curPos = curPosAbsolute - curConeCenter;
- float sqrAxisLength = dot(curAxis.xyz, curAxis.xyz);
- float sqrConeLength = sqrAxisLength - curAxis.w * curAxis.w;
+ float curSqrAxisHalfLength = dot(curHalfAxis.xyz, curHalfAxis.xyz);
+ float curSqrConeHalfLength = curSqrAxisHalfLength - curHalfAxis.w * curHalfAxis.w;
- if (sqrConeLength <= 0.0f)
+ if(curSqrConeHalfLength <= 0.0f)
continue;
- float invAxisLength = rsqrt(sqrAxisLength);
- float invConeLength = rsqrt(sqrConeLength);
+ float curInvAxisHalfLength = rsqrt(curSqrAxisHalfLength);
+ float curInvConeHalfLength = rsqrt(curSqrConeHalfLength);
- float axisLength = sqrAxisLength * invAxisLength;
+ float curAxisHalfLength = curSqrAxisHalfLength * curInvAxisHalfLength;
- float3 coneCenter = (sphere1.xyz + sphere0.xyz) * 0.5f;
- float coneRadius = (curAxis.w + sphere0.w) * invConeLength * axisLength;
+ float curConeRadius = (curHalfAxis.w + curSphere0.w) * curInvConeHalfLength * curAxisHalfLength;
- float3 curConeAxis = curAxis.xyz * invAxisLength;
+ float3 curConeAxis = curHalfAxis.xyz * curInvAxisHalfLength;
- float3 curCross = cross(curDelta, curConeAxis);
+ float3 curCross = cross(curPos, curConeAxis);
float curDot = dot(curPos, curConeAxis);
float curSqrDistance = FLT_EPSILON + dot(curCross, curCross);
- float prevRadius = max(prevDot * prevConeSlope + coneRadius, 0.0f);
+ float prevRadius = max(prevDot * prevConeSlope + prevConeRadius, 0.0f);
- float curSlope = curAxis.w * invConeLength;
- float curRadius = max(curDot * curSlope + coneRadius, 0.0f);
+ float curConeSlope = curHalfAxis.w * curInvConeHalfLength;
+ float curRadius = max(curDot * curConeSlope + curConeRadius, 0.0f);
- float sine = curAxis.w * invAxisLength;
- float coneSqrCosine = 1.f - sine * sine;
- float curHalfLength = axisLength;
+ float sine = curHalfAxis.w * curInvAxisHalfLength;
+ float coneSqrCosine = 1.f - (sine * sine);
- float dotPrevPrev = dot(prevCross, prevCross) - prevCross.x * prevCross.x - prevRadius * prevRadius;
+ float dotPrevPrev = dot(prevCross, prevCross) - prevRadius * prevRadius;
float dotPrevCur = dot(prevCross, curCross) - prevRadius * curRadius;
float dotCurCur = curSqrDistance - curRadius * curRadius;
@@ -972,7 +976,7 @@ uint32_t collideCapsules(float3 curPos, float3 prevPos, float alpha, float prevA
// time of impact or 0 if prevPos inside cone
float toi = min(0.0f, halfB + sqrtD) / minusA;
- bool hasCollision = toi < 1.0f && halfB < sqrtD;
+ bool hasCollision = (toi < 1.0f) && (halfB < sqrtD);
// skip continuous collision if the (un-clamped) particle
// trajectory only touches the outer skin of the cone.
@@ -990,46 +994,29 @@ uint32_t collideCapsules(float3 curPos, float3 prevPos, float alpha, float prevA
// interpolate delta at toi
float3 pos = prevPos - delta * toi;
- // float axisLength = sqrAxisLength * invAxisLength;
-
- // float curHalfLength = axisLength; // ?
- float3 curConeAxis = curAxis.xyz * invAxisLength;
- float3 curScaledAxis = curAxis.xyz * curHalfLength;
-
- float4 prevAxis = (prevSphere1 - prevSphere0) * 0.5f;
- float3 prevConeCenter = (prevSphere1.xyz + prevSphere0.xyz) * 0.5f;
-
- float prevSqrAxisLength = dot(prevAxis.xyz, prevAxis.xyz);
- float prevSqrConeLength = prevSqrAxisLength - prevAxis.w * prevAxis.w;
-
- float prevInvAxisLength = rsqrt(prevSqrAxisLength);
- float prevInvConeLength = rsqrt(prevSqrConeLength);
-
- float prevAxisLength = prevSqrAxisLength * prevInvAxisLength;
- float prevConeRadius = (prevAxis.w + prevSphere0.w) * prevInvConeLength * prevAxisLength;
-
- float3 prevConeAxis = prevAxis.xyz * prevInvAxisLength;
-
- float prevHalfLength = prevAxisLength;
-
- float3 deltaScaledAxis = curScaledAxis - prevAxis.xyz * prevHalfLength;
+ float3 curScaledAxis = curConeAxis.xyz * curAxisHalfLength;
+ float3 deltaScaledAxis = curScaledAxis - prevConeAxis * prevAxisHalfLength;
float oneMinusToi = 1.0f - toi;
// interpolate axis at toi
float3 axis = curScaledAxis - deltaScaledAxis * oneMinusToi;
- float slope = prevConeSlope * oneMinusToi + curSlope * toi;
+ float slope = prevConeSlope * oneMinusToi + curConeSlope * toi;
- float sqrHalfLength = dot(axis, axis); // axisX * axisX + axisY * axisY + axisZ * axisZ;
+ float sqrHalfLength = dot(axis, axis);
float invHalfLength = rsqrt(sqrHalfLength);
+ // distance along toi cone axis (from center)
float dotf = dot(pos, axis) * invHalfLength;
+ //point line distance
float sqrDistance = dot(pos, pos) - dotf * dotf;
float invDistance = sqrDistance > 0.0f ? rsqrt(sqrDistance) : 0.0f;
+ //offset base to take slope in to account
float base = dotf + slope * sqrDistance * invDistance;
float scale = base * invHalfLength;
+ // use invHalfLength to map base from [-HalfLength, +HalfLength]=inside to [-1, +1] =inside
if (abs(scale) < 1.0f)
{
@@ -1037,99 +1024,49 @@ uint32_t collideCapsules(float3 curPos, float3 prevPos, float alpha, float prevA
// reduce ccd impulse if (clamped) particle trajectory stays in cone skin,
// i.e. scale by exp2(-k) or 1/(1+k) with k = (tmin - toi) / (1 - toi)
- float minusK = sqrtD / (minusA * oneMinusToi);
+ float minusK = (oneMinusToi > FLT_EPSILON) ? sqrtD / (minusA * oneMinusToi) : 0.0f;
oneMinusToi = oneMinusToi / (1.f - minusK);
- curDelta += delta * oneMinusToi;
-
- curDot = dot(curDelta, curAxis.xyz);
- float curConeRadius = (curAxis.w + sphere0.w) * invConeLength * axisLength;
+ curPos += delta * oneMinusToi;
- curRadius = max(curDot * curSlope + curConeRadius, 0.0f); // Duplicate?
- curSqrDistance = dot(curDelta, curDelta) - curDot * curDot;
+ curDot = dot(curPos, curConeAxis.xyz);
+ curRadius = max(curDot * curConeSlope + curConeRadius, 0.0f);
+ curSqrDistance = dot(curPos, curPos) - curDot * curDot;
- curPos = coneCenter + curDelta;
+ //curPos is relative to coneCenter so we don't need to do this:
+ //curPos = coneCenter + curPos;
}
}
-
- {
- float3 delta = curPos - coneCenter;
-
- float deltaDotAxis = dot(delta, curConeAxis);
- float radius = max(deltaDotAxis * curSlope + coneRadius, 0.0f);
- float sqrDistance = dot(delta, delta) - deltaDotAxis * deltaDotAxis;
- if (sqrDistance > radius * radius)
+ {
+ if (curSqrDistance > curRadius * curRadius)
continue;
- sqrDistance = max(sqrDistance, FLT_EPSILON);
- float invDistance = rsqrt(sqrDistance);
+ curSqrDistance = max(curSqrDistance, FLT_EPSILON);
+ float invDistance = rsqrt(curSqrDistance);
- float base = deltaDotAxis + curSlope * sqrDistance * invDistance;
- float halfLength = axisLength;
- // float halfLength = coneHalfLength;
+ float base = curDot + curConeSlope * curSqrDistance * invDistance;
+ float halfLength = curAxisHalfLength;
if (abs(base) < halfLength)
{
- delta = delta - base * curAxis;
+ float3 delta = curPos - base * curConeAxis;
float sqrCosine = coneSqrCosine;
- float scale = radius * invDistance * sqrCosine - sqrCosine;
+ float scale = curRadius * invDistance * sqrCosine - sqrCosine;
- outDelta += delta * scale;
+ outDelta += delta *scale;
if (frictionEnabled)
{
- // get previous sphere pos
- float4 prevSphere0 = lerp(startSphere0, targetSphere0, prevAlpha);
- float4 prevSphere1 = lerp(startSphere1, targetSphere1, prevAlpha);
-
// interpolate velocity between the two spheres
- float t = deltaDotAxis * 0.5f + 0.5f;
- outVelocity += lerp(sphere0.xyz - prevSphere0.xyz, sphere1.xyz - prevSphere1.xyz, t);
+ float t = curDot * 0.5f + 0.5f;
+ outVelocity += lerp(curSphere0.xyz - prevSphere0.xyz, curSphere1.xyz - prevSphere1.xyz, t);
}
++numCollisions;
}
}
-
- // curPos inside cone (discrete collision)
- bool hasContact = curRadius * curRadius > curSqrDistance;
-
- if (!hasContact)
- continue;
-
- float invDistance = curSqrDistance > 0.0f ? rsqrt(curSqrDistance) : 0.0f;
- float base = curDot + curSlope * curSqrDistance * invDistance;
-
- // float axisLength = sqrAxisLength * invAxisLength;
-
- // float halfLength = axisLength; // ?
- // float halfLength = coneHalfLength;
- /*
- if (abs(base) < halfLength)
- {
- float3 delta = curPos - base * curAxis.xyz;
-
- float sine = axis.w * invAxisLength; // Remove?
- float sqrCosine = 1.f - sine * sine;
-
- float scale = curRadius * invDistance * coneSqrCosine - coneSqrCosine;
-
-
- // delta += de
-
- outDelta += delta * scale;
-
- if (frictionEnabled)
- {
- // interpolate velocity between the two spheres
- float t = curDot * 0.5f + 0.5f;
- outVelocity += lerp(sphere0.xyz - prevSphere0.xyz, sphere1.xyz - prevSphere1.xyz, t);
- }
-
- ++numCollisions;
- }*/
}
// sphere collision
@@ -1147,8 +1084,8 @@ uint32_t collideCapsules(float3 curPos, float3 prevPos, float alpha, float prevA
float prevRadius = prevSphere.w;
{
- float3 curDelta = curPos - sphere.xyz;
- float3 prevDelta = prevPos - prevSphere.xyz;
+ float3 curDelta = curPosAbsolute - sphere.xyz;
+ float3 prevDelta = prevPosAbsolute - prevSphere.xyz;
float sqrDistance = FLT_EPSILON + dot(curDelta, curDelta);
@@ -1186,7 +1123,7 @@ uint32_t collideCapsules(float3 curPos, float3 prevPos, float alpha, float prevA
oneMinusToi = oneMinusToi / (1.f - minusK);
curDelta += delta * oneMinusToi;
- curPos = sphere.xyz + curDelta;
+ curPosAbsolute = sphere.xyz + curDelta;
sqrDistance = FLT_EPSILON + dot(curDelta, curDelta);
}
@@ -1212,6 +1149,125 @@ uint32_t collideCapsules(float3 curPos, float3 prevPos, float alpha, float prevA
return numCollisions;
}
+float3 barycentricLerp(float3 pA, float3 pB, float3 pC, float3 weights)
+{
+ float3 outP;
+ outP.x = pA.x * weights.x + pB.x * weights.y + pC.x * weights.z;
+ outP.y = pA.y * weights.x + pB.y * weights.y + pC.y * weights.z;
+ outP.z = pA.z * weights.x + pB.z * weights.y + pC.z * weights.z;
+ return outP;
+}
+
+void applyVirtualImpulse(IParticles particles, uint3 indices, float3 weights, float3 delta, float scale)
+//apply
+{
+ float4 pA = particles.get(indices.x);
+ float4 pB = particles.get(indices.y);
+ float4 pC = particles.get(indices.z);
+
+ delta *= scale;
+
+ pA.x += delta.x * weights.x; pB.x += delta.x * weights.y; pC.x += delta.x * weights.z;
+ pA.y += delta.y * weights.x; pB.y += delta.y * weights.y; pC.y += delta.y * weights.z;
+ pA.z += delta.z * weights.x; pB.z += delta.z * weights.y; pC.z += delta.z * weights.z;
+
+ //if(particle is not static) store new position
+ if(pA.w != 0.0f) particles.set(indices.x, pA);
+ if(pB.w != 0.0f) particles.set(indices.y, pB);
+ if(pC.w != 0.0f) particles.set(indices.z, pC);
+}
+
+void collideVirtualCapsules(IParticles curParticles, IParticles prevParticles, uint32_t threadIdx, float alpha, float prevAlpha)
+{
+ uint virtualParticleSetSizesCount = gClothData.mNumVirtualParticleSetSizes;
+
+ if(virtualParticleSetSizesCount == 0)
+ return;
+
+ bool frictionEnabled = gClothData.mFrictionScale > 0.0f;
+ bool massScaleEnabled = gClothData.mCollisionMassScale > 0.0f;
+
+ if(gClothData.mEnableContinuousCollision)
+ {
+ //???
+ }
+
+ uint indicesEnd = 0;
+ for(uint vpSetSizeIt = 0; vpSetSizeIt < virtualParticleSetSizesCount; vpSetSizeIt++)
+ {
+ GroupMemoryBarrierWithGroupSync();
+ uint indicesIt = indicesEnd + threadIdx * 4;
+ for(indicesEnd += bVirtualParticleSetSizes[vpSetSizeIt]*4; indicesIt < indicesEnd; indicesIt += blockDim * 4)
+ {
+ uint4 indices;
+ {
+ uint4 tmp = bVirtualParticleIndices[indicesIt>>1];
+ if(indicesIt & 1 == 1)
+ {
+ indices.x = tmp.z & 0xffff;
+ indices.y = tmp.z >> 16;
+ indices.z = tmp.w & 0xffff;
+ indices.w = tmp.w >> 16;
+ }
+ else
+ {
+ indices.x = tmp.x & 0xffff;
+ indices.y = tmp.x >> 16;
+ indices.z = tmp.y & 0xffff;
+ indices.w = tmp.y >> 16;
+ }
+ }
+
+ float4 weights = bVirtualParticleWeights[indices.w];
+
+ float3 curPos = barycentricLerp(curParticles.get(indices.x).xyz, curParticles.get(indices.y).xyz, curParticles.get(indices.z).xyz, weights.xyz);
+
+ float3 delta, velocity;
+ uint32_t numCollisions = collideCapsules(curPos.xyz, alpha, prevAlpha, delta, velocity);
+ if(numCollisions > 0)
+ {
+ float scale = 1.0f / (float)numCollisions;
+ float wscale = weights.w * scale;
+
+ applyVirtualImpulse(curParticles, indices.xyz, weights.xyz, delta, scale);
+
+ if(frictionEnabled)
+ {
+ float3 prevPos = barycentricLerp(prevParticles.get(indices.x).xyz, prevParticles.get(indices.y).xyz, prevParticles.get(indices.z).xyz, weights.xyz);
+
+ float3 frictionImpulse =
+ calcFrictionImpulse(prevPos.xyz, curPos.xyz, velocity, scale, delta);
+
+ applyVirtualImpulse(prevParticles, indices.xyz, weights.xyz, frictionImpulse, -weights.w);
+ }
+
+ if(massScaleEnabled)
+ {
+ float deltaLengthSq = dot(delta.xyz, delta.xyz) * scale * scale;
+ float invMassScale = 1.0f / (1.0f + gClothData.mCollisionMassScale * deltaLengthSq);
+
+ // not multiplying by weights[3] here because unlike applying velocity
+ // deltas where we want the interpolated position to obtain a particular
+ // value, we instead just require that the total change is equal to invMassScale
+ invMassScale = invMassScale - 1.0f;
+
+ float4 pA = curParticles.get(indices.x);
+ float4 pB = curParticles.get(indices.y);
+ float4 pC = curParticles.get(indices.z);
+
+ pA.w *= 1.0f + weights.x * invMassScale;
+ pB.w *= 1.0f + weights.y * invMassScale;
+ pC.w *= 1.0f + weights.z * invMassScale;
+
+ curParticles.set(indices.x,pA);
+ curParticles.set(indices.y,pB);
+ curParticles.set(indices.z,pC);
+ }
+ }
+ }
+ }
+}
+
void collideContinuousCapsules(IParticles curParticles, IParticles prevParticles, uint32_t threadIdx, float alpha, float prevAlpha)
{
bool frictionEnabled = gClothData.mFrictionScale > 0.0f;
@@ -1261,6 +1317,7 @@ void collideParticles(IParticles curParticles, IParticles prevParticles, uint32_
collideContinuousCapsules(curParticles, prevParticles, threadIdx, alpha, prevAlpha);
else
collideCapsules(curParticles, prevParticles, threadIdx, alpha, prevAlpha);
+ collideVirtualCapsules(curParticles, prevParticles, threadIdx, alpha, prevAlpha);
}
void constrainSeparation(IParticles curParticles, uint32_t threadIdx, float alpha)
diff --git a/NvCloth/src/scalar/SwCollisionHelpers.h b/NvCloth/src/scalar/SwCollisionHelpers.h
index 3ab756f..c86d939 100644
--- a/NvCloth/src/scalar/SwCollisionHelpers.h
+++ b/NvCloth/src/scalar/SwCollisionHelpers.h
@@ -69,13 +69,17 @@ struct Gather<Scalar4i>
Gather<Scalar4i>::Gather(const Scalar4i& index)
{
+ //index are grid positions
+
uint32_t mask = /* sGridSize */ 8 - 1;
+ // Get grid index (forced within range)
mIndex.u4[0] = index.u4[0] & mask;
mIndex.u4[1] = index.u4[1] & mask;
mIndex.u4[2] = index.u4[2] & mask;
mIndex.u4[3] = index.u4[3] & mask;
+ // true (filled with all ones = -1) when gridSize <= index || index < 0
mOutOfRange.i4[0] = index.u4[0] & ~mask ? 0 : -1;
mOutOfRange.i4[1] = index.u4[1] & ~mask ? 0 : -1;
mOutOfRange.i4[2] = index.u4[2] & ~mask ? 0 : -1;
@@ -84,6 +88,7 @@ Gather<Scalar4i>::Gather(const Scalar4i& index)
Scalar4i Gather<Scalar4i>::operator()(const Scalar4i* ptr) const
{
+ //ptr points to the cone/sphere grid
const int32_t* base = ptr->i4;
const int32_t* index = mIndex.i4;
const int32_t* mask = mOutOfRange.i4;
diff --git a/NvCloth/src/sse2/SwCollisionHelpers.h b/NvCloth/src/sse2/SwCollisionHelpers.h
index b759868..4882818 100644
--- a/NvCloth/src/sse2/SwCollisionHelpers.h
+++ b/NvCloth/src/sse2/SwCollisionHelpers.h
@@ -76,22 +76,36 @@ Simd4i horizontalOr(const Simd4i& mask)
Gather<Simd4i>::Gather(const Simd4i& index)
{
- mSelectQ = _mm_srai_epi32(index << 29, 31);
- mSelectD = _mm_srai_epi32(index << 30, 31);
- mSelectW = _mm_srai_epi32(index << 31, 31);
- mOutOfRange = (index ^ sIntSignBit) > sSignedMask;
+ //index are grid positions
+ //grid has 8 spaces, grid indices that are within rage use 3 bits (0x7 mask)
+ mSelectQ = _mm_srai_epi32(index << 29, 31); //expand bit 0x4 to the whole register
+ mSelectD = _mm_srai_epi32(index << 30, 31); //expand bit 0x2 to the whole register
+ mSelectW = _mm_srai_epi32(index << 31, 31); //expand the least significant bit (0x1) to the whole register
+ mOutOfRange = (index ^ sIntSignBit) > sSignedMask; //true if index is outside the grid = (index > 0x7 || index < 0x0)
}
Simd4i Gather<Simd4i>::operator()(const Simd4i* ptr) const
{
+ //ptr points to the cone/sphere grid
+
// more efficient with _mm_shuffle_epi8 (SSSE3)
+
+ //lo contains the bottom 4 grid cells, heigh the top 4
Simd4i lo = ptr[0], hi = ptr[1];
+
+ //select correct cell in the grid using binary decision tree
+ // m = select ? a : b
+ //first bit (even/uneven cells)
Simd4i m01 = select(mSelectW, splat<1>(lo), splat<0>(lo));
Simd4i m23 = select(mSelectW, splat<3>(lo), splat<2>(lo));
Simd4i m45 = select(mSelectW, splat<1>(hi), splat<0>(hi));
Simd4i m67 = select(mSelectW, splat<3>(hi), splat<2>(hi));
+
+ //second bit
Simd4i m0123 = select(mSelectD, m23, m01);
Simd4i m4567 = select(mSelectD, m67, m45);
+
+ // third bit force result zero if it is out of range
return select(mSelectQ, m4567, m0123) & ~mOutOfRange;
}