diff options
| author | Marijn Tamis <[email protected]> | 2018-05-03 18:22:48 +0200 |
|---|---|---|
| committer | Marijn Tamis <[email protected]> | 2018-05-03 18:22:48 +0200 |
| commit | ca32c59a58d37c1822e185a2d5f3d0d3e8943593 (patch) | |
| tree | b06b9eec03f34344ef8fc31aa147b2714d3962ee /NvCloth/src | |
| parent | Forced rename of platform folders in cmake dir. Git didn't pick this up before. (diff) | |
| download | nvcloth-ca32c59a58d37c1822e185a2d5f3d0d3e8943593.tar.xz nvcloth-ca32c59a58d37c1822e185a2d5f3d0d3e8943593.zip | |
NvCloth 1.1.4 Release. (24070740)
Diffstat (limited to 'NvCloth/src')
29 files changed, 768 insertions, 361 deletions
diff --git a/NvCloth/src/ClothBase.h b/NvCloth/src/ClothBase.h index ec1ee40..faca6c9 100644 --- a/NvCloth/src/ClothBase.h +++ b/NvCloth/src/ClothBase.h @@ -64,6 +64,7 @@ void initialize(Cloth& cloth, const physx::PxVec4* pIt, const physx::PxVec4* pEn cloth.mCurrentMotion = physx::PxTransform(physx::PxIdentity); cloth.mLinearVelocity = physx::PxVec3(0.0f); cloth.mAngularVelocity = physx::PxVec3(0.0f); + cloth.mIgnoreVelocityDiscontinuityNextFrame = false; cloth.mPrevIterDt = 0.0f; cloth.mIterDtAvg = MovingAverage(30); cloth.mTetherConstraintLogStiffness = float(-FLT_MAX_EXP); diff --git a/NvCloth/src/ClothImpl.h b/NvCloth/src/ClothImpl.h index 24f7732..9bc4c36 100644 --- a/NvCloth/src/ClothImpl.h +++ b/NvCloth/src/ClothImpl.h @@ -33,6 +33,8 @@ #include "NvCloth/Fabric.h" #include <foundation/PxVec4.h> #include <foundation/PxVec3.h> +#include "IndexPair.h" +#include "MovingAverage.h" #include <PsMathUtils.h> #include <cmath> @@ -77,6 +79,9 @@ class ClothImpl : public Cloth virtual void clearInertia(); virtual void teleport(const physx::PxVec3& delta); + virtual void teleportToLocation(const physx::PxVec3& translation, const physx::PxQuat& rotation); + + virtual void ignoreVelocityDiscontinuity(); virtual float getPreviousIterationDt() const; virtual void setGravity(const physx::PxVec3& gravity); @@ -210,6 +215,7 @@ public: //Fields shared between all cloth classes physx::PxTransform mCurrentMotion; physx::PxVec3 mLinearVelocity; physx::PxVec3 mAngularVelocity; + bool mIgnoreVelocityDiscontinuityNextFrame; float mPrevIterDt; MovingAverage mIterDtAvg; @@ -292,6 +298,21 @@ inline void ClothImpl<T>::teleport(const physx::PxVec3& delta) } template <typename T> +inline void ClothImpl<T>::teleportToLocation(const physx::PxVec3& translation, const physx::PxQuat& rotation) +{ + mCurrentMotion.p = translation; + mCurrentMotion.q = rotation; + mTargetMotion.p = translation; + mTargetMotion.q = rotation; +} + +template <typename T> +inline void ClothImpl<T>::ignoreVelocityDiscontinuity() +{ + mIgnoreVelocityDiscontinuityNextFrame = true; +} + +template <typename T> inline float ClothImpl<T>::getPreviousIterationDt() const { return mPrevIterDt; diff --git a/NvCloth/src/IterationState.h b/NvCloth/src/IterationState.h index e18b636..be046b5 100644 --- a/NvCloth/src/IterationState.h +++ b/NvCloth/src/IterationState.h @@ -183,11 +183,15 @@ cloth::IterationStateFactory::IterationStateFactory(MyCloth& cloth, float frameD mPrevLinearVelocity = cloth.mLinearVelocity; mPrevAngularVelocity = cloth.mAngularVelocity; - // update cloth - float invFrameDt = 1.0f / frameDt; - cloth.mLinearVelocity = invFrameDt * (cloth.mTargetMotion.p - cloth.mCurrentMotion.p); - physx::PxQuat dq = cloth.mTargetMotion.q * cloth.mCurrentMotion.q.getConjugate(); - cloth.mAngularVelocity = log(dq) * invFrameDt; + if(!cloth.mIgnoreVelocityDiscontinuityNextFrame) + { + // update cloth + float invFrameDt = 1.0f / frameDt; + cloth.mLinearVelocity = invFrameDt * (cloth.mTargetMotion.p - cloth.mCurrentMotion.p); + physx::PxQuat dq = cloth.mTargetMotion.q * cloth.mCurrentMotion.q.getConjugate(); + cloth.mAngularVelocity = log(dq) * invFrameDt; + } + cloth.mIgnoreVelocityDiscontinuityNextFrame = false; cloth.mPrevIterDt = mIterDt; cloth.mIterDtAvg.push(static_cast<uint32_t>(mNumIterations), mIterDt); diff --git a/NvCloth/src/PhaseConfig.cpp b/NvCloth/src/PhaseConfig.cpp index 7ea97f3..b5db3b0 100644 --- a/NvCloth/src/PhaseConfig.cpp +++ b/NvCloth/src/PhaseConfig.cpp @@ -30,6 +30,7 @@ #include "NvCloth/PhaseConfig.h" #include "PsMathUtils.h" #include <algorithm> +#include "ClothImpl.h" using namespace physx; @@ -43,14 +44,6 @@ PhaseConfig transform(const PhaseConfig&); using namespace nv; -namespace -{ -float safeLog2(float x) -{ - float saturated = std::max(0.0f, std::min(x, 1.0f)); - return saturated ? shdfnd::log2(saturated) : -FLT_MAX_EXP; -} -} // convert from user input to solver format cloth::PhaseConfig cloth::transform(const PhaseConfig& config) diff --git a/NvCloth/src/SwCloth.cpp b/NvCloth/src/SwCloth.cpp index 6077ada..46a7784 100644 --- a/NvCloth/src/SwCloth.cpp +++ b/NvCloth/src/SwCloth.cpp @@ -269,28 +269,13 @@ void SwCloth::setVirtualParticles(Range<const uint32_t[4]> indices, Range<const // shuffle indices to form independent SIMD sets uint16_t numParticles = uint16_t(mCurParticles.size()); - TripletScheduler scheduler(indices); + TripletScheduler scheduler(indices); //the TripletScheduler makes a copy so indices is not modified scheduler.simd(numParticles, 4); - // convert indices to byte offset - Vec4us dummy(numParticles, uint16_t(numParticles + 1), uint16_t(numParticles + 2), 0); - Vector<uint32_t>::Type::ConstIterator sIt = scheduler.mSetSizes.begin(); - Vector<uint32_t>::Type::ConstIterator sEnd = scheduler.mSetSizes.end(); - TripletScheduler::ConstTripletIter tIt = scheduler.mTriplets.begin(), tLast; - mVirtualParticleIndices.resize(0); - mVirtualParticleIndices.reserve(indices.size() + 3 * uint32_t(sEnd - sIt)); - for (; sIt != sEnd; ++sIt) - { - uint32_t setSize = *sIt; - for (tLast = tIt + setSize; tIt != tLast; ++tIt, ++mNumVirtualParticles) - mVirtualParticleIndices.pushBack(Vec4us(*tIt)); - mVirtualParticleIndices.resize((mVirtualParticleIndices.size() + 3) & ~3, dummy); - } - Vector<Vec4us>::Type(mVirtualParticleIndices.begin(), mVirtualParticleIndices.end()) - .swap(mVirtualParticleIndices); + mVirtualParticleIndices.swap(scheduler.mPaddedTriplets); // precompute 1/dot(w,w) - Vector<PxVec4>::Type().swap(mVirtualParticleWeights); + Vector<PxVec4>::Type().swap(mVirtualParticleWeights); //clear and trim mVirtualParticleWeights.reserve(weights.size()); for (; !weights.empty(); weights.popFront()) { diff --git a/NvCloth/src/SwClothData.cpp b/NvCloth/src/SwClothData.cpp index f102bde..84d85d4 100644 --- a/NvCloth/src/SwClothData.cpp +++ b/NvCloth/src/SwClothData.cpp @@ -73,6 +73,7 @@ cloth::SwClothData::SwClothData(SwCloth& cloth, const SwFabric& fabric) mTethers = fabric.mTethers.begin(); mNumTethers = uint32_t(fabric.mTethers.size()); + //1-(1 - stiffness)^stiffnessExponent mTetherConstraintStiffness = 1.0f - expf(stiffnessExponent * cloth.mTetherConstraintLogStiffness); mTetherConstraintScale = cloth.mTetherConstraintScale * fabric.mTetherLengthScale; diff --git a/NvCloth/src/SwCollision.cpp b/NvCloth/src/SwCollision.cpp index 0aa196d..bd6cf7a 100644 --- a/NvCloth/src/SwCollision.cpp +++ b/NvCloth/src/SwCollision.cpp @@ -167,7 +167,7 @@ void generateCones(cloth::ConeData* dst, const cloth::SphereData* sourceSpheres, PxVec4 center = (second + first) * 0.5f; PxVec4 axis = (second - first) * 0.5f; //half axis - //axiw.w = half of radii difference + //axis.w = half of radii difference // |Axis|^2 float sqrAxisHalfLength = axis.x * axis.x + axis.y * axis.y + axis.z * axis.z; @@ -426,8 +426,8 @@ void cloth::SwCollision<T4f>::buildSphereAcceleration(const SphereData* sIt) T4f radius = splat<3>(sphere); //calculate the first and last cell index, for each axis, that contains the sphere - T4i first = intFloor(max((sphere - radius) * mGridScale + mGridBias, gSimd4fZero)); - T4i last = intFloor(min((sphere + radius) * mGridScale + mGridBias, sGridLength)); + T4i first = intFloor(min(max((sphere - radius) * mGridScale + mGridBias, gSimd4fZero), sGridLength)); //use both min and max to deal with bad grid scales + T4i last = intFloor(min(max((sphere + radius) * mGridScale + mGridBias, gSimd4fZero), sGridLength)); const int* firstIdx = array(first); const int* lastIdx = array(last); @@ -555,40 +555,47 @@ operator &= (const ShapeMask& right) return *this; } +// get collision shape masks from a single cell from a single axis of the acceleration structure template <typename T4f> FORCE_INLINE typename cloth::SwCollision<T4f>::ShapeMask cloth::SwCollision<T4f>::getShapeMask(const T4f& position, const T4i* __restrict sphereGrid, const T4i* __restrict coneGrid) { + // position are the grid positions along a single axis (x, y, or z) Gather<T4i> gather(intFloor(position)); + // get the bitmask indicating which cones/spheres overlap the grid cell ShapeMask result; result.mCones = gather(coneGrid); result.mSpheres = gather(sphereGrid); return result; } -// lookup acceleration structure and return mask of potential intersectors +// lookup acceleration structure and return mask of potential intersection collision shapes template <typename T4f> FORCE_INLINE typename cloth::SwCollision<T4f>::ShapeMask cloth::SwCollision<T4f>::getShapeMask(const T4f* __restrict positions) const { + // positions are the particle positions T4f posX = positions[0] * splat<0>(mGridScale) + splat<0>(mGridBias); T4f posY = positions[1] * splat<1>(mGridScale) + splat<1>(mGridBias); T4f posZ = positions[2] * splat<2>(mGridScale) + splat<2>(mGridBias); - ShapeMask result = getShapeMask(posX, mSphereGrid, mConeGrid); - result &= getShapeMask(posY, mSphereGrid + 2, mConeGrid + 2); - result &= getShapeMask(posZ, mSphereGrid + 4, mConeGrid + 4); + // AND together the bit masks so only the cones/spheres remain + // that overlap with the particle posision on all axis + ShapeMask result = getShapeMask(posX, mSphereGrid, mConeGrid); // X + result &= getShapeMask(posY, mSphereGrid + 2, mConeGrid + 2); // Y + result &= getShapeMask(posZ, mSphereGrid + 4, mConeGrid + 4); // Z return result; } -// lookup acceleration structure and return mask of potential intersectors +// lookup acceleration structure and return mask of potential intersection collision shapes for CCD template <typename T4f> FORCE_INLINE typename cloth::SwCollision<T4f>::ShapeMask cloth::SwCollision<T4f>::getShapeMask(const T4f* __restrict prevPos, const T4f* __restrict curPos) const { + // same as getShapeMask(const T4f* __restrict positions) but for continuous collision detection T4f scaleX = splat<0>(mGridScale); T4f scaleY = splat<1>(mGridScale); T4f scaleZ = splat<2>(mGridScale); @@ -605,22 +612,24 @@ cloth::SwCollision<T4f>::getShapeMask(const T4f* __restrict prevPos, const T4f* T4f curY = curPos[1] * scaleY + biasY; T4f curZ = curPos[2] * scaleZ + biasZ; + // get maximum extent corner of the AABB containing both prevPos and curPos T4f maxX = min(max(prevX, curX), sGridLength); T4f maxY = min(max(prevY, curY), sGridLength); T4f maxZ = min(max(prevZ, curZ), sGridLength); - ShapeMask result = getShapeMask(maxX, mSphereGrid, mConeGrid); - result &= getShapeMask(maxY, mSphereGrid + 2, mConeGrid + 2); - result &= getShapeMask(maxZ, mSphereGrid + 4, mConeGrid + 4); + ShapeMask result = getShapeMask(maxX, mSphereGrid, mConeGrid); // X + result &= getShapeMask(maxY, mSphereGrid + 2, mConeGrid + 2); // Y + result &= getShapeMask(maxZ, mSphereGrid + 4, mConeGrid + 4); // Z + // get min extent corner of the AABB containing both prevPos and curPos T4f zero = gSimd4fZero; T4f minX = max(min(prevX, curX), zero); T4f minY = max(min(prevY, curY), zero); T4f minZ = max(min(prevZ, curZ), zero); - result &= getShapeMask(minX, mSphereGrid + 6, mConeGrid + 6); - result &= getShapeMask(minY, mSphereGrid + 8, mConeGrid + 8); - result &= getShapeMask(minZ, mSphereGrid + 10, mConeGrid + 10); + result &= getShapeMask(minX, mSphereGrid + 6, mConeGrid + 6); // X + result &= getShapeMask(minY, mSphereGrid + 8, mConeGrid + 8); // Y + result &= getShapeMask(minZ, mSphereGrid + 10, mConeGrid + 10); // Z return result; } @@ -770,7 +779,7 @@ cloth::SwCollision<T4f>::collideCones(const T4f* __restrict positions, ImpulseAc T4f axisZ = splat<2>(axis); T4f slope = splat<3>(axis); - // project delta onto axis + // distance along cone axis (from center) T4f dot = deltaX * axisX + deltaY * axisY + deltaZ * axisZ; // interpolate radius T4f radius = dot * slope + splat<3>(center); @@ -901,7 +910,7 @@ FORCE_INLINE void cloth::SwCollision<T4f>::collideSpheres(const T4i& sphereMask, T4f dotCurCur = sqrDistance - curRadius * curRadius; T4f discriminant = dotPrevCur * dotPrevCur - dotCurCur * dotPrevPrev; - T4f sqrtD = sqrt(discriminant); + T4f sqrtD = sqrt(discriminant); //we get -nan if there are no roots T4f halfB = dotPrevCur - dotPrevPrev; T4f minusA = dotPrevCur - dotCurCur + halfB; @@ -914,7 +923,7 @@ FORCE_INLINE void cloth::SwCollision<T4f>::collideSpheres(const T4i& sphereMask, T4f rMin = prevRadius + halfB * minusA * (curRadius - prevRadius); collisionMask = collisionMask & (discriminant > minusA * rMin * rMin * sSkeletonWidth); - // a is negative when one sphere is contained in the other, + // a is negative when one relative sphere is contained in the other, // which is already handled by discrete collision. collisionMask = collisionMask & (minusA < -static_cast<T4f>(gSimd4fEpsilon)); @@ -931,10 +940,15 @@ FORCE_INLINE void cloth::SwCollision<T4f>::collideSpheres(const T4i& sphereMask, T4f minusK = sqrtD * recip(minusA * oneMinusToi) & (oneMinusToi > gSimd4fEpsilon); oneMinusToi = oneMinusToi * recip(gSimd4fOne - minusK); + + //Move curXYZ to toi point curX = curX + deltaX * oneMinusToi; curY = curY + deltaY * oneMinusToi; curZ = curZ + deltaZ * oneMinusToi; + //curXYZ is now touching the sphere at toi + //Note that curXYZ is also relative to the sphere center at toi + //We assume that the point sticks to the sphere until the end of the frame curPos[0] = splat<0>(curSphere) + curX; curPos[1] = splat<1>(curSphere) + curY; curPos[2] = splat<2>(curSphere) + curZ; @@ -1000,7 +1014,7 @@ cloth::SwCollision<T4f>::collideCones(const T4f* __restrict prevPos, T4f* __rest T4f prevT = prevY * prevAxisZ - prevZ * prevAxisY; T4f prevU = prevZ * prevAxisX - prevX * prevAxisZ; T4f prevV = prevX * prevAxisY - prevY * prevAxisX; - T4f prevDot = prevX * prevAxisX + prevY * prevAxisY + prevZ * prevAxisZ; + T4f prevDot = prevX * prevAxisX + prevY * prevAxisY + prevZ * prevAxisZ; //distance along the axis T4f prevRadius = prevDot * prevSlope + splat<3>(prevCenter); T4f curCenter = loadAligned(curCenterPtr, offset); @@ -1014,18 +1028,22 @@ cloth::SwCollision<T4f>::collideCones(const T4f* __restrict prevPos, T4f* __rest T4f curX = curPos[0] - splat<0>(curCenter); T4f curY = curPos[1] - splat<1>(curCenter); T4f curZ = curPos[2] - splat<2>(curCenter); + //curTUV = cross(curXYZ, curAxisXYZ) T4f curT = curY * curAxisZ - curZ * curAxisY; T4f curU = curZ * curAxisX - curX * curAxisZ; T4f curV = curX * curAxisY - curY * curAxisX; T4f curDot = curX * curAxisX + curY * curAxisY + curZ * curAxisZ; T4f curRadius = curDot * curSlope + splat<3>(curCenter); + //Magnitude of cross product gives area of parallelogram |curAxisXYZ|*parallelogramHeight, |curAxisXYZ|=1 + //parallelogramHeight is distance between the axis and the point T4f curSqrDistance = gSimd4fEpsilon + curT * curT + curU * curU + curV * curV; // set radius to zero if cone is culled prevRadius = max(prevRadius, gSimd4fZero) & ~culled; curRadius = max(curRadius, gSimd4fZero) & ~culled; + //Use quadratic equation to solve for time of impact (against infinite cone) T4f dotPrevPrev = prevT * prevT + prevU * prevU + prevV * prevV - prevRadius * prevRadius; T4f dotPrevCur = prevT * curT + prevU * curU + prevV * curV - prevRadius * curRadius; T4f dotCurCur = curSqrDistance - curRadius * curRadius; @@ -1060,6 +1078,7 @@ cloth::SwCollision<T4f>::collideCones(const T4f* __restrict prevPos, T4f* __rest T4f posY = prevY - deltaY * toi; T4f posZ = prevZ - deltaZ * toi; + // axisHalfLength T4f curScaledAxis = curAxis * splat<1>(simd4f(curAuxiliary)); T4i prevAuxiliary = loadAligned(prevAuxiliaryPtr, offset); T4f deltaScaledAxis = curScaledAxis - prevAxis * splat<1>(simd4f(prevAuxiliary)); @@ -1074,19 +1093,26 @@ cloth::SwCollision<T4f>::collideCones(const T4f* __restrict prevPos, T4f* __rest T4f sqrHalfLength = axisX * axisX + axisY * axisY + axisZ * axisZ; T4f invHalfLength = rsqrt(sqrHalfLength); + // distance along toi cone axis (from center) T4f dot = (posX * axisX + posY * axisY + posZ * axisZ) * invHalfLength; + //point line distance T4f sqrDistance = posX * posX + posY * posY + posZ * posZ - dot * dot; T4f invDistance = rsqrt(sqrDistance) & (sqrDistance > gSimd4fZero); + //offset base to take slope in to account T4f base = dot + slope * sqrDistance * invDistance; T4f scale = base * invHalfLength & collisionMask; + // use invHalfLength to map base from [-HalfLength, +HalfLength]=inside to [-1, +1] =inside + // test if any impact position is in cone section T4f cullMask = (abs(scale) < gSimd4fOne) & collisionMask; // test if any impact position is in cone section if (!allEqual(cullMask, gSimd4fZero)) { + //calculate unnormalized normal delta? + //delta = prev - cur - (prevAxis - curScaledAxis)*scale deltaX = deltaX + splat<0>(deltaScaledAxis) * scale; deltaY = deltaY + splat<1>(deltaScaledAxis) * scale; deltaZ = deltaZ + splat<2>(deltaScaledAxis) * scale; @@ -1099,15 +1125,19 @@ cloth::SwCollision<T4f>::collideCones(const T4f* __restrict prevPos, T4f* __rest T4f minusK = sqrtD * recip(minusA * oneMinusToi) & (oneMinusToi > gSimd4fEpsilon); oneMinusToi = oneMinusToi * recip(gSimd4fOne - minusK); - curX = curX + deltaX * oneMinusToi; + //curX = cur + (prev - cur - (prevAxis - curScaledAxis)*scale)*(1-toi) + //curX = cur + (prev - cur)*(1-toi) - ((prevAxis - curScaledAxis)*scale)*(1-toi) + curX = curX + deltaX * oneMinusToi; curY = curY + deltaY * oneMinusToi; curZ = curZ + deltaZ * oneMinusToi; + //save adjusted values for discrete collision detection below curDot = curX * curAxisX + curY * curAxisY + curZ * curAxisZ; curRadius = curDot * curSlope + splat<3>(curCenter); curRadius = max(curRadius, gSimd4fZero) & ~culled; curSqrDistance = curX * curX + curY * curY + curZ * curZ - curDot * curDot; + //take offset from toi and add it to the cone center at the end curPos[0] = splat<0>(curCenter) + curX; curPos[1] = splat<1>(curCenter) + curY; curPos[2] = splat<2>(curCenter) + curZ; @@ -1210,7 +1240,7 @@ PX_INLINE void calculateFrictionImpulse(const T4f& deltaX, const T4f& deltaY, co T4f nz = deltaZ * rcpDelta; // calculate relative velocity - // velXYZ is scaled by one over the number of collisions since all collisions accumulate into + // velXYZ is scaled by one over the number of collisions (scale) since all collisions accumulate into // that variable during collision detection T4f rvx = curPos[0] - prevPos[0] - velX * scale; T4f rvy = curPos[1] - prevPos[1] - velY * scale; @@ -1227,7 +1257,7 @@ PX_INLINE void calculateFrictionImpulse(const T4f& deltaX, const T4f& deltaY, co // calculate magnitude of relative tangential velocity T4f rcpVt = rsqrt(rvtx * rvtx + rvty * rvty + rvtz * rvtz + gSimd4fEpsilon); - // magnitude of friction impulse (cannot be greater than -vt) + // magnitude of friction impulse (cannot be greater than -rvt) T4f j = max(-coefficient * deltaSq * rcpDelta * rcpVt, gSimd4fMinusOne) & mask; impulse[0] = rvtx * j; @@ -1244,7 +1274,7 @@ void cloth::SwCollision<T4f>::collideParticles() const T4f massScale = simd4f(mClothData.mCollisionMassScale); const bool frictionEnabled = mClothData.mFrictionScale > 0.0f; - const T4f frictionScale = simd4f(mClothData.mFrictionScale); //[arameter set by user + const T4f frictionScale = simd4f(mClothData.mFrictionScale); //Parameter set by user T4f curPos[4]; T4f prevPos[4]; @@ -1439,21 +1469,22 @@ void cloth::SwCollision<T4f>::collideVirtualParticles() transpose(frictionImpulse[0], frictionImpulse[1], frictionImpulse[2], frictionImpulse[3]); - q0v0 = q0v0 - (splat<0>(rw0) * frictionImpulse[0]); - q0v1 = q0v1 - (splat<1>(rw0) * frictionImpulse[0]); - q0v2 = q0v2 - (splat<2>(rw0) * frictionImpulse[0]); - - q1v0 = q1v0 - (splat<0>(rw1) * frictionImpulse[1]); - q1v1 = q1v1 - (splat<1>(rw1) * frictionImpulse[1]); - q1v2 = q1v2 - (splat<2>(rw1) * frictionImpulse[1]); - - q2v0 = q2v0 - (splat<0>(rw2) * frictionImpulse[2]); - q2v1 = q2v1 - (splat<1>(rw2) * frictionImpulse[2]); - q2v2 = q2v2 - (splat<2>(rw2) * frictionImpulse[2]); - - q3v0 = q3v0 - (splat<0>(rw3) * frictionImpulse[3]); - q3v1 = q3v1 - (splat<1>(rw3) * frictionImpulse[3]); - q3v2 = q3v2 - (splat<2>(rw3) * frictionImpulse[3]); + // (weight * frictionImpulse ) & mask away if particle is static + q0v0 = q0v0 - ((splat<0>(rw0) * frictionImpulse[0]) & ~(splat<3>(p0v0) == gSimd4fZero)); + q0v1 = q0v1 - ((splat<1>(rw0) * frictionImpulse[0]) & ~(splat<3>(p0v0) == gSimd4fZero)); + q0v2 = q0v2 - ((splat<2>(rw0) * frictionImpulse[0]) & ~(splat<3>(p0v0) == gSimd4fZero)); + + q1v0 = q1v0 - ((splat<0>(rw1) * frictionImpulse[1]) & ~(splat<3>(p0v0) == gSimd4fZero)); + q1v1 = q1v1 - ((splat<1>(rw1) * frictionImpulse[1]) & ~(splat<3>(p0v0) == gSimd4fZero)); + q1v2 = q1v2 - ((splat<2>(rw1) * frictionImpulse[1]) & ~(splat<3>(p0v0) == gSimd4fZero)); + + q2v0 = q2v0 - ((splat<0>(rw2) * frictionImpulse[2]) & ~(splat<3>(p0v0) == gSimd4fZero)); + q2v1 = q2v1 - ((splat<1>(rw2) * frictionImpulse[2]) & ~(splat<3>(p0v0) == gSimd4fZero)); + q2v2 = q2v2 - ((splat<2>(rw2) * frictionImpulse[2]) & ~(splat<3>(p0v0) == gSimd4fZero)); + + q3v0 = q3v0 - ((splat<0>(rw3) * frictionImpulse[3]) & ~(splat<3>(p0v0) == gSimd4fZero)); + q3v1 = q3v1 - ((splat<1>(rw3) * frictionImpulse[3]) & ~(splat<3>(p0v0) == gSimd4fZero)); + q3v2 = q3v2 - ((splat<2>(rw3) * frictionImpulse[3]) & ~(splat<3>(p0v0) == gSimd4fZero)); // write back prev particles storeAligned(prevParticles, vpIt[0] * sizeof(PxVec4), q0v0); @@ -1487,6 +1518,7 @@ void cloth::SwCollision<T4f>::collideVirtualParticles() T4f s2 = gSimd4fOne + splat<2>(weightScale) * (w2 & splat<2>(mask)); T4f s3 = gSimd4fOne + splat<3>(weightScale) * (w3 & splat<3>(mask)); + //we don't care if particles are static here since we are only scaling (zero for static) mass p0v0 = p0v0 * (gSimd4fOneXYZ | (splat<0>(s0) & sMaskW)); p0v1 = p0v1 * (gSimd4fOneXYZ | (splat<1>(s0) & sMaskW)); p0v2 = p0v2 * (gSimd4fOneXYZ | (splat<2>(s0) & sMaskW)); @@ -1504,21 +1536,22 @@ void cloth::SwCollision<T4f>::collideVirtualParticles() p3v2 = p3v2 * (gSimd4fOneXYZ | (splat<2>(s3) & sMaskW)); } - p0v0 = p0v0 + (splat<0>(rw0) * d0); - p0v1 = p0v1 + (splat<1>(rw0) * d0); - p0v2 = p0v2 + (splat<2>(rw0) * d0); - - p1v0 = p1v0 + (splat<0>(rw1) * d1); - p1v1 = p1v1 + (splat<1>(rw1) * d1); - p1v2 = p1v2 + (splat<2>(rw1) * d1); - - p2v0 = p2v0 + (splat<0>(rw2) * d2); - p2v1 = p2v1 + (splat<1>(rw2) * d2); - p2v2 = p2v2 + (splat<2>(rw2) * d2); - - p3v0 = p3v0 + (splat<0>(rw3) * d3); - p3v1 = p3v1 + (splat<1>(rw3) * d3); - p3v2 = p3v2 + (splat<2>(rw3) * d3); + // (weight * delta) & mask away if particle is static + p0v0 = p0v0 + ((splat<0>(rw0) * d0) & ~(splat<3>(p0v0) == gSimd4fZero)); + p0v1 = p0v1 + ((splat<1>(rw0) * d0) & ~(splat<3>(p0v1) == gSimd4fZero)); + p0v2 = p0v2 + ((splat<2>(rw0) * d0) & ~(splat<3>(p0v2) == gSimd4fZero)); + + p1v0 = p1v0 + ((splat<0>(rw1) * d1) & ~(splat<3>(p1v0) == gSimd4fZero)); + p1v1 = p1v1 + ((splat<1>(rw1) * d1) & ~(splat<3>(p1v1) == gSimd4fZero)); + p1v2 = p1v2 + ((splat<2>(rw1) * d1) & ~(splat<3>(p1v2) == gSimd4fZero)); + + p2v0 = p2v0 + ((splat<0>(rw2) * d2) & ~(splat<3>(p2v0) == gSimd4fZero)); + p2v1 = p2v1 + ((splat<1>(rw2) * d2) & ~(splat<3>(p2v1) == gSimd4fZero)); + p2v2 = p2v2 + ((splat<2>(rw2) * d2) & ~(splat<3>(p2v2) == gSimd4fZero)); + + p3v0 = p3v0 + ((splat<0>(rw3) * d3) & ~(splat<3>(p3v0) == gSimd4fZero)); + p3v1 = p3v1 + ((splat<1>(rw3) * d3) & ~(splat<3>(p3v1) == gSimd4fZero)); + p3v2 = p3v2 + ((splat<2>(rw3) * d3) & ~(splat<3>(p3v2) == gSimd4fZero)); // write back particles storeAligned(particles, vpIt[0] * sizeof(PxVec4), p0v0); diff --git a/NvCloth/src/SwFabric.cpp b/NvCloth/src/SwFabric.cpp index 9830398..b8d617f 100644 --- a/NvCloth/src/SwFabric.cpp +++ b/NvCloth/src/SwFabric.cpp @@ -162,7 +162,7 @@ uint32_t cloth::SwFabric::getNumStiffnessValues() const uint32_t cloth::SwFabric::getNumSets() const { - return uint32_t(mSets.size() - 1); + return uint32_t(physx::PxMax(0, static_cast<int>(mSets.size() - 1))); } uint32_t cloth::SwFabric::getNumIndices() const diff --git a/NvCloth/src/SwInterCollision.cpp b/NvCloth/src/SwInterCollision.cpp index 50be414..bc46ea6 100644 --- a/NvCloth/src/SwInterCollision.cpp +++ b/NvCloth/src/SwInterCollision.cpp @@ -49,6 +49,7 @@ const Simd4fTupleFactory sMaskW = simd4f(simd4i(0, 0, 0, ~0)); const Simd4fScalarFactory sEpsilon = simd4f(FLT_EPSILON); const Simd4fTupleFactory sZeroW = simd4f(-FLT_MAX, -FLT_MAX, -FLT_MAX, 0.0f); +// Same as radixSort from SwSelfCollision.cpp but with uint32_t instead of uint16_t // returns sorted indices, output needs to be at least 2*(last - first) + 1024 void radixSort(const uint32_t* first, const uint32_t* last, uint32_t* out) { @@ -226,13 +227,15 @@ uint32_t calculatePotentialColliders(const cloth::SwInterCollisionData* cBegin, uint32_t* sortedIndices = static_cast<uint32_t*>(allocator.allocate(numCloths * sizeof(uint32_t))); + // fill clothBounds, sortedIndices, and calculate totalClothBounds in world space for (uint32_t i = 0; i < numCloths; ++i) { const SwInterCollisionData& c = cBegin[i]; - // transform bounds from b local space to local space of a + // grow bounds with the collision distance colDist PxBounds3 lcBounds = PxBounds3::centerExtents(c.mBoundsCenter, c.mBoundsHalfExtent + PxVec3(array(colDist)[0])); NV_CLOTH_ASSERT(!lcBounds.isEmpty()); + // transform bounds to world space PxBounds3 cWorld = PxBounds3::transformFast(c.mGlobalPose,lcBounds); BoundingBox cBounds = { simd4f(cWorld.minimum.x, cWorld.minimum.y, cWorld.minimum.z, 0.0f), @@ -244,9 +247,11 @@ uint32_t calculatePotentialColliders(const cloth::SwInterCollisionData* cBegin, totalClothBounds = expandBounds(totalClothBounds, cBounds); } - // sort indices by their minimum extent on the longest axis + // The sweep axis is the longest extent of totalClothBounds + // 0 = x axis, 1 = y axis, etc. so that vectors can be indexed using v[sweepAxis] const uint32_t sweepAxis = longestAxis(totalClothBounds.mUpper - totalClothBounds.mLower); + // sort indices by their minimum extent on the sweep axis ClothSorter<T4f> predicate(clothBounds, numCloths, sweepAxis); shdfnd::sort(sortedIndices, numCloths, predicate, nv::cloth::NonTrackingAllocator()); @@ -270,10 +275,10 @@ uint32_t calculatePotentialColliders(const cloth::SwInterCollisionData* cBegin, uint32_t overlapMask = 0; uint32_t numOverlaps = 0; - // scan back to find first intersecting bounding box - uint32_t startIndex = i; - while (startIndex > 0 && array(clothBounds[sortedIndices[startIndex]].mUpper)[sweepAxis] > axisMin) - --startIndex; + // scan forward to skip non intersecting bounds + uint32_t startIndex = 0; + while(startIndex < numCloths && array(clothBounds[sortedIndices[startIndex]].mUpper)[sweepAxis] < axisMin) + startIndex++; // compute all overlapping bounds for (uint32_t j = startIndex; j < numCloths; ++j) @@ -361,7 +366,7 @@ uint32_t calculatePotentialColliders(const cloth::SwInterCollisionData* cBegin, // output each particle only once ++numParticles; - break; + break; // the particle only has to be inside one of the bounds, it doesn't matter if they are in more than one } } } @@ -401,7 +406,7 @@ void cloth::SwInterCollision<T4f>::operator()() // world bounds of particles BoundingBox<T4f> bounds = emptyBounds<T4f>(); - // calculate potentially colliding set + // calculate potentially colliding set (based on bounding boxes) { NV_CLOTH_PROFILE_ZONE("cloth::SwInterCollision::BroadPhase", /*ProfileContext::None*/ 0); @@ -415,6 +420,8 @@ void cloth::SwInterCollision<T4f>::operator()() { NV_CLOTH_PROFILE_ZONE("cloth::SwInterCollision::Collide", /*ProfileContext::None*/ 0); + //Note: this code is almost the same as cloth::SwSelfCollision<T4f>::operator() + T4f lowerBound = bounds.mLower; T4f edgeLength = max(bounds.mUpper - lowerBound, sEpsilon); @@ -423,11 +430,12 @@ void cloth::SwInterCollision<T4f>::operator()() uint32_t hashAxis0 = (sweepAxis + 1) % 3; uint32_t hashAxis1 = (sweepAxis + 2) % 3; - // reserve 0, 127, and 65535 for sentinel + // reserve 0, 255, and 65535 for sentinel T4f cellSize = max(mCollisionDistance, simd4f(1.0f / 253) * edgeLength); array(cellSize)[sweepAxis] = array(edgeLength)[sweepAxis] / 65533; T4f one = gSimd4fOne; + // +1 for sentinel 0 offset T4f gridSize = simd4f(254.0f); array(gridSize)[sweepAxis] = 65534.0f; @@ -564,6 +572,7 @@ size_t cloth::SwInterCollision<T4f>::getBufferSize(uint32_t numParticles) template <typename T4f> void cloth::SwInterCollision<T4f>::collideParticle(uint32_t index) { + // The other particle is passed through member variables (mParticle) uint16_t clothIndex = mClothIndices[index]; if ((1 << clothIndex) & ~mClothMask) @@ -574,6 +583,8 @@ void cloth::SwInterCollision<T4f>::collideParticle(uint32_t index) uint32_t particleIndex = mParticleIndices[index]; T4f& particle = reinterpret_cast<T4f&>(instance->mParticles[particleIndex]); + + //very similar to cloth::SwSelfCollision<T4f>::collideParticles T4f diff = particle - mParticle; T4f distSqr = dot3(diff, diff); @@ -609,6 +620,8 @@ void cloth::SwInterCollision<T4f>::collideParticles(const uint32_t* keys, uint32 const uint32_t* indices, uint32_t numParticles, uint32_t collisionDistance) { + //very similar to cloth::SwSelfCollision<T4f>::collideParticles + const uint32_t bucketMask = uint16_t(-1); const uint32_t keyOffsets[] = { 0, 0x00010000, 0x00ff0000, 0x01000000, 0x01010000 }; @@ -639,8 +652,9 @@ void cloth::SwInterCollision<T4f>::collideParticles(const uint32_t* keys, uint32 ++kIt; kLast[k] = kIt; - // jump forward once to second column - kIt = keys + firstColumnSize; + // jump forward once to second column to go from cell offset 1 to 2 quickly + if(firstColumnSize) + kIt = keys + firstColumnSize; firstColumnSize = 0; } } diff --git a/NvCloth/src/SwSelfCollision.cpp b/NvCloth/src/SwSelfCollision.cpp index ec5a166..f39217b 100644 --- a/NvCloth/src/SwSelfCollision.cpp +++ b/NvCloth/src/SwSelfCollision.cpp @@ -45,6 +45,7 @@ namespace // returns sorted indices, output needs to be at least 2*(last - first) + 1024 void radixSort(const uint32_t* first, const uint32_t* last, uint16_t* out) { + // Note: This function is almost exactly duplicated in SwInterCollision.cpp // this sort uses a radix (bin) size of 256, requiring 4 bins to sort the 32 bit keys uint16_t n = uint16_t(last - first); diff --git a/NvCloth/src/SwSolverKernel.cpp b/NvCloth/src/SwSolverKernel.cpp index 2181b1e..ab612e2 100644 --- a/NvCloth/src/SwSolverKernel.cpp +++ b/NvCloth/src/SwSolverKernel.cpp @@ -276,6 +276,8 @@ void constrainSeparation(T4f* __restrict curIt, const T4f* __restrict curEnd, co } } + + /** traditional gauss-seidel internal constraint solver */ @@ -350,7 +352,7 @@ void solveConstraints(float* __restrict posIt, const float* __restrict rIt, cons erij = erij - multiplier * max(compressionLimit, min(erij, stretchLimit)); } - //ex = er * stiffness / sqrt(epsilon + vw) + //ex = er * stiffness / (epsilon + inMass sum) T4f exij = erij * stij * recip(gSimd4fEpsilon + vwij); //h = h * ex @@ -375,7 +377,7 @@ void solveConstraints(float* __restrict posIt, const float* __restrict rIt, cons #include "sse2/SwSolveConstraints.h" #endif -// calculates upper bound of all position deltas +// Calculates upper bound of all position deltas template <typename T4f> T4f calculateMaxDelta(const T4f* prevIt, const T4f* curIt, const T4f* curEnd) { @@ -391,6 +393,11 @@ void applyWind(T4f* __restrict curIt, const T4f* __restrict prevIt, const uint16 const uint16_t* __restrict tEnd, float itrDtf, float dragCoefficientf, float liftCoefficientf, float fluidDensityf, T4f wind, const T4f (&rotation)[3]) { + // Note: Enabling wind can amplify bad behavior since the impulse scales with area, + // and the area of triangles increases when constraints are violated. + // Using the initial triangle area based on the rest length is one possible way to + // prevent this, but is expensive (and incorrect for intentionally stretchy cloth). + const T4f dragCoefficient = simd4f(dragCoefficientf); const T4f liftCoefficient = simd4f(liftCoefficientf); const T4f fluidDensity = simd4f(fluidDensityf); diff --git a/NvCloth/src/TripletScheduler.cpp b/NvCloth/src/TripletScheduler.cpp index 999be0e..0116200 100644 --- a/NvCloth/src/TripletScheduler.cpp +++ b/NvCloth/src/TripletScheduler.cpp @@ -39,45 +39,132 @@ cloth::TripletScheduler::TripletScheduler(Range<const uint32_t[4]> triplets) { } -// SSE version +// helper functions for TripletScheduler::simd() +namespace +{ + //linear search should be fine for small particlesInBatchSize values + bool isIndexInBatch(uint32_t* particlesInBatch, int particlesInBatchSize, uint32_t index1, uint32_t index2, uint32_t index3) + { + for(int i = 0; i < particlesInBatchSize; i++) + { + if(particlesInBatch[i] == 0xffffffff) + return false; + if(index1 == particlesInBatch[i] || index2 == particlesInBatch[i] || index3 == particlesInBatch[i]) + return true; + } + return false; + } + + void markIndicesInBatch(uint32_t* particlesInBatch, int particlesInBatchSize, uint32_t index1, uint32_t index2, uint32_t index3) + { + for(int i = 0; i < particlesInBatchSize - 2; i++) + { + if(particlesInBatch[i] == 0xffffffff) + { + NV_CLOTH_ASSERT(i + 2 < particlesInBatchSize); + particlesInBatch[i] = index1; + particlesInBatch[i + 1] = index2; + particlesInBatch[i + 2] = index3; + break; + } + } + } +} + +/* +Group triplets in batches of simdWith so that they can be processed in parallel without referencing the same particle. +Not suitable for simdWidth with large values due to linear search. +Note that sets can be larger that simdWidth, and that particles may be referenced from the same set multiple times, + as long as they are not referenced more than once within a batch of simdWidth. The batch resets at the start of + each set. +*/ void cloth::TripletScheduler::simd(uint32_t numParticles, uint32_t simdWidth) { - if (mTriplets.empty()) + PX_UNUSED(numParticles); + if(mTriplets.empty()) return; - Vector<uint32_t>::Type mark(numParticles, uint32_t(-1)); + const int particlesInBatchSize = simdWidth * 3; + uint32_t* particlesInBatch = new uint32_t[particlesInBatchSize]; //used to mark used indices in current batch + int setSize = 0; //number of triplets in current set + int amountOfPaddingNeeded = 0; - uint32_t setIndex = 0, setSize = 0; - for (TripletIter tIt = mTriplets.begin(), tEnd = mTriplets.end(); tIt != tEnd; ++setIndex) + for(TripletIter tIt = mTriplets.begin(), tEnd = mTriplets.end(); tIt != tEnd; tIt++) { - TripletIter tLast = tIt + std::min(simdWidth, uint32_t(tEnd - tIt)); - TripletIter tSwap = tEnd; - - for (; tIt != tLast && tIt != tSwap; ++tIt, ++setSize) + if(setSize%simdWidth == 0) { - // swap from tail until independent triplet found - while ((mark[tIt->x] == setIndex || mark[tIt->y] == setIndex || mark[tIt->z] == setIndex) && tIt != --tSwap) - std::iter_swap(tIt, tSwap); + //reset particlesInBatch if we filled the simd width + memset(particlesInBatch, 0xff, sizeof(uint32_t)*particlesInBatchSize); + } - if (tIt == tSwap) - break; // no independent triplet found + TripletIter tSwap = tIt; - // mark vertices to be used in simdIndex - mark[tIt->x] = setIndex; - mark[tIt->y] = setIndex; - mark[tIt->z] = setIndex; + //Look for the next triplet that doesn't share indices with current batch + while(isIndexInBatch(particlesInBatch, particlesInBatchSize, tSwap->x, tSwap->y, tSwap->z)) + { + tSwap++; + if(tSwap == tEnd) + break; } - if (tIt == tSwap) // remaining triplets depend on current set + if(tSwap == tEnd) { - if (setSize > simdWidth) // trim set to multiple of simdWidth - { - uint32_t overflow = setSize % simdWidth; - setSize -= overflow; - tIt -= overflow; - } + //all remaining triplets are share indices with the current batch + + //This doesn't make sense, as it will just introduce an extra set with setSize<simdWidth + //Should double check though + // // trim set to multiple of simdWidth with the hope that the trimmed triples + // // will fit in the next set with a multiple of simdWidth size + // if(setSize > (int)simdWidth) + // { + // uint32_t overflow = setSize % simdWidth; + // setSize -= overflow; + // tIt -= overflow; + // } + + //finish set mSetSizes.pushBack(setSize); + amountOfPaddingNeeded += (static_cast<int>(simdWidth) - (setSize % static_cast<int>(simdWidth))) % static_cast<int>(simdWidth); //last modulo avoids adding padding when setSize%simdWidth==0 setSize = 0; + tIt--; //start next set with current triplet + } + else + { + //add triplet to set + markIndicesInBatch(particlesInBatch, particlesInBatchSize, tSwap->x, tSwap->y, tSwap->z); + std::iter_swap(tIt, tSwap); //Place triplet at the end of the current set, so that they in sequence in mTriplets + setSize++; + } + } + if(setSize) + { + //finish last set, if we have one + mSetSizes.pushBack(setSize); + amountOfPaddingNeeded += (static_cast<int>(simdWidth) - (setSize % static_cast<int>(simdWidth)))% static_cast<int>(simdWidth); + } + + // Padding code used to live in SwCloth::setVirtualParticles, now we do it here directly + mPaddedTriplets.reserve(mTriplets.size() + amountOfPaddingNeeded); + + { + Vec4us paddingDummy(static_cast<uint16_t>(numParticles), static_cast<uint16_t>(numParticles) + 1, static_cast<uint16_t>(numParticles) + 2, 0); + + TripletIter tIt = mTriplets.begin(); + //TripletIter tEnd = mTriplets.end(); + SetIter setIt = mSetSizes.begin(); + SetIter setEnd = mSetSizes.end(); + for(; setIt < setEnd; ++setIt) + { + for(unsigned int i = 0; i < *setIt; i++) + { + mPaddedTriplets.pushBack(*tIt); + ++tIt; + } + unsigned int padding = (static_cast<int>(simdWidth) - (*setIt % static_cast<int>(simdWidth))) % static_cast<int>(simdWidth); + for(unsigned int i = 0; i < padding; i++) + { + mPaddedTriplets.pushBack(paddingDummy); + } } } } @@ -95,8 +182,10 @@ struct TripletSet } uint32_t mMark; // triplet index - uint8_t mNumReplays[3]; - uint8_t mNumConflicts[3][32]; + + // [3] because each particle is fetched on a different instruction. + uint8_t mNumReplays[3]; //how many times the instruction needs to be executed to resolve all bank conflicts (= maxElement(mNumConflicts[3][0...31])) + uint8_t mNumConflicts[3][32]; //Number of accesses for each of the 32 banks (if it is 1 then there is no conflict on that bank) }; /* @@ -132,83 +221,147 @@ void prefixSum(Iter first, Iter last, Iter dest) *dest = *(dest - 1) + *first; } } -} -// CUDA version -void cloth::TripletScheduler::warp(uint32_t numParticles, uint32_t warpWidth) +class AdjacencyQuerier { - // NV_CLOTH_ASSERT(warpWidth == 32 || warpWidth == 16); +private: + typedef nv::cloth::Vector<uint32_t>::Type UintVector; + typedef nv::cloth::Vector<uint32_t>::Type::Iterator UintVectorIterator; + typedef nv::cloth::Vector<nv::cloth::Vec4u>::Type::ConstIterator ConstTripletIter; + UintVector mAdjacencyIndecies; + UintVector mAdjacencies; + uint32_t mMaxAdjacentCount; + +public: + AdjacencyQuerier(nv::cloth::Vector<nv::cloth::Vec4u>::Type const& triplets, int particleCount) + { + { + UintVector& adjacencyCount = mAdjacencyIndecies; //use same memory as mAdjacencyIndecies + adjacencyCount.resize(particleCount+1, uint32_t(0)); //initialize count to 0. Size+1 used later for prefixSum/indices - if (mTriplets.empty()) - return; + // count the number of triplets per particle + for(ConstTripletIter tIt = triplets.begin(); tIt != triplets.end(); ++tIt) + { + for(int i = 0; i < 3; i++) //for each index in the triplet + { + const uint32_t particleIndex = (*tIt)[i]; + adjacencyCount[particleIndex] += 1; + } + } + //adjacentcyCount[i] now holds the number of triplets referring to particle i - TripletIter tIt, tEnd = mTriplets.end(); - uint32_t tripletIndex; + mMaxAdjacentCount = *physx::shdfnd::maxElement(adjacencyCount.begin(), adjacencyCount.end()); - // count number of triplets per particle - Vector<uint32_t>::Type adjacentCount(numParticles + 1, uint32_t(0)); - for (tIt = mTriplets.begin(); tIt != tEnd; ++tIt) - for (int i = 0; i < 3; ++i) - ++adjacentCount[(*tIt)[i]]; + // compute in place prefix sum (inclusive) + prefixSum(adjacencyCount.begin(), adjacencyCount.end(), mAdjacencyIndecies.begin()); + // we use mAdjacencyIndecies from now on + } - /* neither of those were really improving number of batches: - // run simd version to pre-sort particles - simd(numParticles, blockWidth); mSetSizes.resize(0); - // sort according to triplet degree (estimated by sum of adjacentCount) - std::sort(mTriplets.begin(), tEnd, GreaterSum(adjacentCount)); - */ + mAdjacencies.resize(mAdjacencyIndecies.back()); + uint32_t tripletIndex = 0; + for(ConstTripletIter tIt = triplets.begin(); tIt != triplets.end(); ++tIt, ++tripletIndex) + { + for(int i = 0; i < 3; i++) //for each index in the triplet + { + const uint32_t particleIndex = (*tIt)[i]; - uint32_t maxTripletCount = *shdfnd::maxElement(adjacentCount.begin(), adjacentCount.end()); + //Decrement mAdjacencyIndecies here to convert it from inclusive to exclusive prefix sum + const uint32_t adjacencyIndex = --mAdjacencyIndecies[particleIndex]; - // compute in place prefix sum (inclusive) - prefixSum(adjacentCount.begin(), adjacentCount.end(), adjacentCount.begin()); + mAdjacencies[adjacencyIndex] = tripletIndex; + } + } + //mAdjacencies[mAdjacencyIndecies[i]] now stores the first triplet index referring to particle i + //mAdjacencies[mAdjacencyIndecies[i+1]] stores one past the last + } - // initialize adjacencies (for each particle, collect touching triplets) - // also converts partial sum in adjacentCount from inclusive to exclusive - Vector<uint32_t>::Type adjacencies(adjacentCount.back()); - for (tIt = mTriplets.begin(), tripletIndex = 0; tIt != tEnd; ++tIt, ++tripletIndex) - for (int i = 0; i < 3; ++i) - adjacencies[--adjacentCount[(*tIt)[i]]] = tripletIndex; + uint32_t getMaxAdjacentCount() const + { + return mMaxAdjacentCount; + } + + nv::cloth::Range<const uint32_t> getAdjacentTriplets(uint32_t particleIndex) const + { + //use .begin() + offset to get around bounds check + auto begin = &*(mAdjacencies.begin() + mAdjacencyIndecies[particleIndex]); + auto end = &*(mAdjacencies.begin() + mAdjacencyIndecies[particleIndex + 1]); + return nv::cloth::Range<const uint32_t>(begin, end); + } +}; + +} + +// CUDA version. Doesn't add padding, optimizes for bank conflicts +void cloth::TripletScheduler::warp(uint32_t numParticles, uint32_t warpWidth) +{ + // warpWidth has to be <= 32 and a power of two + NV_CLOTH_ASSERT(warpWidth == 32 || warpWidth == 16); + + if (mTriplets.empty()) + return; + + AdjacencyQuerier adjacencyQuerier(mTriplets, numParticles); uint32_t warpMask = warpWidth - 1; - uint32_t numSets = maxTripletCount; // start with minimum number of sets + uint32_t numSets = adjacencyQuerier.getMaxAdjacentCount(); // start with minimum number of sets Vector<TripletSet>::Type sets(numSets); + + // maps triplets to the set index that contains the triplet (setIndex = setIndices[tripletIndex]) Vector<uint32_t>::Type setIndices(mTriplets.size(), uint32_t(-1)); + mSetSizes.resize(numSets); + uint32_t tripletIndex = 0; + TripletIter tIt, tEnd = mTriplets.end(); + // color triplets (assign to sets) - Vector<uint32_t>::Type::ConstIterator aBegin = adjacencies.begin(), aIt, aEnd; - for (tIt = mTriplets.begin(), tripletIndex = 0; tIt != tEnd; ++tIt, ++tripletIndex) + for (tIt = mTriplets.begin(); tIt != tEnd; ++tIt, ++tripletIndex) { - // mark sets of adjacent triplets + // mark sets with adjacent triplets for (int i = 0; i < 3; ++i) { uint32_t particleIndex = (*tIt)[i]; - aIt = aBegin + adjacentCount[particleIndex]; - aEnd = aBegin + adjacentCount[particleIndex + 1]; - for (uint32_t setIndex; aIt != aEnd; ++aIt) - if (numSets > (setIndex = setIndices[*aIt])) + Range<const uint32_t> adjacentTriplets = adjacencyQuerier.getAdjacentTriplets(particleIndex); + + //for all triplets adjacent to (sharing) particle 'particleIndex' + for (; adjacentTriplets.begin() != adjacentTriplets.end(); adjacentTriplets.popFront()) + { + uint32_t setIndex; + if(numSets > (setIndex = setIndices[adjacentTriplets.front()])) + { + //the set 'setIndex' has an adjacent triplet in it (sharing at least one vertex with tripletIndex) sets[setIndex].mMark = tripletIndex; + } + } } - // find valid set with smallest number of bank conflicts + // find valid set with smallest number of additional replays when adding triplet tIt uint32_t bestIndex = numSets; - uint32_t minReplays = 4; - for (uint32_t setIndex = 0; setIndex < numSets && minReplays; ++setIndex) + uint32_t minAdditionalReplays = 4; // one more than the maximum possible (for 3 particles) + for (uint32_t setIndex = 0; setIndex < numSets && minAdditionalReplays; ++setIndex) { const TripletSet& set = sets[setIndex]; if (set.mMark == tripletIndex) - continue; // triplet collision + continue; // triplet collision (this set contains an adjacent triplet) + // count additional replays caused by inserting triplet 'tIt' to in set 'setIndex' uint32_t numReplays = 0; - for (uint32_t i = 0; i < 3; ++i) - numReplays += set.mNumReplays[i] == set.mNumConflicts[i][warpMask & (*tIt)[i]]; + for(uint32_t i = 0; i < 3; ++i) //for each particle in the triplet + { + const uint32_t particleIndex = (*tIt)[i]; + const uint32_t bank = warpMask & particleIndex; - if (minReplays > numReplays) + // an additional replay will only be caused if the additional bank conflict is + // on a bank that is the current bottleneck + numReplays += set.mNumReplays[i] == set.mNumConflicts[i][bank]; + } + + // hold on to this set if it has less additional replays compared to our current best set + if (minAdditionalReplays > numReplays) { - minReplays = numReplays; + minAdditionalReplays = numReplays; bestIndex = setIndex; } } @@ -221,18 +374,26 @@ void cloth::TripletScheduler::warp(uint32_t numParticles, uint32_t warpWidth) ++numSets; } - // increment bank conflicts or reset if warp filled + // increment mNumReplays or reset if warp filled TripletSet& set = sets[bestIndex]; if (++mSetSizes[bestIndex] & warpMask) + { for (uint32_t i = 0; i < 3; ++i) - set.mNumReplays[i] = std::max(set.mNumReplays[i], ++set.mNumConflicts[i][warpMask & (*tIt)[i]]); + { + const uint32_t particleIndex = (*tIt)[i]; + const uint32_t bank = warpMask & particleIndex; + set.mNumReplays[i] = std::max(set.mNumReplays[i], ++set.mNumConflicts[i][bank]); + } + } else + { set = TripletSet(); + } setIndices[tripletIndex] = bestIndex; } - // reorder triplets + // reorder triplets so that the sets are in continuous memory Vector<uint32_t>::Type setOffsets(mSetSizes.size()); prefixSum(mSetSizes.begin(), mSetSizes.end(), setOffsets.begin()); @@ -243,3 +404,47 @@ void cloth::TripletScheduler::warp(uint32_t numParticles, uint32_t warpWidth) mTriplets.swap(triplets); } + + +/* Interface used for unit tests */ +cloth::TripletSchedulerTestInterface::TripletSchedulerTestInterface(Range<const uint32_t[4]> triplets) + : + mScheduler(new cloth::TripletScheduler(triplets)) +{} + +cloth::TripletSchedulerTestInterface::~TripletSchedulerTestInterface() +{ + delete mScheduler; +} +void cloth::TripletSchedulerTestInterface::simd(uint32_t numParticles, uint32_t simdWidth) +{ + mScheduler->simd(numParticles, simdWidth); +} +void cloth::TripletSchedulerTestInterface::warp(uint32_t numParticles, uint32_t warpWidth) +{ + mScheduler->warp(numParticles, warpWidth); +} + +cloth::Range<const uint32_t> cloth::TripletSchedulerTestInterface::GetTriplets() +{ + return Range<const uint32_t>( + reinterpret_cast<uint32_t*>(mScheduler->mTriplets.begin()), + reinterpret_cast<uint32_t*>(mScheduler->mTriplets.begin() + mScheduler->mTriplets.size()) + ); +} +cloth::Range<const uint32_t> cloth::TripletSchedulerTestInterface::GetSetSizes() +{ + return Range<const uint32_t>( + mScheduler->mSetSizes.begin(), + mScheduler->mSetSizes.begin() + mScheduler->mSetSizes.size() + ); +} + +NV_CLOTH_API(nv::cloth::TripletSchedulerTestInterface*) NvClothCreateTripletScheduler(nv::cloth::Range<const uint32_t[4]> triplets) +{ + return new nv::cloth::TripletSchedulerTestInterface(triplets); +} +NV_CLOTH_API(void) NvClothDestroyTripletScheduler(nv::cloth::TripletSchedulerTestInterface* interface) +{ + delete interface; +} diff --git a/NvCloth/src/TripletScheduler.h b/NvCloth/src/TripletScheduler.h index 26119cf..81dc889 100644 --- a/NvCloth/src/TripletScheduler.h +++ b/NvCloth/src/TripletScheduler.h @@ -43,6 +43,7 @@ struct TripletScheduler { typedef Vector<Vec4u>::Type::ConstIterator ConstTripletIter; typedef Vector<Vec4u>::Type::Iterator TripletIter; + typedef Vector<uint32_t>::Type::Iterator SetIter; TripletScheduler(Range<const uint32_t[4]>); void simd(uint32_t numParticles, uint32_t simdWidth); @@ -50,6 +51,32 @@ struct TripletScheduler Vector<Vec4u>::Type mTriplets; Vector<uint32_t>::Type mSetSizes; + Vector<Vec4us>::Type mPaddedTriplets; }; } } + +//Make TripletScheduler available for the unit tests. +namespace nv +{ +namespace cloth +{ +struct NV_CLOTH_IMPORT TripletSchedulerTestInterface +{ +private: + TripletScheduler* mScheduler; + +public: + TripletSchedulerTestInterface(Range<const uint32_t[4]> triplets); + ~TripletSchedulerTestInterface(); + void simd(uint32_t numParticles, uint32_t simdWidth); + void warp(uint32_t numParticles, uint32_t warpWidth); + + Range<const uint32_t> GetTriplets(); + Range<const uint32_t> GetSetSizes(); +}; +} +} + +NV_CLOTH_API(nv::cloth::TripletSchedulerTestInterface*) NvClothCreateTripletScheduler(nv::cloth::Range<const uint32_t[4]>); +NV_CLOTH_API(void) NvClothDestroyTripletScheduler(nv::cloth::TripletSchedulerTestInterface*); diff --git a/NvCloth/src/cuda/CuCloth.h b/NvCloth/src/cuda/CuCloth.h index 8dc3082..f7d8268 100644 --- a/NvCloth/src/cuda/CuCloth.h +++ b/NvCloth/src/cuda/CuCloth.h @@ -131,7 +131,7 @@ class CuCloth : protected CuContextLock, public ClothImpl<CuCloth> uint32_t getSharedMemorySize() const; // without particle data // expects transformed configs, doesn't call notifyChanged() - void setPhaseConfigInternal(Range<const PhaseConfig>); + void setPhaseConfigInternal(Range<const PhaseConfig> configs); Range<physx::PxVec4> push(CuConstraints&); void clear(CuConstraints&); diff --git a/NvCloth/src/cuda/CuCollision.h b/NvCloth/src/cuda/CuCollision.h index aeb2bda..f9b69f7 100644 --- a/NvCloth/src/cuda/CuCollision.h +++ b/NvCloth/src/cuda/CuCollision.h @@ -1160,9 +1160,10 @@ __device__ void apply(ParticleDataT& particleData, const int4& indices, const fl posX.y += delta.y * weights.x; posY.y += delta.y * weights.y; posZ.y += delta.y * weights.z; posX.z += delta.z * weights.x; posY.z += delta.z * weights.y; posZ.z += delta.z * weights.z; - particleData(indices.x) = posX; - particleData(indices.y) = posY; - particleData(indices.z) = posZ; + //if(particle is not static) store new position + if(particleData(indices.x,3)!= 0) particleData(indices.x) = posX; + if(particleData(indices.y,3)!= 0) particleData(indices.y) = posY; + if(particleData(indices.z,3)!= 0) particleData(indices.z) = posZ; } #else template <typename PointerT> diff --git a/NvCloth/src/cuda/CuFabric.cpp b/NvCloth/src/cuda/CuFabric.cpp index 957f912..6794fa5 100644 --- a/NvCloth/src/cuda/CuFabric.cpp +++ b/NvCloth/src/cuda/CuFabric.cpp @@ -165,7 +165,8 @@ uint32_t cloth::CuFabric::getNumStiffnessValues() const uint32_t cloth::CuFabric::getNumSets() const { - return uint32_t(mSets.size() - 1); + return uint32_t(physx::PxMax(0, (int)mSets.size() - 1)); + } uint32_t cloth::CuFabric::getNumIndices() const diff --git a/NvCloth/src/cuda/CuFactory.cpp b/NvCloth/src/cuda/CuFactory.cpp index 397262c..df52dcd 100644 --- a/NvCloth/src/cuda/CuFactory.cpp +++ b/NvCloth/src/cuda/CuFactory.cpp @@ -97,6 +97,7 @@ void cloth::checkSuccessImpl(CUresult err, const char* file, const int line) ADD_CASE(CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES); ADD_CASE(CUDA_ERROR_LAUNCH_TIMEOUT); ADD_CASE(CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING); + ADD_CASE(CUDA_ERROR_ILLEGAL_ADDRESS); default: ADD_CASE(CUDA_ERROR_UNKNOWN); #undef ADD_CASE @@ -115,8 +116,9 @@ uint32_t getMaxThreadsPerBlock(CUcontext context) CUdevice device; checkSuccess(cuCtxGetDevice(&device)); - int major = 0; + int major = 0, minor = 0; checkSuccess(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device)); + checkSuccess(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device)); checkSuccess(cuCtxPopCurrent(nullptr)); @@ -149,7 +151,7 @@ cloth::Fabric* cloth::CuFactory::createFabric(uint32_t numParticles, Range<const Range<const float> tetherLengths, Range<const uint32_t> triangles) { return NV_CLOTH_NEW(CuFabric)(*this, numParticles, phaseIndices, sets, restvalues, stiffnessValues, indices, anchors, tetherLengths, triangles, - getNextFabricId()); + getNextFabricId()); } cloth::Cloth* cloth::CuFactory::createCloth(Range<const PxVec4> particles, Fabric& fabric) @@ -210,6 +212,7 @@ void cloth::CuFactory::extractFabricData(const Fabric& fabric, Range<uint32_t> p if (!sets.empty()) { + // need to skip copying the first element NV_CLOTH_ASSERT(sets.size() == cuFabric.mSets.size() - 1); const uint32_t* deviceSets = cuFabric.mSets.begin().get(); copyToHost(deviceSets + 1, deviceSets + cuFabric.mSets.size(), sets.begin()); diff --git a/NvCloth/src/cuda/CuSolverKernelBlob.h b/NvCloth/src/cuda/CuSolverKernelBlob.h new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/NvCloth/src/cuda/CuSolverKernelBlob.h diff --git a/NvCloth/src/dx/DxCloth.cpp b/NvCloth/src/dx/DxCloth.cpp index 68c3289..0b1521b 100644 --- a/NvCloth/src/dx/DxCloth.cpp +++ b/NvCloth/src/dx/DxCloth.cpp @@ -84,9 +84,9 @@ cloth::DxCloth::DxCloth(DxFactory& factory, DxFabric& fabric, Range<const PxVec4 , mTargetCollisionPlanes(mFactory.mCollisionPlanes) , mStartCollisionTriangles(mFactory.mCollisionTriangles) , mTargetCollisionTriangles(mFactory.mCollisionTriangles) -, mVirtualParticleSetSizes(mFactory.mContextManager) -, mVirtualParticleIndices(mFactory.mContextManager) -, mVirtualParticleWeights(mFactory.mContextManager) +, mVirtualParticleSetSizes(mFactory.mVirtualParticleSetSizes) +, mVirtualParticleIndices(mFactory.mVirtualParticleIndices) +, mVirtualParticleWeights(mFactory.mVirtualParticleWeights) , mRestPositions(mFactory.mRestPositions) , mSelfCollisionIndices(mFactory.mSelfCollisionIndices) , mSelfCollisionParticles(mFactory.mSelfCollisionParticles) diff --git a/NvCloth/src/dx/DxCloth.h b/NvCloth/src/dx/DxCloth.h index 5e783fa..3a04734 100644 --- a/NvCloth/src/dx/DxCloth.h +++ b/NvCloth/src/dx/DxCloth.h @@ -91,11 +91,9 @@ class DxCloth : protected DxContextLock, public ClothImpl<DxCloth> typedef DxFabric FabricType; typedef DxContextLock ContextLockType; - template <typename> - struct MapTraits; - typedef DxVectorMap<DxBatchedVector<physx::PxVec3> > MappedVec3fVectorType; typedef DxVectorMap<DxBatchedVector<physx::PxVec4> > MappedVec4fVectorType; + typedef DxVectorMap<DxBatchedVector<Vec4us> > MappedVec4usVectorType; typedef DxVectorMap<DxBatchedVector<IndexPair> > MappedIndexVectorType; typedef DxVectorMap<DxBatchedVector<uint32_t> > MappedMaskVectorType; @@ -190,9 +188,9 @@ class DxCloth : protected DxContextLock, public ClothImpl<DxCloth> float mFriction; // virtual particles - DxDeviceVector<uint32_t> mVirtualParticleSetSizes; - DxDeviceVector<Vec4us> mVirtualParticleIndices; - DxDeviceVector<physx::PxVec4> mVirtualParticleWeights; + DxBatchedVector<uint32_t> mVirtualParticleSetSizes; + DxBatchedVector<Vec4us> mVirtualParticleIndices; + DxBatchedVector<physx::PxVec4> mVirtualParticleWeights; // self collision float mSelfCollisionDistance; diff --git a/NvCloth/src/dx/DxClothData.cpp b/NvCloth/src/dx/DxClothData.cpp index 57049bf..562ce7a 100644 --- a/NvCloth/src/dx/DxClothData.cpp +++ b/NvCloth/src/dx/DxClothData.cpp @@ -71,6 +71,8 @@ cloth::DxClothData::DxClothData(DxCloth& cloth) mNumCollisionTriangles = uint32_t(cloth.mStartCollisionTriangles.size()) / 3; + mNumVirtualParticleSetSizes = cloth.mVirtualParticleSetSizes.size(); + mEnableContinuousCollision = cloth.mEnableContinuousCollision; mCollisionMassScale = cloth.mCollisionMassScale; mFrictionScale = cloth.mFriction; diff --git a/NvCloth/src/dx/DxClothData.h b/NvCloth/src/dx/DxClothData.h index 4da9be2..855a3c5 100644 --- a/NvCloth/src/dx/DxClothData.h +++ b/NvCloth/src/dx/DxClothData.h @@ -46,7 +46,7 @@ typedef unsigned int uint32_t; typedef int int32_t; #endif -static const uint32_t MaxParticlesInSharedMem = 1971; +static const uint32_t MaxParticlesInSharedMem = 1969; struct DxPhaseConfig @@ -124,6 +124,8 @@ struct DxClothData uint32_t mNumCollisionTriangles; + uint32_t mNumVirtualParticleSetSizes; + uint32_t mEnableContinuousCollision; //bool stored in uint32_t for dx alignment float mCollisionMassScale; float mFrictionScale; diff --git a/NvCloth/src/dx/DxFabric.cpp b/NvCloth/src/dx/DxFabric.cpp index 56b2b0a..cf6865a 100644 --- a/NvCloth/src/dx/DxFabric.cpp +++ b/NvCloth/src/dx/DxFabric.cpp @@ -62,30 +62,33 @@ cloth::DxFabric::DxFabric(DxFactory& factory, uint32_t numParticles, Range<const NV_CLOTH_ASSERT(restvalues.size() == stiffnessValues.size() || stiffnessValues.size() == 0); NV_CLOTH_ASSERT(mNumParticles > *shdfnd::maxElement(indices.begin(), indices.end())); - // manually convert uint32_t indices to uint16_t in temp memory - Vector<DxConstraint>::Type hostConstraints; - hostConstraints.resizeUninitialized(restvalues.size()); - Vector<DxConstraint>::Type::Iterator cIt = hostConstraints.begin(); - Vector<DxConstraint>::Type::Iterator cEnd = hostConstraints.end(); - - const uint32_t* iIt = indices.begin(); - const float* rIt = restvalues.begin(); - for (; cIt != cEnd; ++cIt) + //constraints { - cIt->mRestvalue = *rIt++; - cIt->mFirstIndex = uint16_t(*iIt++); - cIt->mSecondIndex = uint16_t(*iIt++); - } - - // copy to device vector in one go + // manually convert uint32_t indices to uint16_t in temp memory + Vector<DxConstraint>::Type hostConstraints; + hostConstraints.resizeUninitialized(restvalues.size()); + Vector<DxConstraint>::Type::Iterator cIt = hostConstraints.begin(); + Vector<DxConstraint>::Type::Iterator cEnd = hostConstraints.end(); + + const uint32_t* iIt = indices.begin(); + const float* rIt = restvalues.begin(); + for(; cIt != cEnd; ++cIt) + { + cIt->mRestvalue = *rIt++; + cIt->mFirstIndex = uint16_t(*iIt++); + cIt->mSecondIndex = uint16_t(*iIt++); + } + + // copy to device vector in one go #if 0 // Workaround for NvAPI SCG device updateSubresource size limit - mConstraintsHostCopy.assign(hostConstraints.begin(), hostConstraints.end()); - mConstraints.resize(mConstraintsHostCopy.size()); - mConstraints = mConstraintsHostCopy; + mConstraintsHostCopy.assign(hostConstraints.begin(), hostConstraints.end()); + mConstraints.resize(mConstraintsHostCopy.size()); + mConstraints = mConstraintsHostCopy; #else - mConstraints.assign(hostConstraints.begin(), hostConstraints.end()); + mConstraints.assign(hostConstraints.begin(), hostConstraints.end()); #endif + } mStiffnessValues.assign(stiffnessValues.begin(), stiffnessValues.end()); diff --git a/NvCloth/src/dx/DxFactory.cpp b/NvCloth/src/dx/DxFactory.cpp index 91f5125..7298ffe 100644 --- a/NvCloth/src/dx/DxFactory.cpp +++ b/NvCloth/src/dx/DxFactory.cpp @@ -96,6 +96,12 @@ typedef Vec4T<uint32_t> Vec4u; , mCollisionPlanesDeviceCopy(mContextManager) , mCollisionTriangles(mContextManager, DxStagingBufferPolicy()) , mCollisionTrianglesDeviceCopy(mContextManager) + , mVirtualParticleSetSizes(mContextManager, DxStagingBufferPolicy()) + , mVirtualParticleSetSizesDeviceCopy(mContextManager) + , mVirtualParticleIndices(mContextManager, DxStagingBufferPolicy()) + , mVirtualParticleIndicesDeviceCopy(mContextManager) + , mVirtualParticleWeights(mContextManager, DxStagingBufferPolicy()) + , mVirtualParticleWeightsDeviceCopy(mContextManager) , mMotionConstraints(mContextManager) , mSeparationConstraints(mContextManager) , mRestPositions(mContextManager, DxStagingBufferPolicy()) @@ -205,7 +211,7 @@ void cloth::DxFactory::extractFabricData(const Fabric& fabric, Range<uint32_t> p if (!sets.empty()) { - // need to skip copying the first element + // we don't skip first element here NV_CLOTH_ASSERT(sets.size() == dxFabric.mSets.size()); memcpy(sets.begin(), dxFabric.mSets.begin(), sets.size() * sizeof(uint32_t)); } @@ -390,8 +396,11 @@ void cloth::DxFactory::extractFabricData(const Fabric& fabric, Range<uint32_t> p uint32_t numWeights = cloth.getNumVirtualParticleWeights(); Vector<PxVec4>::Type hostWeights(numWeights, PxVec4(0.0f)); - copyToHost(hostWeights.begin(), dxCloth.mVirtualParticleWeights.mBuffer.mBuffer, 0, - hostWeights.size() * sizeof(PxVec4)); + //copyToHost(hostWeights.begin(), dxCloth.mVirtualParticleWeights.mBuffer.mBuffer, 0, + // hostWeights.size() * sizeof(PxVec4)); + NV_CLOTH_ASSERT(hostWeights.size() == dxCloth.mVirtualParticleWeights.size()); + intrinsics::memCopy(hostWeights.begin(), DxCloth::MappedVec4fVectorType(const_cast<DxCloth&>(dxCloth).mVirtualParticleWeights).begin(), + destIndices.size() * sizeof(uint32_t)); // convert weights to Vec3f PxVec3* destIt = reinterpret_cast<PxVec3*>(destWeights.begin()); @@ -408,8 +417,11 @@ void cloth::DxFactory::extractFabricData(const Fabric& fabric, Range<uint32_t> p uint32_t numIndices = cloth.getNumVirtualParticles(); Vector<Vec4us>::Type hostIndices(numIndices); - copyToHost(hostIndices.begin(), dxCloth.mVirtualParticleIndices.mBuffer.mBuffer, 0, - hostIndices.size() * sizeof(Vec4us)); + //copyToHost(hostIndices.begin(), dxCloth.mVirtualParticleIndices.mBuffer.mBuffer, 0, + // hostIndices.size() * sizeof(Vec4us)); + NV_CLOTH_ASSERT(hostIndices.size() == dxCloth.mVirtualParticleIndices.size()); + intrinsics::memCopy(hostIndices.begin(), DxCloth::MappedVec4usVectorType(const_cast<DxCloth&>(dxCloth).mVirtualParticleIndices).begin(), + destIndices.size() * sizeof(uint32_t)); // convert indices to 32 bit Vec4u* destIt = reinterpret_cast<Vec4u*>(destIndices.begin()); diff --git a/NvCloth/src/dx/DxFactory.h b/NvCloth/src/dx/DxFactory.h index 23d5382..d5f985f 100644 --- a/NvCloth/src/dx/DxFactory.h +++ b/NvCloth/src/dx/DxFactory.h @@ -35,6 +35,7 @@ #include "../IndexPair.h" #include <foundation/PxVec4.h> #include <foundation/PxVec3.h> +#include "Vec4T.h" #if _MSC_VER >= 1700 #pragma warning(disable : 4471) @@ -81,8 +82,8 @@ protected: virtual Cloth* clone(const Cloth& cloth); - virtual void extractFabricData(const Fabric& fabric, Range<uint32_t> phaseIndices, Range<uint32_t> sets, - Range<float> restvalues, Range<float> stiffnessValues, Range<uint32_t> indices, Range<uint32_t> anchors, + virtual void extractFabricData( const Fabric& fabric, Range<uint32_t> phaseIndices, Range<uint32_t> sets, + Range<float> restvalues, Range<float> stiffnessValues, Range<uint32_t> indices, Range<uint32_t> anchors, Range<float> tetherLengths, Range<uint32_t> triangles) const; virtual void extractCollisionData(const Cloth& cloth, Range<physx::PxVec4> spheres, Range<uint32_t> capsules, @@ -145,6 +146,12 @@ protected: DxBatchedStorage<physx::PxVec3> mCollisionTriangles; DxBuffer<physx::PxVec3> mCollisionTrianglesDeviceCopy; + DxBatchedStorage<uint32_t> mVirtualParticleSetSizes; + DxBuffer<uint32_t> mVirtualParticleSetSizesDeviceCopy; + DxBatchedStorage<Vec4us> mVirtualParticleIndices; + DxBuffer<Vec4us> mVirtualParticleIndicesDeviceCopy; + DxBatchedStorage<physx::PxVec4> mVirtualParticleWeights; + DxBuffer<physx::PxVec4> mVirtualParticleWeightsDeviceCopy; DxBatchedStorage<physx::PxVec4> mMotionConstraints; DxBatchedStorage<physx::PxVec4> mSeparationConstraints; @@ -157,6 +164,7 @@ protected: DxBatchedStorage<uint32_t> mSelfCollisionData; DxBatchedStorage<uint32_t> mTriangles; + }; } } diff --git a/NvCloth/src/dx/DxSolver.cpp b/NvCloth/src/dx/DxSolver.cpp index 66a8d8f..21caa0b 100644 --- a/NvCloth/src/dx/DxSolver.cpp +++ b/NvCloth/src/dx/DxSolver.cpp @@ -350,6 +350,9 @@ void cloth::DxSolver::beginFrame() mFactory.mConvexMasksDeviceCopy = mFactory.mConvexMasks.mBuffer; mFactory.mCollisionPlanesDeviceCopy = mFactory.mCollisionPlanes.mBuffer; mFactory.mCollisionTrianglesDeviceCopy = mFactory.mCollisionTriangles.mBuffer; + mFactory.mVirtualParticleSetSizesDeviceCopy = mFactory.mVirtualParticleSetSizes.mBuffer; + mFactory.mVirtualParticleIndicesDeviceCopy = mFactory.mVirtualParticleIndices.mBuffer; + mFactory.mVirtualParticleWeightsDeviceCopy = mFactory.mVirtualParticleWeights.mBuffer; // mFactory.mParticleAccelerations = mFactory.mParticleAccelerationsHostCopy; mFactory.mRestPositionsDeviceCopy = mFactory.mRestPositions.mBuffer; } @@ -369,7 +372,8 @@ void cloth::DxSolver::executeKernel() { context->CSSetShader(mFactory.mSolverKernelComputeShader, NULL, 0); - ID3D11ShaderResourceView* resourceViews[18] = { + // Set shader StructuredBuffer registers t0-t20 + ID3D11ShaderResourceView* resourceViews[21] = { mClothData.mBuffer.resourceView(), /*mFrameData.mBuffer.resourceView()*/NULL, mIterationData.mBuffer.resourceView(), mFactory.mPhaseConfigs.mBuffer.resourceView(), mFactory.mConstraints.mBuffer.resourceView(), mFactory.mTethers.mBuffer.resourceView(), @@ -382,10 +386,15 @@ void cloth::DxSolver::executeKernel() mFactory.mRestPositionsDeviceCopy.resourceView(), mFactory.mSelfCollisionIndices.mBuffer.resourceView(), mFactory.mStiffnessValues.mBuffer.resourceView(), - mFactory.mTriangles.mBuffer.resourceView() + mFactory.mTriangles.mBuffer.resourceView(), + + mFactory.mVirtualParticleSetSizesDeviceCopy.resourceView(), + mFactory.mVirtualParticleIndicesDeviceCopy.resourceView(), + mFactory.mVirtualParticleWeightsDeviceCopy.resourceView() }; - context->CSSetShaderResources(0, 18, resourceViews); + context->CSSetShaderResources(0, 21, resourceViews); + // Set shader RWStructuredBuffer registers u0-u4 ID3D11UnorderedAccessView* accessViews[4] = { mFactory.mParticles.mBuffer.accessViewRaw(), mFactory.mSelfCollisionParticles.mBuffer.accessView(), @@ -398,10 +407,10 @@ void cloth::DxSolver::executeKernel() context->CSSetShader(NULL, NULL, 0); - ID3D11ShaderResourceView* resourceViewsNULL[17] = { - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL + ID3D11ShaderResourceView* resourceViewsNULL[21] = { + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL }; - context->CSSetShaderResources(0, 17, resourceViewsNULL); + context->CSSetShaderResources(0, 21, resourceViewsNULL); ID3D11UnorderedAccessView* accessViewsNULL[4] = { NULL, NULL, NULL, NULL }; context->CSSetUnorderedAccessViews(0, 4, accessViewsNULL, NULL); #if 0 diff --git a/NvCloth/src/dx/DxSolverKernel.hlsl b/NvCloth/src/dx/DxSolverKernel.hlsl index 4d91f4b..9c02468 100644 --- a/NvCloth/src/dx/DxSolverKernel.hlsl +++ b/NvCloth/src/dx/DxSolverKernel.hlsl @@ -77,6 +77,10 @@ StructuredBuffer<float> bPerConstraintStiffness : register(t16); //Note that the buffer actually contains uint16_t values StructuredBuffer<uint32_t> bTriangles : register(t17); +StructuredBuffer<int32_t> bVirtualParticleSetSizes : register(t18); +StructuredBuffer<uint4> bVirtualParticleIndices : register(t19); +StructuredBuffer<float4> bVirtualParticleWeights : register(t20); + groupshared DxClothData gClothData; groupshared DxFrameData gFrameData; groupshared DxIterationData gIterData; @@ -864,7 +868,7 @@ void collideCapsules(IParticles curParticles, IParticles prevParticles, uint32_t static const float gSkeletonWidth = (1.f - 0.2f) * (1.f - 0.2f) - 1.f; -uint32_t collideCapsules(float3 curPos, float3 prevPos, float alpha, float prevAlpha, out float3 outDelta, out float3 outVelocity) +uint32_t collideCapsules(float3 curPosAbsolute, float3 prevPosAbsolute, float alpha, float prevAlpha, out float3 outDelta, out float3 outVelocity) { outDelta = float3(0.0f, 0.0f, 0.0f); outVelocity = float3(0.0f, 0.0f, 0.0f); @@ -881,87 +885,87 @@ uint32_t collideCapsules(float3 curPos, float3 prevPos, float alpha, float prevA { IndexPair indices = bCapsuleIndices[capsuleOffset + i]; - // current + // load sphere data float4 startSphere0 = bCollisionSpheres[startSphereOffset + indices.first]; float4 targetSphere0 = bCollisionSpheres[targetSphereOffset + indices.first]; float4 startSphere1 = bCollisionSpheres[startSphereOffset + indices.second]; float4 targetSphere1 = bCollisionSpheres[targetSphereOffset + indices.second]; - // prev + // interpolate sphere data to end of previous iteration float4 prevSphere0 = lerp(startSphere0, targetSphere0, prevAlpha); float4 prevSphere1 = lerp(startSphere1, targetSphere1, prevAlpha); prevSphere0.w = max(prevSphere0.w, 0.0f); prevSphere1.w = max(prevSphere1.w, 0.0f); - float4 prevAxis = (prevSphere1 - prevSphere0) * 0.5f; float3 prevConeCenter = (prevSphere1.xyz + prevSphere0.xyz) * 0.5f; + float4 prevHalfAxis = (prevSphere1 - prevSphere0) * 0.5f; //half axis + //axis.w = half of radii difference - float3 prevDelta = prevPos - prevConeCenter; + float3 prevPos = prevPosAbsolute - prevConeCenter; - float prevSqrAxisLength = dot(prevAxis.xyz, prevAxis.xyz); - float prevSqrConeLength = prevSqrAxisLength - prevAxis.w * prevAxis.w; + float prevSqrAxisHalfLength = dot(prevHalfAxis.xyz, prevHalfAxis.xyz); + float prevSqrConeHalfLength = prevSqrAxisHalfLength - prevHalfAxis.w * prevHalfAxis.w; - if (prevSqrAxisLength <= 0.0f) + if (prevSqrConeHalfLength <= 0.0f) continue; - float prevInvAxisLength = rsqrt(prevSqrAxisLength); - float prevInvConeLength = rsqrt(prevSqrConeLength); + float prevInvAxisHalfLength = rsqrt(prevSqrAxisHalfLength); + float prevInvConeHalfLength = rsqrt(prevSqrConeHalfLength); - float prevAxisLength = prevSqrAxisLength * prevInvAxisLength; + float prevAxisHalfLength = prevSqrAxisHalfLength * prevInvAxisHalfLength; - float prevConeRadius = (prevAxis.w + prevSphere0.w) * prevInvConeLength * prevAxisLength; + float prevConeRadius = (prevHalfAxis.w + prevSphere0.w) * prevInvConeHalfLength * prevAxisHalfLength; - float3 prevConeAxis = prevAxis.xyz * prevInvAxisLength; - float prevConeSlope = prevAxis.w * prevInvConeLength; + float3 prevConeAxis = prevHalfAxis.xyz * prevInvAxisHalfLength; + float prevConeSlope = prevHalfAxis.w * prevInvConeHalfLength; - float3 prevCross = cross(prevDelta, prevConeAxis); + float3 prevCross = cross(prevPos, prevConeAxis); float prevDot = dot(prevPos, prevConeAxis); - // current - float4 sphere0 = lerp(startSphere0, targetSphere0, alpha); - float4 sphere1 = lerp(startSphere1, targetSphere1, alpha); + // interpolate sphere data to end of current iteration + float4 curSphere0 = lerp(startSphere0, targetSphere0, alpha); + float4 curSphere1 = lerp(startSphere1, targetSphere1, alpha); - sphere0.w = max(sphere0.w, 0.0f); - sphere1.w = max(sphere1.w, 0.0f); + curSphere0.w = max(curSphere0.w, 0.0f); + curSphere1.w = max(curSphere1.w, 0.0f); - float4 curAxis = (sphere1 - sphere0) * 0.5f; - float3 curConeCenter = (sphere1.xyz + sphere0.xyz) * 0.5f; + float3 curConeCenter = (curSphere1.xyz + curSphere0.xyz) * 0.5f; + float4 curHalfAxis = (curSphere1 - curSphere0) * 0.5f; //half axis + //axis.w = half of radii difference - float3 curDelta = curPos - curConeCenter; + float3 curPos = curPosAbsolute - curConeCenter; - float sqrAxisLength = dot(curAxis.xyz, curAxis.xyz); - float sqrConeLength = sqrAxisLength - curAxis.w * curAxis.w; + float curSqrAxisHalfLength = dot(curHalfAxis.xyz, curHalfAxis.xyz); + float curSqrConeHalfLength = curSqrAxisHalfLength - curHalfAxis.w * curHalfAxis.w; - if (sqrConeLength <= 0.0f) + if(curSqrConeHalfLength <= 0.0f) continue; - float invAxisLength = rsqrt(sqrAxisLength); - float invConeLength = rsqrt(sqrConeLength); + float curInvAxisHalfLength = rsqrt(curSqrAxisHalfLength); + float curInvConeHalfLength = rsqrt(curSqrConeHalfLength); - float axisLength = sqrAxisLength * invAxisLength; + float curAxisHalfLength = curSqrAxisHalfLength * curInvAxisHalfLength; - float3 coneCenter = (sphere1.xyz + sphere0.xyz) * 0.5f; - float coneRadius = (curAxis.w + sphere0.w) * invConeLength * axisLength; + float curConeRadius = (curHalfAxis.w + curSphere0.w) * curInvConeHalfLength * curAxisHalfLength; - float3 curConeAxis = curAxis.xyz * invAxisLength; + float3 curConeAxis = curHalfAxis.xyz * curInvAxisHalfLength; - float3 curCross = cross(curDelta, curConeAxis); + float3 curCross = cross(curPos, curConeAxis); float curDot = dot(curPos, curConeAxis); float curSqrDistance = FLT_EPSILON + dot(curCross, curCross); - float prevRadius = max(prevDot * prevConeSlope + coneRadius, 0.0f); + float prevRadius = max(prevDot * prevConeSlope + prevConeRadius, 0.0f); - float curSlope = curAxis.w * invConeLength; - float curRadius = max(curDot * curSlope + coneRadius, 0.0f); + float curConeSlope = curHalfAxis.w * curInvConeHalfLength; + float curRadius = max(curDot * curConeSlope + curConeRadius, 0.0f); - float sine = curAxis.w * invAxisLength; - float coneSqrCosine = 1.f - sine * sine; - float curHalfLength = axisLength; + float sine = curHalfAxis.w * curInvAxisHalfLength; + float coneSqrCosine = 1.f - (sine * sine); - float dotPrevPrev = dot(prevCross, prevCross) - prevCross.x * prevCross.x - prevRadius * prevRadius; + float dotPrevPrev = dot(prevCross, prevCross) - prevRadius * prevRadius; float dotPrevCur = dot(prevCross, curCross) - prevRadius * curRadius; float dotCurCur = curSqrDistance - curRadius * curRadius; @@ -972,7 +976,7 @@ uint32_t collideCapsules(float3 curPos, float3 prevPos, float alpha, float prevA // time of impact or 0 if prevPos inside cone float toi = min(0.0f, halfB + sqrtD) / minusA; - bool hasCollision = toi < 1.0f && halfB < sqrtD; + bool hasCollision = (toi < 1.0f) && (halfB < sqrtD); // skip continuous collision if the (un-clamped) particle // trajectory only touches the outer skin of the cone. @@ -990,46 +994,29 @@ uint32_t collideCapsules(float3 curPos, float3 prevPos, float alpha, float prevA // interpolate delta at toi float3 pos = prevPos - delta * toi; - // float axisLength = sqrAxisLength * invAxisLength; - - // float curHalfLength = axisLength; // ? - float3 curConeAxis = curAxis.xyz * invAxisLength; - float3 curScaledAxis = curAxis.xyz * curHalfLength; - - float4 prevAxis = (prevSphere1 - prevSphere0) * 0.5f; - float3 prevConeCenter = (prevSphere1.xyz + prevSphere0.xyz) * 0.5f; - - float prevSqrAxisLength = dot(prevAxis.xyz, prevAxis.xyz); - float prevSqrConeLength = prevSqrAxisLength - prevAxis.w * prevAxis.w; - - float prevInvAxisLength = rsqrt(prevSqrAxisLength); - float prevInvConeLength = rsqrt(prevSqrConeLength); - - float prevAxisLength = prevSqrAxisLength * prevInvAxisLength; - float prevConeRadius = (prevAxis.w + prevSphere0.w) * prevInvConeLength * prevAxisLength; - - float3 prevConeAxis = prevAxis.xyz * prevInvAxisLength; - - float prevHalfLength = prevAxisLength; - - float3 deltaScaledAxis = curScaledAxis - prevAxis.xyz * prevHalfLength; + float3 curScaledAxis = curConeAxis.xyz * curAxisHalfLength; + float3 deltaScaledAxis = curScaledAxis - prevConeAxis * prevAxisHalfLength; float oneMinusToi = 1.0f - toi; // interpolate axis at toi float3 axis = curScaledAxis - deltaScaledAxis * oneMinusToi; - float slope = prevConeSlope * oneMinusToi + curSlope * toi; + float slope = prevConeSlope * oneMinusToi + curConeSlope * toi; - float sqrHalfLength = dot(axis, axis); // axisX * axisX + axisY * axisY + axisZ * axisZ; + float sqrHalfLength = dot(axis, axis); float invHalfLength = rsqrt(sqrHalfLength); + // distance along toi cone axis (from center) float dotf = dot(pos, axis) * invHalfLength; + //point line distance float sqrDistance = dot(pos, pos) - dotf * dotf; float invDistance = sqrDistance > 0.0f ? rsqrt(sqrDistance) : 0.0f; + //offset base to take slope in to account float base = dotf + slope * sqrDistance * invDistance; float scale = base * invHalfLength; + // use invHalfLength to map base from [-HalfLength, +HalfLength]=inside to [-1, +1] =inside if (abs(scale) < 1.0f) { @@ -1037,99 +1024,49 @@ uint32_t collideCapsules(float3 curPos, float3 prevPos, float alpha, float prevA // reduce ccd impulse if (clamped) particle trajectory stays in cone skin, // i.e. scale by exp2(-k) or 1/(1+k) with k = (tmin - toi) / (1 - toi) - float minusK = sqrtD / (minusA * oneMinusToi); + float minusK = (oneMinusToi > FLT_EPSILON) ? sqrtD / (minusA * oneMinusToi) : 0.0f; oneMinusToi = oneMinusToi / (1.f - minusK); - curDelta += delta * oneMinusToi; - - curDot = dot(curDelta, curAxis.xyz); - float curConeRadius = (curAxis.w + sphere0.w) * invConeLength * axisLength; + curPos += delta * oneMinusToi; - curRadius = max(curDot * curSlope + curConeRadius, 0.0f); // Duplicate? - curSqrDistance = dot(curDelta, curDelta) - curDot * curDot; + curDot = dot(curPos, curConeAxis.xyz); + curRadius = max(curDot * curConeSlope + curConeRadius, 0.0f); + curSqrDistance = dot(curPos, curPos) - curDot * curDot; - curPos = coneCenter + curDelta; + //curPos is relative to coneCenter so we don't need to do this: + //curPos = coneCenter + curPos; } } - - { - float3 delta = curPos - coneCenter; - - float deltaDotAxis = dot(delta, curConeAxis); - float radius = max(deltaDotAxis * curSlope + coneRadius, 0.0f); - float sqrDistance = dot(delta, delta) - deltaDotAxis * deltaDotAxis; - if (sqrDistance > radius * radius) + { + if (curSqrDistance > curRadius * curRadius) continue; - sqrDistance = max(sqrDistance, FLT_EPSILON); - float invDistance = rsqrt(sqrDistance); + curSqrDistance = max(curSqrDistance, FLT_EPSILON); + float invDistance = rsqrt(curSqrDistance); - float base = deltaDotAxis + curSlope * sqrDistance * invDistance; - float halfLength = axisLength; - // float halfLength = coneHalfLength; + float base = curDot + curConeSlope * curSqrDistance * invDistance; + float halfLength = curAxisHalfLength; if (abs(base) < halfLength) { - delta = delta - base * curAxis; + float3 delta = curPos - base * curConeAxis; float sqrCosine = coneSqrCosine; - float scale = radius * invDistance * sqrCosine - sqrCosine; + float scale = curRadius * invDistance * sqrCosine - sqrCosine; - outDelta += delta * scale; + outDelta += delta *scale; if (frictionEnabled) { - // get previous sphere pos - float4 prevSphere0 = lerp(startSphere0, targetSphere0, prevAlpha); - float4 prevSphere1 = lerp(startSphere1, targetSphere1, prevAlpha); - // interpolate velocity between the two spheres - float t = deltaDotAxis * 0.5f + 0.5f; - outVelocity += lerp(sphere0.xyz - prevSphere0.xyz, sphere1.xyz - prevSphere1.xyz, t); + float t = curDot * 0.5f + 0.5f; + outVelocity += lerp(curSphere0.xyz - prevSphere0.xyz, curSphere1.xyz - prevSphere1.xyz, t); } ++numCollisions; } } - - // curPos inside cone (discrete collision) - bool hasContact = curRadius * curRadius > curSqrDistance; - - if (!hasContact) - continue; - - float invDistance = curSqrDistance > 0.0f ? rsqrt(curSqrDistance) : 0.0f; - float base = curDot + curSlope * curSqrDistance * invDistance; - - // float axisLength = sqrAxisLength * invAxisLength; - - // float halfLength = axisLength; // ? - // float halfLength = coneHalfLength; - /* - if (abs(base) < halfLength) - { - float3 delta = curPos - base * curAxis.xyz; - - float sine = axis.w * invAxisLength; // Remove? - float sqrCosine = 1.f - sine * sine; - - float scale = curRadius * invDistance * coneSqrCosine - coneSqrCosine; - - - // delta += de - - outDelta += delta * scale; - - if (frictionEnabled) - { - // interpolate velocity between the two spheres - float t = curDot * 0.5f + 0.5f; - outVelocity += lerp(sphere0.xyz - prevSphere0.xyz, sphere1.xyz - prevSphere1.xyz, t); - } - - ++numCollisions; - }*/ } // sphere collision @@ -1147,8 +1084,8 @@ uint32_t collideCapsules(float3 curPos, float3 prevPos, float alpha, float prevA float prevRadius = prevSphere.w; { - float3 curDelta = curPos - sphere.xyz; - float3 prevDelta = prevPos - prevSphere.xyz; + float3 curDelta = curPosAbsolute - sphere.xyz; + float3 prevDelta = prevPosAbsolute - prevSphere.xyz; float sqrDistance = FLT_EPSILON + dot(curDelta, curDelta); @@ -1186,7 +1123,7 @@ uint32_t collideCapsules(float3 curPos, float3 prevPos, float alpha, float prevA oneMinusToi = oneMinusToi / (1.f - minusK); curDelta += delta * oneMinusToi; - curPos = sphere.xyz + curDelta; + curPosAbsolute = sphere.xyz + curDelta; sqrDistance = FLT_EPSILON + dot(curDelta, curDelta); } @@ -1212,6 +1149,125 @@ uint32_t collideCapsules(float3 curPos, float3 prevPos, float alpha, float prevA return numCollisions; } +float3 barycentricLerp(float3 pA, float3 pB, float3 pC, float3 weights) +{ + float3 outP; + outP.x = pA.x * weights.x + pB.x * weights.y + pC.x * weights.z; + outP.y = pA.y * weights.x + pB.y * weights.y + pC.y * weights.z; + outP.z = pA.z * weights.x + pB.z * weights.y + pC.z * weights.z; + return outP; +} + +void applyVirtualImpulse(IParticles particles, uint3 indices, float3 weights, float3 delta, float scale) +//apply +{ + float4 pA = particles.get(indices.x); + float4 pB = particles.get(indices.y); + float4 pC = particles.get(indices.z); + + delta *= scale; + + pA.x += delta.x * weights.x; pB.x += delta.x * weights.y; pC.x += delta.x * weights.z; + pA.y += delta.y * weights.x; pB.y += delta.y * weights.y; pC.y += delta.y * weights.z; + pA.z += delta.z * weights.x; pB.z += delta.z * weights.y; pC.z += delta.z * weights.z; + + //if(particle is not static) store new position + if(pA.w != 0.0f) particles.set(indices.x, pA); + if(pB.w != 0.0f) particles.set(indices.y, pB); + if(pC.w != 0.0f) particles.set(indices.z, pC); +} + +void collideVirtualCapsules(IParticles curParticles, IParticles prevParticles, uint32_t threadIdx, float alpha, float prevAlpha) +{ + uint virtualParticleSetSizesCount = gClothData.mNumVirtualParticleSetSizes; + + if(virtualParticleSetSizesCount == 0) + return; + + bool frictionEnabled = gClothData.mFrictionScale > 0.0f; + bool massScaleEnabled = gClothData.mCollisionMassScale > 0.0f; + + if(gClothData.mEnableContinuousCollision) + { + //??? + } + + uint indicesEnd = 0; + for(uint vpSetSizeIt = 0; vpSetSizeIt < virtualParticleSetSizesCount; vpSetSizeIt++) + { + GroupMemoryBarrierWithGroupSync(); + uint indicesIt = indicesEnd + threadIdx * 4; + for(indicesEnd += bVirtualParticleSetSizes[vpSetSizeIt]*4; indicesIt < indicesEnd; indicesIt += blockDim * 4) + { + uint4 indices; + { + uint4 tmp = bVirtualParticleIndices[indicesIt>>1]; + if(indicesIt & 1 == 1) + { + indices.x = tmp.z & 0xffff; + indices.y = tmp.z >> 16; + indices.z = tmp.w & 0xffff; + indices.w = tmp.w >> 16; + } + else + { + indices.x = tmp.x & 0xffff; + indices.y = tmp.x >> 16; + indices.z = tmp.y & 0xffff; + indices.w = tmp.y >> 16; + } + } + + float4 weights = bVirtualParticleWeights[indices.w]; + + float3 curPos = barycentricLerp(curParticles.get(indices.x).xyz, curParticles.get(indices.y).xyz, curParticles.get(indices.z).xyz, weights.xyz); + + float3 delta, velocity; + uint32_t numCollisions = collideCapsules(curPos.xyz, alpha, prevAlpha, delta, velocity); + if(numCollisions > 0) + { + float scale = 1.0f / (float)numCollisions; + float wscale = weights.w * scale; + + applyVirtualImpulse(curParticles, indices.xyz, weights.xyz, delta, scale); + + if(frictionEnabled) + { + float3 prevPos = barycentricLerp(prevParticles.get(indices.x).xyz, prevParticles.get(indices.y).xyz, prevParticles.get(indices.z).xyz, weights.xyz); + + float3 frictionImpulse = + calcFrictionImpulse(prevPos.xyz, curPos.xyz, velocity, scale, delta); + + applyVirtualImpulse(prevParticles, indices.xyz, weights.xyz, frictionImpulse, -weights.w); + } + + if(massScaleEnabled) + { + float deltaLengthSq = dot(delta.xyz, delta.xyz) * scale * scale; + float invMassScale = 1.0f / (1.0f + gClothData.mCollisionMassScale * deltaLengthSq); + + // not multiplying by weights[3] here because unlike applying velocity + // deltas where we want the interpolated position to obtain a particular + // value, we instead just require that the total change is equal to invMassScale + invMassScale = invMassScale - 1.0f; + + float4 pA = curParticles.get(indices.x); + float4 pB = curParticles.get(indices.y); + float4 pC = curParticles.get(indices.z); + + pA.w *= 1.0f + weights.x * invMassScale; + pB.w *= 1.0f + weights.y * invMassScale; + pC.w *= 1.0f + weights.z * invMassScale; + + curParticles.set(indices.x,pA); + curParticles.set(indices.y,pB); + curParticles.set(indices.z,pC); + } + } + } + } +} + void collideContinuousCapsules(IParticles curParticles, IParticles prevParticles, uint32_t threadIdx, float alpha, float prevAlpha) { bool frictionEnabled = gClothData.mFrictionScale > 0.0f; @@ -1261,6 +1317,7 @@ void collideParticles(IParticles curParticles, IParticles prevParticles, uint32_ collideContinuousCapsules(curParticles, prevParticles, threadIdx, alpha, prevAlpha); else collideCapsules(curParticles, prevParticles, threadIdx, alpha, prevAlpha); + collideVirtualCapsules(curParticles, prevParticles, threadIdx, alpha, prevAlpha); } void constrainSeparation(IParticles curParticles, uint32_t threadIdx, float alpha) diff --git a/NvCloth/src/scalar/SwCollisionHelpers.h b/NvCloth/src/scalar/SwCollisionHelpers.h index 3ab756f..c86d939 100644 --- a/NvCloth/src/scalar/SwCollisionHelpers.h +++ b/NvCloth/src/scalar/SwCollisionHelpers.h @@ -69,13 +69,17 @@ struct Gather<Scalar4i> Gather<Scalar4i>::Gather(const Scalar4i& index) { + //index are grid positions + uint32_t mask = /* sGridSize */ 8 - 1; + // Get grid index (forced within range) mIndex.u4[0] = index.u4[0] & mask; mIndex.u4[1] = index.u4[1] & mask; mIndex.u4[2] = index.u4[2] & mask; mIndex.u4[3] = index.u4[3] & mask; + // true (filled with all ones = -1) when gridSize <= index || index < 0 mOutOfRange.i4[0] = index.u4[0] & ~mask ? 0 : -1; mOutOfRange.i4[1] = index.u4[1] & ~mask ? 0 : -1; mOutOfRange.i4[2] = index.u4[2] & ~mask ? 0 : -1; @@ -84,6 +88,7 @@ Gather<Scalar4i>::Gather(const Scalar4i& index) Scalar4i Gather<Scalar4i>::operator()(const Scalar4i* ptr) const { + //ptr points to the cone/sphere grid const int32_t* base = ptr->i4; const int32_t* index = mIndex.i4; const int32_t* mask = mOutOfRange.i4; diff --git a/NvCloth/src/sse2/SwCollisionHelpers.h b/NvCloth/src/sse2/SwCollisionHelpers.h index b759868..4882818 100644 --- a/NvCloth/src/sse2/SwCollisionHelpers.h +++ b/NvCloth/src/sse2/SwCollisionHelpers.h @@ -76,22 +76,36 @@ Simd4i horizontalOr(const Simd4i& mask) Gather<Simd4i>::Gather(const Simd4i& index) { - mSelectQ = _mm_srai_epi32(index << 29, 31); - mSelectD = _mm_srai_epi32(index << 30, 31); - mSelectW = _mm_srai_epi32(index << 31, 31); - mOutOfRange = (index ^ sIntSignBit) > sSignedMask; + //index are grid positions + //grid has 8 spaces, grid indices that are within rage use 3 bits (0x7 mask) + mSelectQ = _mm_srai_epi32(index << 29, 31); //expand bit 0x4 to the whole register + mSelectD = _mm_srai_epi32(index << 30, 31); //expand bit 0x2 to the whole register + mSelectW = _mm_srai_epi32(index << 31, 31); //expand the least significant bit (0x1) to the whole register + mOutOfRange = (index ^ sIntSignBit) > sSignedMask; //true if index is outside the grid = (index > 0x7 || index < 0x0) } Simd4i Gather<Simd4i>::operator()(const Simd4i* ptr) const { + //ptr points to the cone/sphere grid + // more efficient with _mm_shuffle_epi8 (SSSE3) + + //lo contains the bottom 4 grid cells, heigh the top 4 Simd4i lo = ptr[0], hi = ptr[1]; + + //select correct cell in the grid using binary decision tree + // m = select ? a : b + //first bit (even/uneven cells) Simd4i m01 = select(mSelectW, splat<1>(lo), splat<0>(lo)); Simd4i m23 = select(mSelectW, splat<3>(lo), splat<2>(lo)); Simd4i m45 = select(mSelectW, splat<1>(hi), splat<0>(hi)); Simd4i m67 = select(mSelectW, splat<3>(hi), splat<2>(hi)); + + //second bit Simd4i m0123 = select(mSelectD, m23, m01); Simd4i m4567 = select(mSelectD, m67, m45); + + // third bit force result zero if it is out of range return select(mSelectQ, m4567, m0123) & ~mOutOfRange; } |