NvCloth 1.1.4 Release. (24070740)

author: Marijn Tamis <[email protected]> 2018-05-03 18:22:48 +0200
committer: Marijn Tamis <[email protected]> 2018-05-03 18:22:48 +0200
commit: ca32c59a58d37c1822e185a2d5f3d0d3e8943593 (patch)
tree: b06b9eec03f34344ef8fc31aa147b2714d3962ee /NvCloth/src
parent: Forced rename of platform folders in cmake dir. Git didn't pick this up before. (diff)
download: nvcloth-ca32c59a58d37c1822e185a2d5f3d0d3e8943593.tar.xz
nvcloth-ca32c59a58d37c1822e185a2d5f3d0d3e8943593.zip
29 files changed, 768 insertions, 361 deletions
diff --git a/NvCloth/src/ClothBase.h b/NvCloth/src/ClothBase.h
index ec1ee40..faca6c9 100644
--- a/NvCloth/src/ClothBase.h
+++ b/NvCloth/src/ClothBase.h
@@ -64,6 +64,7 @@ void initialize(Cloth& cloth, const physx::PxVec4* pIt, const physx::PxVec4* pEn
 	cloth.mCurrentMotion = physx::PxTransform(physx::PxIdentity);
 	cloth.mLinearVelocity = physx::PxVec3(0.0f);
 	cloth.mAngularVelocity = physx::PxVec3(0.0f);
+	cloth.mIgnoreVelocityDiscontinuityNextFrame = false;
 	cloth.mPrevIterDt = 0.0f;
 	cloth.mIterDtAvg = MovingAverage(30);
 	cloth.mTetherConstraintLogStiffness = float(-FLT_MAX_EXP);
diff --git a/NvCloth/src/ClothImpl.h b/NvCloth/src/ClothImpl.h
index 24f7732..9bc4c36 100644
--- a/NvCloth/src/ClothImpl.h
+++ b/NvCloth/src/ClothImpl.h
@@ -33,6 +33,8 @@
 #include "NvCloth/Fabric.h"
 #include <foundation/PxVec4.h>
 #include <foundation/PxVec3.h>
+#include "IndexPair.h"
+#include "MovingAverage.h"
 #include <PsMathUtils.h>
 #include <cmath>
 
@@ -77,6 +79,9 @@ class ClothImpl : public Cloth
 	virtual void clearInertia();
 
 	virtual void teleport(const physx::PxVec3& delta);
+	virtual void teleportToLocation(const physx::PxVec3& translation, const physx::PxQuat& rotation);
+
+	virtual void ignoreVelocityDiscontinuity();
 
 	virtual float getPreviousIterationDt() const;
 	virtual void setGravity(const physx::PxVec3& gravity);
@@ -210,6 +215,7 @@ public: //Fields shared between all cloth classes
 	physx::PxTransform mCurrentMotion;
 	physx::PxVec3 mLinearVelocity;
 	physx::PxVec3 mAngularVelocity;
+	bool mIgnoreVelocityDiscontinuityNextFrame;
 
 	float mPrevIterDt;
 	MovingAverage mIterDtAvg;
@@ -292,6 +298,21 @@ inline void ClothImpl<T>::teleport(const physx::PxVec3& delta)
 }
 
 template <typename T>
+inline void ClothImpl<T>::teleportToLocation(const physx::PxVec3& translation, const physx::PxQuat& rotation)
+{
+	mCurrentMotion.p = translation;
+	mCurrentMotion.q = rotation;
+	mTargetMotion.p = translation;
+	mTargetMotion.q = rotation;
+}
+
+template <typename T>
+inline void ClothImpl<T>::ignoreVelocityDiscontinuity()
+{
+	mIgnoreVelocityDiscontinuityNextFrame = true;
+}
+
+template <typename T>
 inline float ClothImpl<T>::getPreviousIterationDt() const
 {
 	return mPrevIterDt;
diff --git a/NvCloth/src/IterationState.h b/NvCloth/src/IterationState.h
index e18b636..be046b5 100644
--- a/NvCloth/src/IterationState.h
+++ b/NvCloth/src/IterationState.h
@@ -183,11 +183,15 @@ cloth::IterationStateFactory::IterationStateFactory(MyCloth& cloth, float frameD
 	mPrevLinearVelocity = cloth.mLinearVelocity;
 	mPrevAngularVelocity = cloth.mAngularVelocity;
 
-	// update cloth
-	float invFrameDt = 1.0f / frameDt;
-	cloth.mLinearVelocity = invFrameDt * (cloth.mTargetMotion.p - cloth.mCurrentMotion.p);
-	physx::PxQuat dq = cloth.mTargetMotion.q * cloth.mCurrentMotion.q.getConjugate();
-	cloth.mAngularVelocity = log(dq) * invFrameDt;
+	if(!cloth.mIgnoreVelocityDiscontinuityNextFrame)
+	{
+		// update cloth
+		float invFrameDt = 1.0f / frameDt;
+		cloth.mLinearVelocity = invFrameDt * (cloth.mTargetMotion.p - cloth.mCurrentMotion.p);
+		physx::PxQuat dq = cloth.mTargetMotion.q * cloth.mCurrentMotion.q.getConjugate();
+		cloth.mAngularVelocity = log(dq) * invFrameDt;
+	}
+	cloth.mIgnoreVelocityDiscontinuityNextFrame = false;
 
 	cloth.mPrevIterDt = mIterDt;
 	cloth.mIterDtAvg.push(static_cast<uint32_t>(mNumIterations), mIterDt);
diff --git a/NvCloth/src/PhaseConfig.cpp b/NvCloth/src/PhaseConfig.cpp
index 7ea97f3..b5db3b0 100644
--- a/NvCloth/src/PhaseConfig.cpp
+++ b/NvCloth/src/PhaseConfig.cpp
@@ -30,6 +30,7 @@
 #include "NvCloth/PhaseConfig.h"
 #include "PsMathUtils.h"
 #include <algorithm>
+#include "ClothImpl.h"
 
 using namespace physx;
 
@@ -43,14 +44,6 @@ PhaseConfig transform(const PhaseConfig&);
 
 using namespace nv;
 
-namespace
-{
-float safeLog2(float x)
-{
-	float saturated = std::max(0.0f, std::min(x, 1.0f));
-	return saturated ? shdfnd::log2(saturated) : -FLT_MAX_EXP;
-}
-}
 
 // convert from user input to solver format
 cloth::PhaseConfig cloth::transform(const PhaseConfig& config)
diff --git a/NvCloth/src/SwCloth.cpp b/NvCloth/src/SwCloth.cpp
index 6077ada..46a7784 100644
--- a/NvCloth/src/SwCloth.cpp
+++ b/NvCloth/src/SwCloth.cpp
@@ -269,28 +269,13 @@ void SwCloth::setVirtualParticles(Range<const uint32_t[4]> indices, Range<const
 
 	// shuffle indices to form independent SIMD sets
 	uint16_t numParticles = uint16_t(mCurParticles.size());
-	TripletScheduler scheduler(indices);
+	TripletScheduler scheduler(indices); //the TripletScheduler makes a copy so indices is not modified
 	scheduler.simd(numParticles, 4);
 
-	// convert indices to byte offset
-	Vec4us dummy(numParticles, uint16_t(numParticles + 1), uint16_t(numParticles + 2), 0);
-	Vector<uint32_t>::Type::ConstIterator sIt = scheduler.mSetSizes.begin();
-	Vector<uint32_t>::Type::ConstIterator sEnd = scheduler.mSetSizes.end();
-	TripletScheduler::ConstTripletIter tIt = scheduler.mTriplets.begin(), tLast;
-	mVirtualParticleIndices.resize(0);
-	mVirtualParticleIndices.reserve(indices.size() + 3 * uint32_t(sEnd - sIt));
-	for (; sIt != sEnd; ++sIt)
-	{
-		uint32_t setSize = *sIt;
-		for (tLast = tIt + setSize; tIt != tLast; ++tIt, ++mNumVirtualParticles)
-			mVirtualParticleIndices.pushBack(Vec4us(*tIt));
-		mVirtualParticleIndices.resize((mVirtualParticleIndices.size() + 3) & ~3, dummy);
-	}
-	Vector<Vec4us>::Type(mVirtualParticleIndices.begin(), mVirtualParticleIndices.end())
-	    .swap(mVirtualParticleIndices);
+	mVirtualParticleIndices.swap(scheduler.mPaddedTriplets);
 
 	// precompute 1/dot(w,w)
-	Vector<PxVec4>::Type().swap(mVirtualParticleWeights);
+	Vector<PxVec4>::Type().swap(mVirtualParticleWeights); //clear and trim
 	mVirtualParticleWeights.reserve(weights.size());
 	for (; !weights.empty(); weights.popFront())
 	{
diff --git a/NvCloth/src/SwClothData.cpp b/NvCloth/src/SwClothData.cpp
index f102bde..84d85d4 100644
--- a/NvCloth/src/SwClothData.cpp
+++ b/NvCloth/src/SwClothData.cpp
@@ -73,6 +73,7 @@ cloth::SwClothData::SwClothData(SwCloth& cloth, const SwFabric& fabric)
 
 	mTethers = fabric.mTethers.begin();
 	mNumTethers = uint32_t(fabric.mTethers.size());
+	//1-(1 - stiffness)^stiffnessExponent
 	mTetherConstraintStiffness = 1.0f - expf(stiffnessExponent * cloth.mTetherConstraintLogStiffness);
 	mTetherConstraintScale = cloth.mTetherConstraintScale * fabric.mTetherLengthScale;
 
diff --git a/NvCloth/src/SwCollision.cpp b/NvCloth/src/SwCollision.cpp
index 0aa196d..bd6cf7a 100644
--- a/NvCloth/src/SwCollision.cpp
+++ b/NvCloth/src/SwCollision.cpp
@@ -167,7 +167,7 @@ void generateCones(cloth::ConeData* dst, const cloth::SphereData* sourceSpheres,
 
 		PxVec4 center = (second + first) * 0.5f;
 		PxVec4 axis = (second - first) * 0.5f; //half axis
-		//axiw.w = half of radii difference
+		//axis.w = half of radii difference
 
 		// |Axis|^2
 		float sqrAxisHalfLength = axis.x * axis.x + axis.y * axis.y + axis.z * axis.z;
@@ -426,8 +426,8 @@ void cloth::SwCollision<T4f>::buildSphereAcceleration(const SphereData* sIt)
 		T4f radius = splat<3>(sphere);
 
 		//calculate the first and last cell index, for each axis, that contains the sphere
-		T4i first = intFloor(max((sphere - radius) * mGridScale + mGridBias, gSimd4fZero));
-		T4i last = intFloor(min((sphere + radius) * mGridScale + mGridBias, sGridLength));
+		T4i first = intFloor(min(max((sphere - radius) * mGridScale + mGridBias, gSimd4fZero), sGridLength)); //use both min and max to deal with bad grid scales
+		T4i last = intFloor(min(max((sphere + radius) * mGridScale + mGridBias, gSimd4fZero), sGridLength));
 
 		const int* firstIdx = array(first);
 		const int* lastIdx = array(last);
@@ -555,40 +555,47 @@ operator &= (const ShapeMask& right)
 	return *this;
 }
 
+// get collision shape masks from a single cell from a single axis of the acceleration structure
 template <typename T4f>
 FORCE_INLINE typename cloth::SwCollision<T4f>::ShapeMask
 cloth::SwCollision<T4f>::getShapeMask(const T4f& position, const T4i* __restrict sphereGrid,
                                          const T4i* __restrict coneGrid)
 {
+	// position are the grid positions along a single axis (x, y, or z)
 	Gather<T4i> gather(intFloor(position));
 
+	// get the bitmask indicating which cones/spheres overlap the grid cell
 	ShapeMask result;
 	result.mCones = gather(coneGrid);
 	result.mSpheres = gather(sphereGrid);
 	return result;
 }
 
-// lookup acceleration structure and return mask of potential intersectors
+// lookup acceleration structure and return mask of potential intersection collision shapes
 template <typename T4f>
 FORCE_INLINE typename cloth::SwCollision<T4f>::ShapeMask
 cloth::SwCollision<T4f>::getShapeMask(const T4f* __restrict positions) const
 {
+	// positions are the particle positions
 	T4f posX = positions[0] * splat<0>(mGridScale) + splat<0>(mGridBias);
 	T4f posY = positions[1] * splat<1>(mGridScale) + splat<1>(mGridBias);
 	T4f posZ = positions[2] * splat<2>(mGridScale) + splat<2>(mGridBias);
 
-	ShapeMask result = getShapeMask(posX, mSphereGrid, mConeGrid);
-	result &= getShapeMask(posY, mSphereGrid + 2, mConeGrid + 2);
-	result &= getShapeMask(posZ, mSphereGrid + 4, mConeGrid + 4);
+	// AND together the bit masks so only the cones/spheres remain
+	//  that overlap with the particle posision on all axis
+	ShapeMask result = getShapeMask(posX, mSphereGrid, mConeGrid); // X
+	result &= getShapeMask(posY, mSphereGrid + 2, mConeGrid + 2); // Y 
+	result &= getShapeMask(posZ, mSphereGrid + 4, mConeGrid + 4); // Z
 
 	return result;
 }
 
-// lookup acceleration structure and return mask of potential intersectors
+// lookup acceleration structure and return mask of potential intersection collision shapes for CCD
 template <typename T4f>
 FORCE_INLINE typename cloth::SwCollision<T4f>::ShapeMask
 cloth::SwCollision<T4f>::getShapeMask(const T4f* __restrict prevPos, const T4f* __restrict curPos) const
 {
+	// same as getShapeMask(const T4f* __restrict positions) but for continuous collision detection
 	T4f scaleX = splat<0>(mGridScale);
 	T4f scaleY = splat<1>(mGridScale);
 	T4f scaleZ = splat<2>(mGridScale);
@@ -605,22 +612,24 @@ cloth::SwCollision<T4f>::getShapeMask(const T4f* __restrict prevPos, const T4f*
 	T4f curY = curPos[1] * scaleY + biasY;
 	T4f curZ = curPos[2] * scaleZ + biasZ;
 
+	// get maximum extent corner of the AABB containing both prevPos and curPos
 	T4f maxX = min(max(prevX, curX), sGridLength);
 	T4f maxY = min(max(prevY, curY), sGridLength);
 	T4f maxZ = min(max(prevZ, curZ), sGridLength);
 
-	ShapeMask result = getShapeMask(maxX, mSphereGrid, mConeGrid);
-	result &= getShapeMask(maxY, mSphereGrid + 2, mConeGrid + 2);
-	result &= getShapeMask(maxZ, mSphereGrid + 4, mConeGrid + 4);
+	ShapeMask result = getShapeMask(maxX, mSphereGrid, mConeGrid); // X
+	result &= getShapeMask(maxY, mSphereGrid + 2, mConeGrid + 2); // Y
+	result &= getShapeMask(maxZ, mSphereGrid + 4, mConeGrid + 4); // Z
 
+	// get min extent corner of the AABB containing both prevPos and curPos
 	T4f zero = gSimd4fZero;
 	T4f minX = max(min(prevX, curX), zero);
 	T4f minY = max(min(prevY, curY), zero);
 	T4f minZ = max(min(prevZ, curZ), zero);
 
-	result &= getShapeMask(minX, mSphereGrid + 6, mConeGrid + 6);
-	result &= getShapeMask(minY, mSphereGrid + 8, mConeGrid + 8);
-	result &= getShapeMask(minZ, mSphereGrid + 10, mConeGrid + 10);
+	result &= getShapeMask(minX, mSphereGrid + 6, mConeGrid + 6); // X
+	result &= getShapeMask(minY, mSphereGrid + 8, mConeGrid + 8); // Y
+	result &= getShapeMask(minZ, mSphereGrid + 10, mConeGrid + 10); // Z
 
 	return result;
 }
@@ -770,7 +779,7 @@ cloth::SwCollision<T4f>::collideCones(const T4f* __restrict positions, ImpulseAc
 		T4f axisZ = splat<2>(axis);
 		T4f slope = splat<3>(axis);
 
-		// project delta onto axis
+		// distance along cone axis (from center)
 		T4f dot = deltaX * axisX + deltaY * axisY + deltaZ * axisZ;
 		// interpolate radius
 		T4f radius = dot * slope + splat<3>(center);
@@ -901,7 +910,7 @@ FORCE_INLINE void cloth::SwCollision<T4f>::collideSpheres(const T4i& sphereMask,
 		T4f dotCurCur = sqrDistance - curRadius * curRadius;
 
 		T4f discriminant = dotPrevCur * dotPrevCur - dotCurCur * dotPrevPrev;
-		T4f sqrtD = sqrt(discriminant);
+		T4f sqrtD = sqrt(discriminant); //we get -nan if there are no roots
 		T4f halfB = dotPrevCur - dotPrevPrev;
 		T4f minusA = dotPrevCur - dotCurCur + halfB;
 
@@ -914,7 +923,7 @@ FORCE_INLINE void cloth::SwCollision<T4f>::collideSpheres(const T4i& sphereMask,
 		T4f rMin = prevRadius + halfB * minusA * (curRadius - prevRadius);
 		collisionMask = collisionMask & (discriminant > minusA * rMin * rMin * sSkeletonWidth);
 
-		// a is negative when one sphere is contained in the other,
+		// a is negative when one relative sphere is contained in the other,
 		// which is already handled by discrete collision.
 		collisionMask = collisionMask & (minusA < -static_cast<T4f>(gSimd4fEpsilon));
 
@@ -931,10 +940,15 @@ FORCE_INLINE void cloth::SwCollision<T4f>::collideSpheres(const T4i& sphereMask,
 			T4f minusK = sqrtD * recip(minusA * oneMinusToi) & (oneMinusToi > gSimd4fEpsilon);
 			oneMinusToi = oneMinusToi * recip(gSimd4fOne - minusK);
 
+
+			//Move curXYZ to toi point
 			curX = curX + deltaX * oneMinusToi;
 			curY = curY + deltaY * oneMinusToi;
 			curZ = curZ + deltaZ * oneMinusToi;
+			//curXYZ is now touching the sphere at toi
+			//Note that curXYZ is also relative to the sphere center at toi
 
+			//We assume that the point sticks to the sphere until the end of the frame
 			curPos[0] = splat<0>(curSphere) + curX;
 			curPos[1] = splat<1>(curSphere) + curY;
 			curPos[2] = splat<2>(curSphere) + curZ;
@@ -1000,7 +1014,7 @@ cloth::SwCollision<T4f>::collideCones(const T4f* __restrict prevPos, T4f* __rest
 		T4f prevT = prevY * prevAxisZ - prevZ * prevAxisY;
 		T4f prevU = prevZ * prevAxisX - prevX * prevAxisZ;
 		T4f prevV = prevX * prevAxisY - prevY * prevAxisX;
-		T4f prevDot = prevX * prevAxisX + prevY * prevAxisY + prevZ * prevAxisZ;
+		T4f prevDot = prevX * prevAxisX + prevY * prevAxisY + prevZ * prevAxisZ; //distance along the axis
 		T4f prevRadius = prevDot * prevSlope + splat<3>(prevCenter);
 
 		T4f curCenter = loadAligned(curCenterPtr, offset);
@@ -1014,18 +1028,22 @@ cloth::SwCollision<T4f>::collideCones(const T4f* __restrict prevPos, T4f* __rest
 		T4f curX = curPos[0] - splat<0>(curCenter);
 		T4f curY = curPos[1] - splat<1>(curCenter);
 		T4f curZ = curPos[2] - splat<2>(curCenter);
+		//curTUV = cross(curXYZ, curAxisXYZ)
 		T4f curT = curY * curAxisZ - curZ * curAxisY;
 		T4f curU = curZ * curAxisX - curX * curAxisZ;
 		T4f curV = curX * curAxisY - curY * curAxisX;
 		T4f curDot = curX * curAxisX + curY * curAxisY + curZ * curAxisZ;
 		T4f curRadius = curDot * curSlope + splat<3>(curCenter);
 
+		//Magnitude of cross product gives area of parallelogram |curAxisXYZ|*parallelogramHeight, |curAxisXYZ|=1 
+		//parallelogramHeight is distance between the axis and the point
 		T4f curSqrDistance = gSimd4fEpsilon + curT * curT + curU * curU + curV * curV;
 
 		// set radius to zero if cone is culled
 		prevRadius = max(prevRadius, gSimd4fZero) & ~culled;
 		curRadius = max(curRadius, gSimd4fZero) & ~culled;
 
+		//Use quadratic equation to solve for time of impact (against infinite cone)
 		T4f dotPrevPrev = prevT * prevT + prevU * prevU + prevV * prevV - prevRadius * prevRadius;
 		T4f dotPrevCur = prevT * curT + prevU * curU + prevV * curV - prevRadius * curRadius;
 		T4f dotCurCur = curSqrDistance - curRadius * curRadius;
@@ -1060,6 +1078,7 @@ cloth::SwCollision<T4f>::collideCones(const T4f* __restrict prevPos, T4f* __rest
 			T4f posY = prevY - deltaY * toi;
 			T4f posZ = prevZ - deltaZ * toi;
 
+			//                                axisHalfLength
 			T4f curScaledAxis = curAxis * splat<1>(simd4f(curAuxiliary));
 			T4i prevAuxiliary = loadAligned(prevAuxiliaryPtr, offset);
 			T4f deltaScaledAxis = curScaledAxis - prevAxis * splat<1>(simd4f(prevAuxiliary));
@@ -1074,19 +1093,26 @@ cloth::SwCollision<T4f>::collideCones(const T4f* __restrict prevPos, T4f* __rest
 
 			T4f sqrHalfLength = axisX * axisX + axisY * axisY + axisZ * axisZ;
 			T4f invHalfLength = rsqrt(sqrHalfLength);
+			// distance along toi cone axis (from center)
 			T4f dot = (posX * axisX + posY * axisY + posZ * axisZ) * invHalfLength;
 
+			//point line distance
 			T4f sqrDistance = posX * posX + posY * posY + posZ * posZ - dot * dot;
 			T4f invDistance = rsqrt(sqrDistance) & (sqrDistance > gSimd4fZero);
 
+			//offset base to take slope in to account
 			T4f base = dot + slope * sqrDistance * invDistance;
 			T4f scale = base * invHalfLength & collisionMask;
+			// use invHalfLength to map base from [-HalfLength, +HalfLength]=inside to [-1, +1] =inside
 
+			// test if any impact position is in cone section
 			T4f cullMask = (abs(scale) < gSimd4fOne) & collisionMask;
 
 			// test if any impact position is in cone section
 			if (!allEqual(cullMask, gSimd4fZero))
 			{
+				//calculate unnormalized normal delta?
+				//delta = prev - cur - (prevAxis - curScaledAxis)*scale
 				deltaX = deltaX + splat<0>(deltaScaledAxis) * scale;
 				deltaY = deltaY + splat<1>(deltaScaledAxis) * scale;
 				deltaZ = deltaZ + splat<2>(deltaScaledAxis) * scale;
@@ -1099,15 +1125,19 @@ cloth::SwCollision<T4f>::collideCones(const T4f* __restrict prevPos, T4f* __rest
 				T4f minusK = sqrtD * recip(minusA * oneMinusToi) & (oneMinusToi > gSimd4fEpsilon);
 				oneMinusToi = oneMinusToi * recip(gSimd4fOne - minusK);
 
-				curX = curX + deltaX * oneMinusToi;
+				//curX = cur + (prev - cur - (prevAxis - curScaledAxis)*scale)*(1-toi)
+				//curX = cur + (prev - cur)*(1-toi) - ((prevAxis - curScaledAxis)*scale)*(1-toi)
+				curX = curX + deltaX * oneMinusToi; 
 				curY = curY + deltaY * oneMinusToi;
 				curZ = curZ + deltaZ * oneMinusToi;
 
+				//save adjusted values for discrete collision detection below
 				curDot = curX * curAxisX + curY * curAxisY + curZ * curAxisZ;
 				curRadius = curDot * curSlope + splat<3>(curCenter);
 				curRadius = max(curRadius, gSimd4fZero) & ~culled;
 				curSqrDistance = curX * curX + curY * curY + curZ * curZ - curDot * curDot;
 
+				//take offset from toi and add it to the cone center at the end
 				curPos[0] = splat<0>(curCenter) + curX;
 				curPos[1] = splat<1>(curCenter) + curY;
 				curPos[2] = splat<2>(curCenter) + curZ;
@@ -1210,7 +1240,7 @@ PX_INLINE void calculateFrictionImpulse(const T4f& deltaX, const T4f& deltaY, co
 	T4f nz = deltaZ * rcpDelta;
 
 	// calculate relative velocity
-	// velXYZ is scaled by one over the number of collisions since all collisions accumulate into 
+	// velXYZ is scaled by one over the number of collisions (scale) since all collisions accumulate into 
 	//  that variable during collision detection
 	T4f rvx = curPos[0] - prevPos[0] - velX * scale;
 	T4f rvy = curPos[1] - prevPos[1] - velY * scale;
@@ -1227,7 +1257,7 @@ PX_INLINE void calculateFrictionImpulse(const T4f& deltaX, const T4f& deltaY, co
 	// calculate magnitude of relative tangential velocity
 	T4f rcpVt = rsqrt(rvtx * rvtx + rvty * rvty + rvtz * rvtz + gSimd4fEpsilon);
 
-	// magnitude of friction impulse (cannot be greater than -vt)
+	// magnitude of friction impulse (cannot be greater than -rvt)
 	T4f j = max(-coefficient * deltaSq * rcpDelta * rcpVt, gSimd4fMinusOne) & mask;
 
 	impulse[0] = rvtx * j;
@@ -1244,7 +1274,7 @@ void cloth::SwCollision<T4f>::collideParticles()
 	const T4f massScale = simd4f(mClothData.mCollisionMassScale);
 
 	const bool frictionEnabled = mClothData.mFrictionScale > 0.0f;
-	const T4f frictionScale = simd4f(mClothData.mFrictionScale); //[arameter set by user
+	const T4f frictionScale = simd4f(mClothData.mFrictionScale); //Parameter set by user
 
 	T4f curPos[4];
 	T4f prevPos[4];
@@ -1439,21 +1469,22 @@ void cloth::SwCollision<T4f>::collideVirtualParticles()
 
 			transpose(frictionImpulse[0], frictionImpulse[1], frictionImpulse[2], frictionImpulse[3]);
 
-			q0v0 = q0v0 - (splat<0>(rw0) * frictionImpulse[0]);
-			q0v1 = q0v1 - (splat<1>(rw0) * frictionImpulse[0]);
-			q0v2 = q0v2 - (splat<2>(rw0) * frictionImpulse[0]);
-
-			q1v0 = q1v0 - (splat<0>(rw1) * frictionImpulse[1]);
-			q1v1 = q1v1 - (splat<1>(rw1) * frictionImpulse[1]);
-			q1v2 = q1v2 - (splat<2>(rw1) * frictionImpulse[1]);
-
-			q2v0 = q2v0 - (splat<0>(rw2) * frictionImpulse[2]);
-			q2v1 = q2v1 - (splat<1>(rw2) * frictionImpulse[2]);
-			q2v2 = q2v2 - (splat<2>(rw2) * frictionImpulse[2]);
-
-			q3v0 = q3v0 - (splat<0>(rw3) * frictionImpulse[3]);
-			q3v1 = q3v1 - (splat<1>(rw3) * frictionImpulse[3]);
-			q3v2 = q3v2 - (splat<2>(rw3) * frictionImpulse[3]);
+			//            (weight         * frictionImpulse   ) &  mask away if particle is static
+			q0v0 = q0v0 - ((splat<0>(rw0) * frictionImpulse[0]) & ~(splat<3>(p0v0) == gSimd4fZero));
+			q0v1 = q0v1 - ((splat<1>(rw0) * frictionImpulse[0]) & ~(splat<3>(p0v0) == gSimd4fZero));
+			q0v2 = q0v2 - ((splat<2>(rw0) * frictionImpulse[0]) & ~(splat<3>(p0v0) == gSimd4fZero));
+						  
+			q1v0 = q1v0 - ((splat<0>(rw1) * frictionImpulse[1]) & ~(splat<3>(p0v0) == gSimd4fZero));
+			q1v1 = q1v1 - ((splat<1>(rw1) * frictionImpulse[1]) & ~(splat<3>(p0v0) == gSimd4fZero));
+			q1v2 = q1v2 - ((splat<2>(rw1) * frictionImpulse[1]) & ~(splat<3>(p0v0) == gSimd4fZero));
+						  
+			q2v0 = q2v0 - ((splat<0>(rw2) * frictionImpulse[2]) & ~(splat<3>(p0v0) == gSimd4fZero));
+			q2v1 = q2v1 - ((splat<1>(rw2) * frictionImpulse[2]) & ~(splat<3>(p0v0) == gSimd4fZero));
+			q2v2 = q2v2 - ((splat<2>(rw2) * frictionImpulse[2]) & ~(splat<3>(p0v0) == gSimd4fZero));
+						  
+			q3v0 = q3v0 - ((splat<0>(rw3) * frictionImpulse[3]) & ~(splat<3>(p0v0) == gSimd4fZero));
+			q3v1 = q3v1 - ((splat<1>(rw3) * frictionImpulse[3]) & ~(splat<3>(p0v0) == gSimd4fZero));
+			q3v2 = q3v2 - ((splat<2>(rw3) * frictionImpulse[3]) & ~(splat<3>(p0v0) == gSimd4fZero));
 
 			// write back prev particles
 			storeAligned(prevParticles, vpIt[0] * sizeof(PxVec4), q0v0);
@@ -1487,6 +1518,7 @@ void cloth::SwCollision<T4f>::collideVirtualParticles()
 			T4f s2 = gSimd4fOne + splat<2>(weightScale) * (w2 & splat<2>(mask));
 			T4f s3 = gSimd4fOne + splat<3>(weightScale) * (w3 & splat<3>(mask));
 
+			//we don't care if particles are static here since we are only scaling (zero for static) mass
 			p0v0 = p0v0 * (gSimd4fOneXYZ | (splat<0>(s0) & sMaskW));
 			p0v1 = p0v1 * (gSimd4fOneXYZ | (splat<1>(s0) & sMaskW));
 			p0v2 = p0v2 * (gSimd4fOneXYZ | (splat<2>(s0) & sMaskW));
@@ -1504,21 +1536,22 @@ void cloth::SwCollision<T4f>::collideVirtualParticles()
 			p3v2 = p3v2 * (gSimd4fOneXYZ | (splat<2>(s3) & sMaskW));
 		}
 
-		p0v0 = p0v0 + (splat<0>(rw0) * d0);
-		p0v1 = p0v1 + (splat<1>(rw0) * d0);
-		p0v2 = p0v2 + (splat<2>(rw0) * d0);
-
-		p1v0 = p1v0 + (splat<0>(rw1) * d1);
-		p1v1 = p1v1 + (splat<1>(rw1) * d1);
-		p1v2 = p1v2 + (splat<2>(rw1) * d1);
-
-		p2v0 = p2v0 + (splat<0>(rw2) * d2);
-		p2v1 = p2v1 + (splat<1>(rw2) * d2);
-		p2v2 = p2v2 + (splat<2>(rw2) * d2);
-
-		p3v0 = p3v0 + (splat<0>(rw3) * d3);
-		p3v1 = p3v1 + (splat<1>(rw3) * d3);
-		p3v2 = p3v2 + (splat<2>(rw3) * d3);
+		//            (weight      * delta) & mask away if particle is static
+		p0v0 = p0v0 + ((splat<0>(rw0) * d0) & ~(splat<3>(p0v0) == gSimd4fZero));
+		p0v1 = p0v1 + ((splat<1>(rw0) * d0) & ~(splat<3>(p0v1) == gSimd4fZero));
+		p0v2 = p0v2 + ((splat<2>(rw0) * d0) & ~(splat<3>(p0v2) == gSimd4fZero));
+																
+		p1v0 = p1v0 + ((splat<0>(rw1) * d1) & ~(splat<3>(p1v0) == gSimd4fZero));
+		p1v1 = p1v1 + ((splat<1>(rw1) * d1) & ~(splat<3>(p1v1) == gSimd4fZero));
+		p1v2 = p1v2 + ((splat<2>(rw1) * d1) & ~(splat<3>(p1v2) == gSimd4fZero));
+																
+		p2v0 = p2v0 + ((splat<0>(rw2) * d2) & ~(splat<3>(p2v0) == gSimd4fZero));
+		p2v1 = p2v1 + ((splat<1>(rw2) * d2) & ~(splat<3>(p2v1) == gSimd4fZero));
+		p2v2 = p2v2 + ((splat<2>(rw2) * d2) & ~(splat<3>(p2v2) == gSimd4fZero));
+																
+		p3v0 = p3v0 + ((splat<0>(rw3) * d3) & ~(splat<3>(p3v0) == gSimd4fZero));
+		p3v1 = p3v1 + ((splat<1>(rw3) * d3) & ~(splat<3>(p3v1) == gSimd4fZero));
+		p3v2 = p3v2 + ((splat<2>(rw3) * d3) & ~(splat<3>(p3v2) == gSimd4fZero));
 
 		// write back particles
 		storeAligned(particles, vpIt[0] * sizeof(PxVec4), p0v0);
diff --git a/NvCloth/src/SwFabric.cpp b/NvCloth/src/SwFabric.cpp
index 9830398..b8d617f 100644
--- a/NvCloth/src/SwFabric.cpp
+++ b/NvCloth/src/SwFabric.cpp
@@ -162,7 +162,7 @@ uint32_t cloth::SwFabric::getNumStiffnessValues() const
 
 uint32_t cloth::SwFabric::getNumSets() const
 {
-	return uint32_t(mSets.size() - 1);
+	return uint32_t(physx::PxMax(0, static_cast<int>(mSets.size() - 1)));
 }
 
 uint32_t cloth::SwFabric::getNumIndices() const
diff --git a/NvCloth/src/SwInterCollision.cpp b/NvCloth/src/SwInterCollision.cpp
index 50be414..bc46ea6 100644
--- a/NvCloth/src/SwInterCollision.cpp
+++ b/NvCloth/src/SwInterCollision.cpp
@@ -49,6 +49,7 @@ const Simd4fTupleFactory sMaskW = simd4f(simd4i(0, 0, 0, ~0));
 const Simd4fScalarFactory sEpsilon = simd4f(FLT_EPSILON);
 const Simd4fTupleFactory sZeroW = simd4f(-FLT_MAX, -FLT_MAX, -FLT_MAX, 0.0f);
 
+// Same as radixSort from SwSelfCollision.cpp but with uint32_t instead of uint16_t
 // returns sorted indices, output needs to be at least 2*(last - first) + 1024
 void radixSort(const uint32_t* first, const uint32_t* last, uint32_t* out)
 {
@@ -226,13 +227,15 @@ uint32_t calculatePotentialColliders(const cloth::SwInterCollisionData* cBegin,
 
 	uint32_t* sortedIndices = static_cast<uint32_t*>(allocator.allocate(numCloths * sizeof(uint32_t)));
 
+	// fill clothBounds, sortedIndices, and calculate totalClothBounds in world space
 	for (uint32_t i = 0; i < numCloths; ++i)
 	{
 		const SwInterCollisionData& c = cBegin[i];
 
-		// transform bounds from b local space to local space of a
+		// grow bounds with the collision distance colDist
 		PxBounds3 lcBounds = PxBounds3::centerExtents(c.mBoundsCenter, c.mBoundsHalfExtent + PxVec3(array(colDist)[0]));
 		NV_CLOTH_ASSERT(!lcBounds.isEmpty());
+		// transform bounds to world space
 		PxBounds3 cWorld = PxBounds3::transformFast(c.mGlobalPose,lcBounds);
 
 		BoundingBox cBounds = { simd4f(cWorld.minimum.x, cWorld.minimum.y, cWorld.minimum.z, 0.0f),
@@ -244,9 +247,11 @@ uint32_t calculatePotentialColliders(const cloth::SwInterCollisionData* cBegin,
 		totalClothBounds = expandBounds(totalClothBounds, cBounds);
 	}
 
-	// sort indices by their minimum extent on the longest axis
+	// The sweep axis is the longest extent of totalClothBounds
+	// 0 = x axis, 1 = y axis, etc. so that vectors can be indexed using v[sweepAxis]
 	const uint32_t sweepAxis = longestAxis(totalClothBounds.mUpper - totalClothBounds.mLower);
 
+	// sort indices by their minimum extent on the sweep axis
 	ClothSorter<T4f> predicate(clothBounds, numCloths, sweepAxis);
 	shdfnd::sort(sortedIndices, numCloths, predicate, nv::cloth::NonTrackingAllocator());
 
@@ -270,10 +275,10 @@ uint32_t calculatePotentialColliders(const cloth::SwInterCollisionData* cBegin,
 		uint32_t overlapMask = 0;
 		uint32_t numOverlaps = 0;
 
-		// scan back to find first intersecting bounding box
-		uint32_t startIndex = i;
-		while (startIndex > 0 && array(clothBounds[sortedIndices[startIndex]].mUpper)[sweepAxis] > axisMin)
-			--startIndex;
+		// scan forward to skip non intersecting bounds
+		uint32_t startIndex = 0;
+		while(startIndex < numCloths && array(clothBounds[sortedIndices[startIndex]].mUpper)[sweepAxis] < axisMin)
+			startIndex++;
 
 		// compute all overlapping bounds
 		for (uint32_t j = startIndex; j < numCloths; ++j)
@@ -361,7 +366,7 @@ uint32_t calculatePotentialColliders(const cloth::SwInterCollisionData* cBegin,
 
 				// output each particle only once
 				++numParticles;
-				break;
+				break; // the particle only has to be inside one of the bounds, it doesn't matter if they are in more than one
 			}
 		}
 	}
@@ -401,7 +406,7 @@ void cloth::SwInterCollision<T4f>::operator()()
 		// world bounds of particles
 		BoundingBox<T4f> bounds = emptyBounds<T4f>();
 
-		// calculate potentially colliding set
+		// calculate potentially colliding set (based on bounding boxes)
 		{
 			NV_CLOTH_PROFILE_ZONE("cloth::SwInterCollision::BroadPhase", /*ProfileContext::None*/ 0);
 
@@ -415,6 +420,8 @@ void cloth::SwInterCollision<T4f>::operator()()
 		{
 			NV_CLOTH_PROFILE_ZONE("cloth::SwInterCollision::Collide", /*ProfileContext::None*/ 0);
 
+			//Note: this code is almost the same as cloth::SwSelfCollision<T4f>::operator()
+
 			T4f lowerBound = bounds.mLower;
 			T4f edgeLength = max(bounds.mUpper - lowerBound, sEpsilon);
 
@@ -423,11 +430,12 @@ void cloth::SwInterCollision<T4f>::operator()()
 			uint32_t hashAxis0 = (sweepAxis + 1) % 3;
 			uint32_t hashAxis1 = (sweepAxis + 2) % 3;
 
-			// reserve 0, 127, and 65535 for sentinel
+			// reserve 0, 255, and 65535 for sentinel
 			T4f cellSize = max(mCollisionDistance, simd4f(1.0f / 253) * edgeLength);
 			array(cellSize)[sweepAxis] = array(edgeLength)[sweepAxis] / 65533;
 
 			T4f one = gSimd4fOne;
+			// +1 for sentinel 0 offset
 			T4f gridSize = simd4f(254.0f);
 			array(gridSize)[sweepAxis] = 65534.0f;
 
@@ -564,6 +572,7 @@ size_t cloth::SwInterCollision<T4f>::getBufferSize(uint32_t numParticles)
 template <typename T4f>
 void cloth::SwInterCollision<T4f>::collideParticle(uint32_t index)
 {
+	// The other particle is passed through member variables (mParticle)
 	uint16_t clothIndex = mClothIndices[index];
 
 	if ((1 << clothIndex) & ~mClothMask)
@@ -574,6 +583,8 @@ void cloth::SwInterCollision<T4f>::collideParticle(uint32_t index)
 	uint32_t particleIndex = mParticleIndices[index];
 	T4f& particle = reinterpret_cast<T4f&>(instance->mParticles[particleIndex]);
 
+
+	//very similar to cloth::SwSelfCollision<T4f>::collideParticles
 	T4f diff = particle - mParticle;
 	T4f distSqr = dot3(diff, diff);
 
@@ -609,6 +620,8 @@ void cloth::SwInterCollision<T4f>::collideParticles(const uint32_t* keys, uint32
                                                        const uint32_t* indices, uint32_t numParticles,
                                                        uint32_t collisionDistance)
 {
+	//very similar to cloth::SwSelfCollision<T4f>::collideParticles
+
 	const uint32_t bucketMask = uint16_t(-1);
 
 	const uint32_t keyOffsets[] = { 0, 0x00010000, 0x00ff0000, 0x01000000, 0x01010000 };
@@ -639,8 +652,9 @@ void cloth::SwInterCollision<T4f>::collideParticles(const uint32_t* keys, uint32
 				++kIt;
 			kLast[k] = kIt;
 
-			// jump forward once to second column
-			kIt = keys + firstColumnSize;
+			// jump forward once to second column to go from cell offset 1 to 2 quickly
+			if(firstColumnSize)
+				kIt = keys + firstColumnSize;
 			firstColumnSize = 0;
 		}
 	}
diff --git a/NvCloth/src/SwSelfCollision.cpp b/NvCloth/src/SwSelfCollision.cpp
index ec5a166..f39217b 100644
--- a/NvCloth/src/SwSelfCollision.cpp
+++ b/NvCloth/src/SwSelfCollision.cpp
@@ -45,6 +45,7 @@ namespace
 // returns sorted indices, output needs to be at least 2*(last - first) + 1024
 void radixSort(const uint32_t* first, const uint32_t* last, uint16_t* out)
 {
+	// Note: This function is almost exactly duplicated in SwInterCollision.cpp
 	// this sort uses a radix (bin) size of 256, requiring 4 bins to sort the 32 bit keys
 	uint16_t n = uint16_t(last - first);
 
diff --git a/NvCloth/src/SwSolverKernel.cpp b/NvCloth/src/SwSolverKernel.cpp
index 2181b1e..ab612e2 100644
--- a/NvCloth/src/SwSolverKernel.cpp
+++ b/NvCloth/src/SwSolverKernel.cpp
@@ -276,6 +276,8 @@ void constrainSeparation(T4f* __restrict curIt, const T4f* __restrict curEnd, co
 	}
 }
 
+
+
 /**
     traditional gauss-seidel internal constraint solver
  */
@@ -350,7 +352,7 @@ void solveConstraints(float* __restrict posIt, const float* __restrict rIt, cons
 			erij = erij - multiplier * max(compressionLimit, min(erij, stretchLimit));
 		}
 
-		//ex = er * stiffness / sqrt(epsilon + vw)
+		//ex = er * stiffness / (epsilon + inMass sum)
 		T4f exij = erij * stij * recip(gSimd4fEpsilon + vwij);
 
 		//h = h * ex
@@ -375,7 +377,7 @@ void solveConstraints(float* __restrict posIt, const float* __restrict rIt, cons
 #include "sse2/SwSolveConstraints.h"
 #endif
 
-// calculates upper bound of all position deltas
+// Calculates upper bound of all position deltas
 template <typename T4f>
 T4f calculateMaxDelta(const T4f* prevIt, const T4f* curIt, const T4f* curEnd)
 {
@@ -391,6 +393,11 @@ void applyWind(T4f* __restrict curIt, const T4f* __restrict prevIt, const uint16
                const uint16_t* __restrict tEnd, float itrDtf, float dragCoefficientf, float liftCoefficientf, float fluidDensityf, T4f wind,
                const T4f (&rotation)[3])
 {
+	// Note: Enabling wind can amplify bad behavior since the impulse scales with area,
+	//  and the area of triangles increases when constraints are violated.
+	// Using the initial triangle area based on the rest length is one possible way to
+	//  prevent this, but is expensive (and incorrect for intentionally stretchy cloth).
+
 	const T4f dragCoefficient = simd4f(dragCoefficientf);
 	const T4f liftCoefficient = simd4f(liftCoefficientf);
 	const T4f fluidDensity = simd4f(fluidDensityf);
diff --git a/NvCloth/src/TripletScheduler.cpp b/NvCloth/src/TripletScheduler.cpp
index 999be0e..0116200 100644
--- a/NvCloth/src/TripletScheduler.cpp
+++ b/NvCloth/src/TripletScheduler.cpp
@@ -39,45 +39,132 @@ cloth::TripletScheduler::TripletScheduler(Range<const uint32_t[4]> triplets)
 {
 }
 
-// SSE version
+// helper functions for TripletScheduler::simd()
+namespace
+{
+	//linear search should be fine for small particlesInBatchSize values
+	bool isIndexInBatch(uint32_t* particlesInBatch, int particlesInBatchSize, uint32_t index1, uint32_t index2, uint32_t index3)
+	{
+		for(int i = 0; i < particlesInBatchSize; i++)
+		{
+			if(particlesInBatch[i] == 0xffffffff)
+				return false;
+			if(index1 == particlesInBatch[i] || index2 == particlesInBatch[i] || index3 == particlesInBatch[i])
+				return true;
+		}
+		return false;
+	}
+
+	void markIndicesInBatch(uint32_t* particlesInBatch, int particlesInBatchSize, uint32_t index1, uint32_t index2, uint32_t index3)
+	{
+		for(int i = 0; i < particlesInBatchSize - 2; i++)
+		{
+			if(particlesInBatch[i] == 0xffffffff)
+			{
+				NV_CLOTH_ASSERT(i + 2 < particlesInBatchSize);
+				particlesInBatch[i] = index1;
+				particlesInBatch[i + 1] = index2;
+				particlesInBatch[i + 2] = index3;
+				break;
+			}
+		}
+	}
+}
+
+/*
+Group triplets in batches of simdWith so that they can be processed in parallel without referencing the same particle.
+Not suitable for simdWidth with large values due to linear search.
+Note that sets can be larger that simdWidth, and that particles may be referenced from the same set multiple times, 
+	as long as they are not referenced more than once within a batch of simdWidth. The batch resets at the start of 
+	each set.
+*/
 void cloth::TripletScheduler::simd(uint32_t numParticles, uint32_t simdWidth)
 {
-	if (mTriplets.empty())
+	PX_UNUSED(numParticles);
+	if(mTriplets.empty())
 		return;
 
-	Vector<uint32_t>::Type mark(numParticles, uint32_t(-1));
+	const int particlesInBatchSize = simdWidth * 3;
+	uint32_t* particlesInBatch = new uint32_t[particlesInBatchSize]; //used to mark used indices in current batch
+	int setSize = 0; //number of triplets in current set
+	int amountOfPaddingNeeded = 0;
 
-	uint32_t setIndex = 0, setSize = 0;
-	for (TripletIter tIt = mTriplets.begin(), tEnd = mTriplets.end(); tIt != tEnd; ++setIndex)
+	for(TripletIter tIt = mTriplets.begin(), tEnd = mTriplets.end(); tIt != tEnd; tIt++)
 	{
-		TripletIter tLast = tIt + std::min(simdWidth, uint32_t(tEnd - tIt));
-		TripletIter tSwap = tEnd;
-
-		for (; tIt != tLast && tIt != tSwap; ++tIt, ++setSize)
+		if(setSize%simdWidth == 0)
 		{
-			// swap from tail until independent triplet found
-			while ((mark[tIt->x] == setIndex || mark[tIt->y] == setIndex || mark[tIt->z] == setIndex) && tIt != --tSwap)
-				std::iter_swap(tIt, tSwap);
+			//reset particlesInBatch if we filled the simd width
+			memset(particlesInBatch, 0xff, sizeof(uint32_t)*particlesInBatchSize);
+		}
 
-			if (tIt == tSwap)
-				break; // no independent triplet found
+		TripletIter tSwap = tIt;
 
-			// mark vertices to be used in simdIndex
-			mark[tIt->x] = setIndex;
-			mark[tIt->y] = setIndex;
-			mark[tIt->z] = setIndex;
+		//Look for the next triplet that doesn't share indices with current batch
+		while(isIndexInBatch(particlesInBatch, particlesInBatchSize, tSwap->x, tSwap->y, tSwap->z))
+		{
+			tSwap++;
+			if(tSwap == tEnd)
+				break;
 		}
 
-		if (tIt == tSwap) // remaining triplets depend on current set
+		if(tSwap == tEnd)
 		{
-			if (setSize > simdWidth) // trim set to multiple of simdWidth
-			{
-				uint32_t overflow = setSize % simdWidth;
-				setSize -= overflow;
-				tIt -= overflow;
-			}
+			//all remaining triplets are share indices with the current batch
+
+			//This doesn't make sense, as it will just introduce an extra set with setSize<simdWidth
+			//Should double check though
+			//  // trim set to multiple of simdWidth with the hope that the trimmed triples
+			//  //  will fit in the next set with a multiple of simdWidth size
+			//  if(setSize > (int)simdWidth)
+			//  {
+			//  	uint32_t overflow = setSize % simdWidth;
+			//  	setSize -= overflow;
+			//  	tIt -= overflow;
+			//  }
+
+			//finish set
 			mSetSizes.pushBack(setSize);
+			amountOfPaddingNeeded += (static_cast<int>(simdWidth) - (setSize % static_cast<int>(simdWidth))) % static_cast<int>(simdWidth); //last modulo avoids adding padding when setSize%simdWidth==0
 			setSize = 0;
+			tIt--; //start next set with current triplet
+		}
+		else
+		{
+			//add triplet to set
+			markIndicesInBatch(particlesInBatch, particlesInBatchSize, tSwap->x, tSwap->y, tSwap->z);
+			std::iter_swap(tIt, tSwap); //Place triplet at the end of the current set, so that they in sequence in mTriplets
+			setSize++;
+		}
+	}
+	if(setSize)
+	{
+		//finish last set, if we have one
+		mSetSizes.pushBack(setSize);
+		amountOfPaddingNeeded += (static_cast<int>(simdWidth) - (setSize % static_cast<int>(simdWidth)))% static_cast<int>(simdWidth);
+	}
+
+	// Padding code used to live in SwCloth::setVirtualParticles, now we do it here directly 
+	mPaddedTriplets.reserve(mTriplets.size() + amountOfPaddingNeeded);
+
+	{
+		Vec4us paddingDummy(static_cast<uint16_t>(numParticles), static_cast<uint16_t>(numParticles) + 1, static_cast<uint16_t>(numParticles) + 2, 0);
+
+		TripletIter tIt = mTriplets.begin();
+		//TripletIter tEnd = mTriplets.end();
+		SetIter setIt = mSetSizes.begin();
+		SetIter setEnd = mSetSizes.end();
+		for(; setIt < setEnd; ++setIt)
+		{
+			for(unsigned int i = 0; i < *setIt; i++)
+			{
+				mPaddedTriplets.pushBack(*tIt);
+				++tIt;
+			}
+			unsigned int padding = (static_cast<int>(simdWidth) - (*setIt % static_cast<int>(simdWidth))) % static_cast<int>(simdWidth);
+			for(unsigned int i = 0; i < padding; i++)
+			{
+				mPaddedTriplets.pushBack(paddingDummy);
+			}
 		}
 	}
 }
@@ -95,8 +182,10 @@ struct TripletSet
 	}
 
 	uint32_t mMark; // triplet index
-	uint8_t mNumReplays[3];
-	uint8_t mNumConflicts[3][32];
+
+	// [3] because each particle is fetched on a different instruction.
+	uint8_t mNumReplays[3]; //how many times the instruction needs to be executed to resolve all bank conflicts (= maxElement(mNumConflicts[3][0...31]))
+	uint8_t mNumConflicts[3][32]; //Number of accesses for each of the 32 banks (if it is 1 then there is no conflict on that bank)
 };
 
 /*
@@ -132,83 +221,147 @@ void prefixSum(Iter first, Iter last, Iter dest)
 			*dest = *(dest - 1) + *first;
 	}
 }
-}
 
-// CUDA version
-void cloth::TripletScheduler::warp(uint32_t numParticles, uint32_t warpWidth)
+class AdjacencyQuerier
 {
-	// NV_CLOTH_ASSERT(warpWidth == 32 || warpWidth == 16);
+private:
+	typedef nv::cloth::Vector<uint32_t>::Type UintVector;
+	typedef nv::cloth::Vector<uint32_t>::Type::Iterator UintVectorIterator;
+	typedef nv::cloth::Vector<nv::cloth::Vec4u>::Type::ConstIterator ConstTripletIter;
+	UintVector mAdjacencyIndecies;
+	UintVector mAdjacencies;
+	uint32_t mMaxAdjacentCount;
+
+public:
+	AdjacencyQuerier(nv::cloth::Vector<nv::cloth::Vec4u>::Type const& triplets, int particleCount)
+	{
+		{
+			UintVector& adjacencyCount = mAdjacencyIndecies; //use same memory as mAdjacencyIndecies
+			adjacencyCount.resize(particleCount+1, uint32_t(0)); //initialize count to 0. Size+1 used later for prefixSum/indices
 
-	if (mTriplets.empty())
-		return;
+			// count the number of triplets per particle
+			for(ConstTripletIter tIt = triplets.begin(); tIt != triplets.end(); ++tIt)
+			{
+				for(int i = 0; i < 3; i++) //for each index in the triplet
+				{
+					const uint32_t particleIndex = (*tIt)[i];
+					adjacencyCount[particleIndex] += 1;
+				}
+			}
+			//adjacentcyCount[i] now holds the number of triplets referring to particle i
 
-	TripletIter tIt, tEnd = mTriplets.end();
-	uint32_t tripletIndex;
+			mMaxAdjacentCount = *physx::shdfnd::maxElement(adjacencyCount.begin(), adjacencyCount.end());
 
-	// count number of triplets per particle
-	Vector<uint32_t>::Type adjacentCount(numParticles + 1, uint32_t(0));
-	for (tIt = mTriplets.begin(); tIt != tEnd; ++tIt)
-		for (int i = 0; i < 3; ++i)
-			++adjacentCount[(*tIt)[i]];
+			// compute in place prefix sum (inclusive)
+			prefixSum(adjacencyCount.begin(), adjacencyCount.end(), mAdjacencyIndecies.begin());
+			// we use mAdjacencyIndecies from now on
+		}
 
-	/* neither of those were really improving number of batches:
-	// run simd version to pre-sort particles
-	simd(numParticles, blockWidth); mSetSizes.resize(0);
-	// sort according to triplet degree (estimated by sum of adjacentCount)
-	std::sort(mTriplets.begin(), tEnd, GreaterSum(adjacentCount));
-	*/
+		mAdjacencies.resize(mAdjacencyIndecies.back());
+		uint32_t tripletIndex = 0;
+		for(ConstTripletIter tIt = triplets.begin(); tIt != triplets.end(); ++tIt, ++tripletIndex)
+		{
+			for(int i = 0; i < 3; i++) //for each index in the triplet
+			{
+				const uint32_t particleIndex = (*tIt)[i];
 
-	uint32_t maxTripletCount = *shdfnd::maxElement(adjacentCount.begin(), adjacentCount.end());
+				//Decrement mAdjacencyIndecies here to convert it from inclusive to exclusive prefix sum
+				const uint32_t adjacencyIndex = --mAdjacencyIndecies[particleIndex];
 
-	// compute in place prefix sum (inclusive)
-	prefixSum(adjacentCount.begin(), adjacentCount.end(), adjacentCount.begin());
+				mAdjacencies[adjacencyIndex] = tripletIndex;
+			}
+		}
+		//mAdjacencies[mAdjacencyIndecies[i]] now stores the first triplet index referring to particle i
+		//mAdjacencies[mAdjacencyIndecies[i+1]] stores one past the last
+	}
 
-	// initialize adjacencies (for each particle, collect touching triplets)
-	// also converts partial sum in adjacentCount from inclusive to exclusive
-	Vector<uint32_t>::Type adjacencies(adjacentCount.back());
-	for (tIt = mTriplets.begin(), tripletIndex = 0; tIt != tEnd; ++tIt, ++tripletIndex)
-		for (int i = 0; i < 3; ++i)
-			adjacencies[--adjacentCount[(*tIt)[i]]] = tripletIndex;
+	uint32_t getMaxAdjacentCount() const
+	{ 
+		return mMaxAdjacentCount;
+	}
+
+	nv::cloth::Range<const uint32_t> getAdjacentTriplets(uint32_t particleIndex) const
+	{
+		//use .begin() + offset to get around bounds check
+		auto begin = &*(mAdjacencies.begin() + mAdjacencyIndecies[particleIndex]);
+		auto end = &*(mAdjacencies.begin() + mAdjacencyIndecies[particleIndex + 1]);
+		return nv::cloth::Range<const uint32_t>(begin, end);
+	}
+};
+
+}
+
+// CUDA version. Doesn't add padding, optimizes for bank conflicts
+void cloth::TripletScheduler::warp(uint32_t numParticles, uint32_t warpWidth)
+{
+	// warpWidth has to be <= 32 and a power of two
+	NV_CLOTH_ASSERT(warpWidth == 32 || warpWidth == 16);
+
+	if (mTriplets.empty())
+		return;
+
+	AdjacencyQuerier adjacencyQuerier(mTriplets, numParticles);
 
 	uint32_t warpMask = warpWidth - 1;
 
-	uint32_t numSets = maxTripletCount; // start with minimum number of sets
+	uint32_t numSets = adjacencyQuerier.getMaxAdjacentCount(); // start with minimum number of sets
 	Vector<TripletSet>::Type sets(numSets);
+
+	// maps triplets to the set index that contains the triplet (setIndex = setIndices[tripletIndex])
 	Vector<uint32_t>::Type setIndices(mTriplets.size(), uint32_t(-1));
+
 	mSetSizes.resize(numSets);
 
+	uint32_t tripletIndex = 0;
+	TripletIter tIt, tEnd = mTriplets.end();
+
 	// color triplets (assign to sets)
-	Vector<uint32_t>::Type::ConstIterator aBegin = adjacencies.begin(), aIt, aEnd;
-	for (tIt = mTriplets.begin(), tripletIndex = 0; tIt != tEnd; ++tIt, ++tripletIndex)
+	for (tIt = mTriplets.begin(); tIt != tEnd; ++tIt, ++tripletIndex)
 	{
-		// mark sets of adjacent triplets
+		// mark sets with adjacent triplets
 		for (int i = 0; i < 3; ++i)
 		{
 			uint32_t particleIndex = (*tIt)[i];
-			aIt = aBegin + adjacentCount[particleIndex];
-			aEnd = aBegin + adjacentCount[particleIndex + 1];
-			for (uint32_t setIndex; aIt != aEnd; ++aIt)
-				if (numSets > (setIndex = setIndices[*aIt]))
+			Range<const uint32_t> adjacentTriplets = adjacencyQuerier.getAdjacentTriplets(particleIndex);
+			
+			//for all triplets adjacent to (sharing) particle 'particleIndex'
+			for (; adjacentTriplets.begin() != adjacentTriplets.end(); adjacentTriplets.popFront())
+			{
+				uint32_t setIndex;
+				if(numSets > (setIndex = setIndices[adjacentTriplets.front()]))
+				{
+					//the set 'setIndex' has an adjacent triplet in it (sharing at least one vertex with tripletIndex)
 					sets[setIndex].mMark = tripletIndex;
+				}
+			}
 		}
 
-		// find valid set with smallest number of bank conflicts
+		// find valid set with smallest number of additional replays when adding triplet tIt
 		uint32_t bestIndex = numSets;
-		uint32_t minReplays = 4;
-		for (uint32_t setIndex = 0; setIndex < numSets && minReplays; ++setIndex)
+		uint32_t minAdditionalReplays = 4; // one more than the maximum possible (for 3 particles)
+		for (uint32_t setIndex = 0; setIndex < numSets && minAdditionalReplays; ++setIndex)
 		{
 			const TripletSet& set = sets[setIndex];
 
 			if (set.mMark == tripletIndex)
-				continue; // triplet collision
+				continue; // triplet collision (this set contains an adjacent triplet)
 
+			// count additional replays caused by inserting triplet 'tIt' to in set 'setIndex'
 			uint32_t numReplays = 0;
-			for (uint32_t i = 0; i < 3; ++i)
-				numReplays += set.mNumReplays[i] == set.mNumConflicts[i][warpMask & (*tIt)[i]];
+			for(uint32_t i = 0; i < 3; ++i) //for each particle in the triplet
+			{
+				const uint32_t particleIndex = (*tIt)[i];
+				const uint32_t bank = warpMask & particleIndex;
 
-			if (minReplays > numReplays)
+				// an additional replay will only be caused if the additional bank conflict is
+				//  on a bank that is the current bottleneck
+				numReplays += set.mNumReplays[i] == set.mNumConflicts[i][bank];
+			}
+
+			// hold on to this set if it has less additional replays compared to our current best set
+			if (minAdditionalReplays > numReplays)
 			{
-				minReplays = numReplays;
+				minAdditionalReplays = numReplays;
 				bestIndex = setIndex;
 			}
 		}
@@ -221,18 +374,26 @@ void cloth::TripletScheduler::warp(uint32_t numParticles, uint32_t warpWidth)
 			++numSets;
 		}
 
-		// increment bank conflicts or reset if warp filled
+		// increment mNumReplays or reset if warp filled
 		TripletSet& set = sets[bestIndex];
 		if (++mSetSizes[bestIndex] & warpMask)
+		{
 			for (uint32_t i = 0; i < 3; ++i)
-				set.mNumReplays[i] = std::max(set.mNumReplays[i], ++set.mNumConflicts[i][warpMask & (*tIt)[i]]);
+			{
+				const uint32_t particleIndex = (*tIt)[i];
+				const uint32_t bank = warpMask & particleIndex;
+				set.mNumReplays[i] = std::max(set.mNumReplays[i], ++set.mNumConflicts[i][bank]);
+			}
+		}
 		else
+		{ 
 			set = TripletSet();
+		}
 
 		setIndices[tripletIndex] = bestIndex;
 	}
 
-	// reorder triplets
+	// reorder triplets so that the sets are in continuous memory
 	Vector<uint32_t>::Type setOffsets(mSetSizes.size());
 	prefixSum(mSetSizes.begin(), mSetSizes.end(), setOffsets.begin());
 
@@ -243,3 +404,47 @@ void cloth::TripletScheduler::warp(uint32_t numParticles, uint32_t warpWidth)
 
 	mTriplets.swap(triplets);
 }
+
+
+/* Interface used for unit tests */
+cloth::TripletSchedulerTestInterface::TripletSchedulerTestInterface(Range<const uint32_t[4]> triplets)
+	:
+	mScheduler(new cloth::TripletScheduler(triplets))
+{}
+
+cloth::TripletSchedulerTestInterface::~TripletSchedulerTestInterface()
+{
+	delete mScheduler;
+}
+void cloth::TripletSchedulerTestInterface::simd(uint32_t numParticles, uint32_t simdWidth)
+{
+	mScheduler->simd(numParticles, simdWidth);
+}
+void cloth::TripletSchedulerTestInterface::warp(uint32_t numParticles, uint32_t warpWidth)
+{
+	mScheduler->warp(numParticles, warpWidth);
+}
+
+cloth::Range<const uint32_t> cloth::TripletSchedulerTestInterface::GetTriplets()
+{
+	return Range<const uint32_t>(
+		reinterpret_cast<uint32_t*>(mScheduler->mTriplets.begin()),
+		reinterpret_cast<uint32_t*>(mScheduler->mTriplets.begin() + mScheduler->mTriplets.size())
+		); 
+}
+cloth::Range<const uint32_t> cloth::TripletSchedulerTestInterface::GetSetSizes()
+{
+	return Range<const uint32_t>(
+		mScheduler->mSetSizes.begin(),
+		mScheduler->mSetSizes.begin() + mScheduler->mSetSizes.size()
+		);
+}
+
+NV_CLOTH_API(nv::cloth::TripletSchedulerTestInterface*) NvClothCreateTripletScheduler(nv::cloth::Range<const uint32_t[4]> triplets)
+{
+	return new nv::cloth::TripletSchedulerTestInterface(triplets);
+}
+NV_CLOTH_API(void) NvClothDestroyTripletScheduler(nv::cloth::TripletSchedulerTestInterface* interface)
+{
+	delete interface;
+}
diff --git a/NvCloth/src/TripletScheduler.h b/NvCloth/src/TripletScheduler.h
index 26119cf..81dc889 100644
--- a/NvCloth/src/TripletScheduler.h
+++ b/NvCloth/src/TripletScheduler.h
@@ -43,6 +43,7 @@ struct TripletScheduler
 {
 	typedef Vector<Vec4u>::Type::ConstIterator ConstTripletIter;
 	typedef Vector<Vec4u>::Type::Iterator TripletIter;
+	typedef Vector<uint32_t>::Type::Iterator SetIter;
 
 	TripletScheduler(Range<const uint32_t[4]>);
 	void simd(uint32_t numParticles, uint32_t simdWidth);
@@ -50,6 +51,32 @@ struct TripletScheduler
 
 	Vector<Vec4u>::Type mTriplets;
 	Vector<uint32_t>::Type mSetSizes;
+	Vector<Vec4us>::Type mPaddedTriplets;
 };
 }
 }
+
+//Make TripletScheduler available for the unit tests.
+namespace nv
+{
+namespace cloth
+{
+struct NV_CLOTH_IMPORT TripletSchedulerTestInterface
+{
+private:
+	TripletScheduler* mScheduler;
+
+public:
+	TripletSchedulerTestInterface(Range<const uint32_t[4]> triplets);
+	~TripletSchedulerTestInterface();
+	void simd(uint32_t numParticles, uint32_t simdWidth);
+	void warp(uint32_t numParticles, uint32_t warpWidth);
+
+	Range<const uint32_t> GetTriplets();
+	Range<const uint32_t> GetSetSizes();
+};
+}
+}
+
+NV_CLOTH_API(nv::cloth::TripletSchedulerTestInterface*) NvClothCreateTripletScheduler(nv::cloth::Range<const uint32_t[4]>);
+NV_CLOTH_API(void) NvClothDestroyTripletScheduler(nv::cloth::TripletSchedulerTestInterface*);
diff --git a/NvCloth/src/cuda/CuCloth.h b/NvCloth/src/cuda/CuCloth.h
index 8dc3082..f7d8268 100644
--- a/NvCloth/src/cuda/CuCloth.h
+++ b/NvCloth/src/cuda/CuCloth.h
@@ -131,7 +131,7 @@ class CuCloth : protected CuContextLock, public ClothImpl<CuCloth>
 	uint32_t getSharedMemorySize() const; // without particle data
 
 	// expects transformed configs, doesn't call notifyChanged()
-	void setPhaseConfigInternal(Range<const PhaseConfig>);
+	void setPhaseConfigInternal(Range<const PhaseConfig> configs);
 
 	Range<physx::PxVec4> push(CuConstraints&);
 	void clear(CuConstraints&);
diff --git a/NvCloth/src/cuda/CuCollision.h b/NvCloth/src/cuda/CuCollision.h
index aeb2bda..f9b69f7 100644
--- a/NvCloth/src/cuda/CuCollision.h
+++ b/NvCloth/src/cuda/CuCollision.h
@@ -1160,9 +1160,10 @@ __device__ void apply(ParticleDataT& particleData, const int4& indices, const fl
 	posX.y += delta.y * weights.x; posY.y += delta.y * weights.y; posZ.y += delta.y * weights.z;
 	posX.z += delta.z * weights.x; posY.z += delta.z * weights.y; posZ.z += delta.z * weights.z;
 
-	particleData(indices.x) = posX;
-	particleData(indices.y) = posY;
-	particleData(indices.z) = posZ;
+	//if(particle is not static)        store new position
+	if(particleData(indices.x,3)!= 0) particleData(indices.x) = posX;
+	if(particleData(indices.y,3)!= 0) particleData(indices.y) = posY;
+	if(particleData(indices.z,3)!= 0) particleData(indices.z) = posZ;
 }
 #else
 template <typename PointerT>
diff --git a/NvCloth/src/cuda/CuFabric.cpp b/NvCloth/src/cuda/CuFabric.cpp
index 957f912..6794fa5 100644
--- a/NvCloth/src/cuda/CuFabric.cpp
+++ b/NvCloth/src/cuda/CuFabric.cpp
@@ -165,7 +165,8 @@ uint32_t cloth::CuFabric::getNumStiffnessValues() const
 
 uint32_t cloth::CuFabric::getNumSets() const
 {
-	return uint32_t(mSets.size() - 1);
+	return uint32_t(physx::PxMax(0, (int)mSets.size() - 1));
+
 }
 
 uint32_t cloth::CuFabric::getNumIndices() const
diff --git a/NvCloth/src/cuda/CuFactory.cpp b/NvCloth/src/cuda/CuFactory.cpp
index 397262c..df52dcd 100644
--- a/NvCloth/src/cuda/CuFactory.cpp
+++ b/NvCloth/src/cuda/CuFactory.cpp
@@ -97,6 +97,7 @@ void cloth::checkSuccessImpl(CUresult err, const char* file, const int line)
 			ADD_CASE(CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES);
 			ADD_CASE(CUDA_ERROR_LAUNCH_TIMEOUT);
 			ADD_CASE(CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING);
+			ADD_CASE(CUDA_ERROR_ILLEGAL_ADDRESS);
 		default:
 			ADD_CASE(CUDA_ERROR_UNKNOWN);
 #undef ADD_CASE
@@ -115,8 +116,9 @@ uint32_t getMaxThreadsPerBlock(CUcontext context)
 	CUdevice device;
 	checkSuccess(cuCtxGetDevice(&device));
 
-	int major = 0;
+	int major = 0, minor = 0;
 	checkSuccess(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device));
+	checkSuccess(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device));
 
 	checkSuccess(cuCtxPopCurrent(nullptr));
 
@@ -149,7 +151,7 @@ cloth::Fabric* cloth::CuFactory::createFabric(uint32_t numParticles, Range<const
                                               Range<const float> tetherLengths, Range<const uint32_t> triangles)
 {
 	return NV_CLOTH_NEW(CuFabric)(*this, numParticles, phaseIndices, sets, restvalues, stiffnessValues, indices, anchors, tetherLengths, triangles,
-	                    getNextFabricId());
+								  getNextFabricId());
 }
 
 cloth::Cloth* cloth::CuFactory::createCloth(Range<const PxVec4> particles, Fabric& fabric)
@@ -210,6 +212,7 @@ void cloth::CuFactory::extractFabricData(const Fabric& fabric, Range<uint32_t> p
 
 	if (!sets.empty())
 	{
+		// need to skip copying the first element
 		NV_CLOTH_ASSERT(sets.size() == cuFabric.mSets.size() - 1);
 		const uint32_t* deviceSets = cuFabric.mSets.begin().get();
 		copyToHost(deviceSets + 1, deviceSets + cuFabric.mSets.size(), sets.begin());
diff --git a/NvCloth/src/cuda/CuSolverKernelBlob.h b/NvCloth/src/cuda/CuSolverKernelBlob.h
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/NvCloth/src/cuda/CuSolverKernelBlob.h
diff --git a/NvCloth/src/dx/DxCloth.cpp b/NvCloth/src/dx/DxCloth.cpp
index 68c3289..0b1521b 100644
--- a/NvCloth/src/dx/DxCloth.cpp
+++ b/NvCloth/src/dx/DxCloth.cpp
@@ -84,9 +84,9 @@ cloth::DxCloth::DxCloth(DxFactory& factory, DxFabric& fabric, Range<const PxVec4
 , mTargetCollisionPlanes(mFactory.mCollisionPlanes)
 , mStartCollisionTriangles(mFactory.mCollisionTriangles)
 , mTargetCollisionTriangles(mFactory.mCollisionTriangles)
-, mVirtualParticleSetSizes(mFactory.mContextManager)
-, mVirtualParticleIndices(mFactory.mContextManager)
-, mVirtualParticleWeights(mFactory.mContextManager)
+, mVirtualParticleSetSizes(mFactory.mVirtualParticleSetSizes)
+, mVirtualParticleIndices(mFactory.mVirtualParticleIndices)
+, mVirtualParticleWeights(mFactory.mVirtualParticleWeights)
 , mRestPositions(mFactory.mRestPositions)
 , mSelfCollisionIndices(mFactory.mSelfCollisionIndices)
 , mSelfCollisionParticles(mFactory.mSelfCollisionParticles)
diff --git a/NvCloth/src/dx/DxCloth.h b/NvCloth/src/dx/DxCloth.h
index 5e783fa..3a04734 100644
--- a/NvCloth/src/dx/DxCloth.h
+++ b/NvCloth/src/dx/DxCloth.h
@@ -91,11 +91,9 @@ class DxCloth : protected DxContextLock, public ClothImpl<DxCloth>
 	typedef DxFabric FabricType;
 	typedef DxContextLock ContextLockType;
 
-	template <typename>
-	struct MapTraits;
-
 	typedef DxVectorMap<DxBatchedVector<physx::PxVec3> > MappedVec3fVectorType;
 	typedef DxVectorMap<DxBatchedVector<physx::PxVec4> > MappedVec4fVectorType;
+	typedef DxVectorMap<DxBatchedVector<Vec4us> > MappedVec4usVectorType;
 	typedef DxVectorMap<DxBatchedVector<IndexPair> > MappedIndexVectorType;
 	typedef DxVectorMap<DxBatchedVector<uint32_t> > MappedMaskVectorType;
 
@@ -190,9 +188,9 @@ class DxCloth : protected DxContextLock, public ClothImpl<DxCloth>
 	float mFriction;
 
 	// virtual particles
-	DxDeviceVector<uint32_t> mVirtualParticleSetSizes;
-	DxDeviceVector<Vec4us> mVirtualParticleIndices;
-	DxDeviceVector<physx::PxVec4> mVirtualParticleWeights;
+	DxBatchedVector<uint32_t> mVirtualParticleSetSizes;
+	DxBatchedVector<Vec4us> mVirtualParticleIndices;
+	DxBatchedVector<physx::PxVec4> mVirtualParticleWeights;
 
 	// self collision
 	float mSelfCollisionDistance;
diff --git a/NvCloth/src/dx/DxClothData.cpp b/NvCloth/src/dx/DxClothData.cpp
index 57049bf..562ce7a 100644
--- a/NvCloth/src/dx/DxClothData.cpp
+++ b/NvCloth/src/dx/DxClothData.cpp
@@ -71,6 +71,8 @@ cloth::DxClothData::DxClothData(DxCloth& cloth)
 
 	mNumCollisionTriangles = uint32_t(cloth.mStartCollisionTriangles.size()) / 3;
 
+	mNumVirtualParticleSetSizes = cloth.mVirtualParticleSetSizes.size();
+
 	mEnableContinuousCollision = cloth.mEnableContinuousCollision;
 	mCollisionMassScale = cloth.mCollisionMassScale;
 	mFrictionScale = cloth.mFriction;
diff --git a/NvCloth/src/dx/DxClothData.h b/NvCloth/src/dx/DxClothData.h
index 4da9be2..855a3c5 100644
--- a/NvCloth/src/dx/DxClothData.h
+++ b/NvCloth/src/dx/DxClothData.h
@@ -46,7 +46,7 @@ typedef unsigned int uint32_t;
 typedef int int32_t;
 #endif
 
-static const uint32_t MaxParticlesInSharedMem = 1971;
+static const uint32_t MaxParticlesInSharedMem = 1969;
 
 
 struct DxPhaseConfig
@@ -124,6 +124,8 @@ struct DxClothData
 
 	uint32_t mNumCollisionTriangles;
 
+	uint32_t mNumVirtualParticleSetSizes;
+
 	uint32_t mEnableContinuousCollision; //bool stored in uint32_t for dx alignment
 	float mCollisionMassScale;
 	float mFrictionScale;
diff --git a/NvCloth/src/dx/DxFabric.cpp b/NvCloth/src/dx/DxFabric.cpp
index 56b2b0a..cf6865a 100644
--- a/NvCloth/src/dx/DxFabric.cpp
+++ b/NvCloth/src/dx/DxFabric.cpp
@@ -62,30 +62,33 @@ cloth::DxFabric::DxFabric(DxFactory& factory, uint32_t numParticles, Range<const
 	NV_CLOTH_ASSERT(restvalues.size() == stiffnessValues.size() || stiffnessValues.size() == 0);
 	NV_CLOTH_ASSERT(mNumParticles > *shdfnd::maxElement(indices.begin(), indices.end()));
 
-	// manually convert uint32_t indices to uint16_t in temp memory
-	Vector<DxConstraint>::Type hostConstraints;
-	hostConstraints.resizeUninitialized(restvalues.size());
-	Vector<DxConstraint>::Type::Iterator cIt = hostConstraints.begin();
-	Vector<DxConstraint>::Type::Iterator cEnd = hostConstraints.end();
-
-	const uint32_t* iIt = indices.begin();
-	const float* rIt = restvalues.begin();
-	for (; cIt != cEnd; ++cIt)
+	//constraints
 	{
-		cIt->mRestvalue = *rIt++;
-		cIt->mFirstIndex = uint16_t(*iIt++);
-		cIt->mSecondIndex = uint16_t(*iIt++);
-	}
-
-	// copy to device vector in one go
+		// manually convert uint32_t indices to uint16_t in temp memory
+		Vector<DxConstraint>::Type hostConstraints;
+		hostConstraints.resizeUninitialized(restvalues.size());
+		Vector<DxConstraint>::Type::Iterator cIt = hostConstraints.begin();
+		Vector<DxConstraint>::Type::Iterator cEnd = hostConstraints.end();
+
+		const uint32_t* iIt = indices.begin();
+		const float* rIt = restvalues.begin();
+		for(; cIt != cEnd; ++cIt)
+		{
+			cIt->mRestvalue = *rIt++;
+			cIt->mFirstIndex = uint16_t(*iIt++);
+			cIt->mSecondIndex = uint16_t(*iIt++);
+		}
+
+		// copy to device vector in one go
 #if 0
 	// Workaround for NvAPI SCG device updateSubresource size limit
-	mConstraintsHostCopy.assign(hostConstraints.begin(), hostConstraints.end());
-	mConstraints.resize(mConstraintsHostCopy.size());
-	mConstraints = mConstraintsHostCopy;
+		mConstraintsHostCopy.assign(hostConstraints.begin(), hostConstraints.end());
+		mConstraints.resize(mConstraintsHostCopy.size());
+		mConstraints = mConstraintsHostCopy;
 #else
-	mConstraints.assign(hostConstraints.begin(), hostConstraints.end());
+		mConstraints.assign(hostConstraints.begin(), hostConstraints.end());
 #endif
+	}
 
 	mStiffnessValues.assign(stiffnessValues.begin(), stiffnessValues.end());
 
diff --git a/NvCloth/src/dx/DxFactory.cpp b/NvCloth/src/dx/DxFactory.cpp
index 91f5125..7298ffe 100644
--- a/NvCloth/src/dx/DxFactory.cpp
+++ b/NvCloth/src/dx/DxFactory.cpp
@@ -96,6 +96,12 @@ typedef Vec4T<uint32_t> Vec4u;
 		, mCollisionPlanesDeviceCopy(mContextManager)
 		, mCollisionTriangles(mContextManager, DxStagingBufferPolicy())
 		, mCollisionTrianglesDeviceCopy(mContextManager)
+		, mVirtualParticleSetSizes(mContextManager, DxStagingBufferPolicy())
+		, mVirtualParticleSetSizesDeviceCopy(mContextManager)
+		, mVirtualParticleIndices(mContextManager, DxStagingBufferPolicy())
+		, mVirtualParticleIndicesDeviceCopy(mContextManager)
+		, mVirtualParticleWeights(mContextManager, DxStagingBufferPolicy())
+		, mVirtualParticleWeightsDeviceCopy(mContextManager)
 		, mMotionConstraints(mContextManager)
 		, mSeparationConstraints(mContextManager)
 		, mRestPositions(mContextManager, DxStagingBufferPolicy())
@@ -205,7 +211,7 @@ void cloth::DxFactory::extractFabricData(const Fabric& fabric, Range<uint32_t> p
 
 	if (!sets.empty())
 	{
-		// need to skip copying the first element
+		// we don't skip first element here
 		NV_CLOTH_ASSERT(sets.size() == dxFabric.mSets.size());
 		memcpy(sets.begin(), dxFabric.mSets.begin(), sets.size() * sizeof(uint32_t));
 	}
@@ -390,8 +396,11 @@ void cloth::DxFactory::extractFabricData(const Fabric& fabric, Range<uint32_t> p
 			uint32_t numWeights = cloth.getNumVirtualParticleWeights();
 
 			Vector<PxVec4>::Type hostWeights(numWeights, PxVec4(0.0f));
-			copyToHost(hostWeights.begin(), dxCloth.mVirtualParticleWeights.mBuffer.mBuffer, 0,
-				hostWeights.size() * sizeof(PxVec4));
+			//copyToHost(hostWeights.begin(), dxCloth.mVirtualParticleWeights.mBuffer.mBuffer, 0,
+			//	hostWeights.size() * sizeof(PxVec4));
+			NV_CLOTH_ASSERT(hostWeights.size() == dxCloth.mVirtualParticleWeights.size());
+			intrinsics::memCopy(hostWeights.begin(), DxCloth::MappedVec4fVectorType(const_cast<DxCloth&>(dxCloth).mVirtualParticleWeights).begin(),
+				destIndices.size() * sizeof(uint32_t));
 
 			// convert weights to Vec3f
 			PxVec3* destIt = reinterpret_cast<PxVec3*>(destWeights.begin());
@@ -408,8 +417,11 @@ void cloth::DxFactory::extractFabricData(const Fabric& fabric, Range<uint32_t> p
 			uint32_t numIndices = cloth.getNumVirtualParticles();
 
 			Vector<Vec4us>::Type hostIndices(numIndices);
-			copyToHost(hostIndices.begin(), dxCloth.mVirtualParticleIndices.mBuffer.mBuffer, 0,
-				hostIndices.size() * sizeof(Vec4us));
+			//copyToHost(hostIndices.begin(), dxCloth.mVirtualParticleIndices.mBuffer.mBuffer, 0,
+			//	hostIndices.size() * sizeof(Vec4us));
+			NV_CLOTH_ASSERT(hostIndices.size() == dxCloth.mVirtualParticleIndices.size());
+			intrinsics::memCopy(hostIndices.begin(), DxCloth::MappedVec4usVectorType(const_cast<DxCloth&>(dxCloth).mVirtualParticleIndices).begin(),
+				destIndices.size() * sizeof(uint32_t));
 
 			// convert indices to 32 bit
 			Vec4u* destIt = reinterpret_cast<Vec4u*>(destIndices.begin());
diff --git a/NvCloth/src/dx/DxFactory.h b/NvCloth/src/dx/DxFactory.h
index 23d5382..d5f985f 100644
--- a/NvCloth/src/dx/DxFactory.h
+++ b/NvCloth/src/dx/DxFactory.h
@@ -35,6 +35,7 @@
 #include "../IndexPair.h"
 #include <foundation/PxVec4.h>
 #include <foundation/PxVec3.h>
+#include "Vec4T.h"
 
 #if _MSC_VER >= 1700
 #pragma warning(disable : 4471)
@@ -81,8 +82,8 @@ protected:
 
 	virtual Cloth* clone(const Cloth& cloth);
 
-	virtual void extractFabricData(const Fabric& fabric, Range<uint32_t> phaseIndices, Range<uint32_t> sets,
-	                               Range<float> restvalues, Range<float> stiffnessValues, Range<uint32_t> indices, Range<uint32_t> anchors,
+	virtual void extractFabricData( const Fabric& fabric, Range<uint32_t> phaseIndices, Range<uint32_t> sets,
+	                                Range<float> restvalues, Range<float> stiffnessValues, Range<uint32_t> indices, Range<uint32_t> anchors,
 	                               Range<float> tetherLengths, Range<uint32_t> triangles) const;
 
 	virtual void extractCollisionData(const Cloth& cloth, Range<physx::PxVec4> spheres, Range<uint32_t> capsules,
@@ -145,6 +146,12 @@ protected:
 
 	DxBatchedStorage<physx::PxVec3> mCollisionTriangles;
 	DxBuffer<physx::PxVec3> mCollisionTrianglesDeviceCopy;
+	DxBatchedStorage<uint32_t> mVirtualParticleSetSizes;
+	DxBuffer<uint32_t> mVirtualParticleSetSizesDeviceCopy;
+	DxBatchedStorage<Vec4us> mVirtualParticleIndices;
+	DxBuffer<Vec4us> mVirtualParticleIndicesDeviceCopy;
+	DxBatchedStorage<physx::PxVec4> mVirtualParticleWeights;
+	DxBuffer<physx::PxVec4> mVirtualParticleWeightsDeviceCopy;
 
 	DxBatchedStorage<physx::PxVec4> mMotionConstraints;
 	DxBatchedStorage<physx::PxVec4> mSeparationConstraints;
@@ -157,6 +164,7 @@ protected:
 	DxBatchedStorage<uint32_t> mSelfCollisionData;
 
 	DxBatchedStorage<uint32_t> mTriangles;
+
 };
 }
 }
diff --git a/NvCloth/src/dx/DxSolver.cpp b/NvCloth/src/dx/DxSolver.cpp
index 66a8d8f..21caa0b 100644
--- a/NvCloth/src/dx/DxSolver.cpp
+++ b/NvCloth/src/dx/DxSolver.cpp
@@ -350,6 +350,9 @@ void cloth::DxSolver::beginFrame()
 	mFactory.mConvexMasksDeviceCopy = mFactory.mConvexMasks.mBuffer;
 	mFactory.mCollisionPlanesDeviceCopy = mFactory.mCollisionPlanes.mBuffer;
 	mFactory.mCollisionTrianglesDeviceCopy = mFactory.mCollisionTriangles.mBuffer;
+	mFactory.mVirtualParticleSetSizesDeviceCopy = mFactory.mVirtualParticleSetSizes.mBuffer;
+	mFactory.mVirtualParticleIndicesDeviceCopy = mFactory.mVirtualParticleIndices.mBuffer;
+	mFactory.mVirtualParticleWeightsDeviceCopy = mFactory.mVirtualParticleWeights.mBuffer;
 //	mFactory.mParticleAccelerations = mFactory.mParticleAccelerationsHostCopy;
 	mFactory.mRestPositionsDeviceCopy = mFactory.mRestPositions.mBuffer;
 }
@@ -369,7 +372,8 @@ void cloth::DxSolver::executeKernel()
 	{
 		context->CSSetShader(mFactory.mSolverKernelComputeShader, NULL, 0);
 
-		ID3D11ShaderResourceView* resourceViews[18] = {
+		// Set shader StructuredBuffer registers t0-t20
+		ID3D11ShaderResourceView* resourceViews[21] = {
 			mClothData.mBuffer.resourceView(), /*mFrameData.mBuffer.resourceView()*/NULL,
 			mIterationData.mBuffer.resourceView(), mFactory.mPhaseConfigs.mBuffer.resourceView(),
 			mFactory.mConstraints.mBuffer.resourceView(), mFactory.mTethers.mBuffer.resourceView(),
@@ -382,10 +386,15 @@ void cloth::DxSolver::executeKernel()
 			mFactory.mRestPositionsDeviceCopy.resourceView(),
 			mFactory.mSelfCollisionIndices.mBuffer.resourceView(),
 			mFactory.mStiffnessValues.mBuffer.resourceView(),
-			mFactory.mTriangles.mBuffer.resourceView()
+			mFactory.mTriangles.mBuffer.resourceView(),
+
+			mFactory.mVirtualParticleSetSizesDeviceCopy.resourceView(),
+			mFactory.mVirtualParticleIndicesDeviceCopy.resourceView(),
+			mFactory.mVirtualParticleWeightsDeviceCopy.resourceView()
 		};
-		context->CSSetShaderResources(0, 18, resourceViews);
+		context->CSSetShaderResources(0, 21, resourceViews);
 
+		// Set shader RWStructuredBuffer registers u0-u4
 		ID3D11UnorderedAccessView* accessViews[4] = {
 			mFactory.mParticles.mBuffer.accessViewRaw(),
 			mFactory.mSelfCollisionParticles.mBuffer.accessView(),
@@ -398,10 +407,10 @@ void cloth::DxSolver::executeKernel()
 
 		context->CSSetShader(NULL, NULL, 0);
 
-		ID3D11ShaderResourceView* resourceViewsNULL[17] = {
-			NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL
+		ID3D11ShaderResourceView* resourceViewsNULL[21] = {
+			NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL
 		};
-		context->CSSetShaderResources(0, 17, resourceViewsNULL);
+		context->CSSetShaderResources(0, 21, resourceViewsNULL);
 		ID3D11UnorderedAccessView* accessViewsNULL[4] = { NULL, NULL, NULL, NULL };
 		context->CSSetUnorderedAccessViews(0, 4, accessViewsNULL, NULL);
 #if 0
diff --git a/NvCloth/src/dx/DxSolverKernel.hlsl b/NvCloth/src/dx/DxSolverKernel.hlsl
index 4d91f4b..9c02468 100644
--- a/NvCloth/src/dx/DxSolverKernel.hlsl
+++ b/NvCloth/src/dx/DxSolverKernel.hlsl
@@ -77,6 +77,10 @@ StructuredBuffer<float> bPerConstraintStiffness : register(t16);
 //Note that the buffer actually contains uint16_t values
 StructuredBuffer<uint32_t> bTriangles : register(t17);
 
+StructuredBuffer<int32_t> bVirtualParticleSetSizes : register(t18);
+StructuredBuffer<uint4>   bVirtualParticleIndices : register(t19);
+StructuredBuffer<float4>  bVirtualParticleWeights : register(t20);
+
 groupshared DxClothData gClothData;
 groupshared DxFrameData gFrameData;
 groupshared DxIterationData gIterData;
@@ -864,7 +868,7 @@ void collideCapsules(IParticles curParticles, IParticles prevParticles, uint32_t
 
 static const float gSkeletonWidth = (1.f - 0.2f) * (1.f - 0.2f) - 1.f;
 
-uint32_t collideCapsules(float3 curPos, float3 prevPos, float alpha, float prevAlpha, out float3 outDelta, out float3 outVelocity)
+uint32_t collideCapsules(float3 curPosAbsolute, float3 prevPosAbsolute, float alpha, float prevAlpha, out float3 outDelta, out float3 outVelocity)
 {
 	outDelta = float3(0.0f, 0.0f, 0.0f);
 	outVelocity = float3(0.0f, 0.0f, 0.0f);
@@ -881,87 +885,87 @@ uint32_t collideCapsules(float3 curPos, float3 prevPos, float alpha, float prevA
 	{
 		IndexPair indices = bCapsuleIndices[capsuleOffset + i];
 
-		// current
+		// load sphere data
 		float4 startSphere0 = bCollisionSpheres[startSphereOffset + indices.first];
 		float4 targetSphere0 = bCollisionSpheres[targetSphereOffset + indices.first];
 
 		float4 startSphere1 = bCollisionSpheres[startSphereOffset + indices.second];
 		float4 targetSphere1 = bCollisionSpheres[targetSphereOffset + indices.second];
 
-		// prev
+		// interpolate sphere data to end of previous iteration
 		float4 prevSphere0 = lerp(startSphere0, targetSphere0, prevAlpha);
 		float4 prevSphere1 = lerp(startSphere1, targetSphere1, prevAlpha);
 
 		prevSphere0.w = max(prevSphere0.w, 0.0f);
 		prevSphere1.w = max(prevSphere1.w, 0.0f);
 
-		float4 prevAxis = (prevSphere1 - prevSphere0) * 0.5f;
 		float3 prevConeCenter = (prevSphere1.xyz + prevSphere0.xyz) * 0.5f;
+		float4 prevHalfAxis = (prevSphere1 - prevSphere0) * 0.5f; //half axis
+		//axis.w = half of radii difference
 
-		float3 prevDelta = prevPos - prevConeCenter;
+		float3 prevPos = prevPosAbsolute - prevConeCenter;
 
-		float prevSqrAxisLength = dot(prevAxis.xyz, prevAxis.xyz);
-		float prevSqrConeLength = prevSqrAxisLength - prevAxis.w * prevAxis.w;
+		float prevSqrAxisHalfLength = dot(prevHalfAxis.xyz, prevHalfAxis.xyz);
+		float prevSqrConeHalfLength = prevSqrAxisHalfLength - prevHalfAxis.w * prevHalfAxis.w;
 
-		if (prevSqrAxisLength <= 0.0f)
+		if (prevSqrConeHalfLength <= 0.0f)
 			continue;
 
-		float prevInvAxisLength = rsqrt(prevSqrAxisLength);
-		float prevInvConeLength = rsqrt(prevSqrConeLength);
+		float prevInvAxisHalfLength = rsqrt(prevSqrAxisHalfLength);
+		float prevInvConeHalfLength = rsqrt(prevSqrConeHalfLength);
 
-		float prevAxisLength = prevSqrAxisLength * prevInvAxisLength;
+		float prevAxisHalfLength = prevSqrAxisHalfLength * prevInvAxisHalfLength;
 
-		float prevConeRadius = (prevAxis.w + prevSphere0.w) * prevInvConeLength * prevAxisLength;
+		float prevConeRadius = (prevHalfAxis.w + prevSphere0.w) * prevInvConeHalfLength * prevAxisHalfLength;
 
-		float3 prevConeAxis = prevAxis.xyz * prevInvAxisLength;
-		float prevConeSlope = prevAxis.w * prevInvConeLength;
+		float3 prevConeAxis = prevHalfAxis.xyz * prevInvAxisHalfLength;
+		float prevConeSlope = prevHalfAxis.w * prevInvConeHalfLength;
 
-		float3 prevCross = cross(prevDelta, prevConeAxis);
+		float3 prevCross = cross(prevPos, prevConeAxis);
 		float prevDot = dot(prevPos, prevConeAxis);
 
-		// current
-		float4 sphere0 = lerp(startSphere0, targetSphere0, alpha);
-		float4 sphere1 = lerp(startSphere1, targetSphere1, alpha);
+		// interpolate sphere data to end of current iteration
+		float4 curSphere0 = lerp(startSphere0, targetSphere0, alpha);
+		float4 curSphere1 = lerp(startSphere1, targetSphere1, alpha);
 
-		sphere0.w = max(sphere0.w, 0.0f);
-		sphere1.w = max(sphere1.w, 0.0f);
+		curSphere0.w = max(curSphere0.w, 0.0f);
+		curSphere1.w = max(curSphere1.w, 0.0f);
 
-		float4 curAxis = (sphere1 - sphere0) * 0.5f;
-		float3 curConeCenter = (sphere1.xyz + sphere0.xyz) * 0.5f;
+		float3 curConeCenter = (curSphere1.xyz + curSphere0.xyz) * 0.5f;
+		float4 curHalfAxis = (curSphere1 - curSphere0) * 0.5f; //half axis
+		//axis.w = half of radii difference
 
-		float3 curDelta = curPos - curConeCenter;
+		float3 curPos = curPosAbsolute - curConeCenter;
 
-		float sqrAxisLength = dot(curAxis.xyz, curAxis.xyz);
-		float sqrConeLength = sqrAxisLength - curAxis.w * curAxis.w;
+		float curSqrAxisHalfLength = dot(curHalfAxis.xyz, curHalfAxis.xyz);
+		float curSqrConeHalfLength = curSqrAxisHalfLength - curHalfAxis.w * curHalfAxis.w;
 
-		if (sqrConeLength <= 0.0f)
+		if(curSqrConeHalfLength <= 0.0f)
 			continue;
 
-		float invAxisLength = rsqrt(sqrAxisLength);
-		float invConeLength = rsqrt(sqrConeLength);
+		float curInvAxisHalfLength = rsqrt(curSqrAxisHalfLength);
+		float curInvConeHalfLength = rsqrt(curSqrConeHalfLength);
 
-		float axisLength = sqrAxisLength * invAxisLength;
+		float curAxisHalfLength = curSqrAxisHalfLength * curInvAxisHalfLength;
 
-		float3 coneCenter = (sphere1.xyz + sphere0.xyz) * 0.5f;
-		float coneRadius = (curAxis.w + sphere0.w) * invConeLength * axisLength;
+		float curConeRadius = (curHalfAxis.w + curSphere0.w) * curInvConeHalfLength * curAxisHalfLength;
 
-		float3 curConeAxis = curAxis.xyz * invAxisLength;
+		float3 curConeAxis = curHalfAxis.xyz * curInvAxisHalfLength;
 
-		float3 curCross = cross(curDelta, curConeAxis);
+		float3 curCross = cross(curPos, curConeAxis);
 		float curDot = dot(curPos, curConeAxis);
 
 		float curSqrDistance = FLT_EPSILON + dot(curCross, curCross);
 
-		float prevRadius = max(prevDot * prevConeSlope + coneRadius, 0.0f);
+		float prevRadius = max(prevDot * prevConeSlope + prevConeRadius, 0.0f);
 
-		float curSlope = curAxis.w * invConeLength;
-		float curRadius = max(curDot * curSlope + coneRadius, 0.0f);
+		float curConeSlope = curHalfAxis.w * curInvConeHalfLength;
+		float curRadius = max(curDot * curConeSlope + curConeRadius, 0.0f);
 
-		float sine = curAxis.w * invAxisLength;
-		float coneSqrCosine = 1.f - sine * sine;
-		float curHalfLength = axisLength;
+		float sine = curHalfAxis.w * curInvAxisHalfLength;
+		float coneSqrCosine = 1.f - (sine * sine);
 
-		float dotPrevPrev = dot(prevCross, prevCross) - prevCross.x * prevCross.x - prevRadius * prevRadius;
+		float dotPrevPrev = dot(prevCross, prevCross) - prevRadius * prevRadius;
 		float dotPrevCur = dot(prevCross, curCross) - prevRadius * curRadius;
 		float dotCurCur = curSqrDistance - curRadius * curRadius;
 
@@ -972,7 +976,7 @@ uint32_t collideCapsules(float3 curPos, float3 prevPos, float alpha, float prevA
 
 		// time of impact or 0 if prevPos inside cone
 		float toi = min(0.0f, halfB + sqrtD) / minusA;
-		bool hasCollision = toi < 1.0f && halfB < sqrtD;
+		bool hasCollision = (toi < 1.0f) && (halfB < sqrtD);
 
 		// skip continuous collision if the (un-clamped) particle
 		// trajectory only touches the outer skin of the cone.
@@ -990,46 +994,29 @@ uint32_t collideCapsules(float3 curPos, float3 prevPos, float alpha, float prevA
 			// interpolate delta at toi
 			float3 pos = prevPos - delta * toi;
 
-	//		float axisLength = sqrAxisLength * invAxisLength;
-
-	//		float curHalfLength = axisLength; // ?
-			float3 curConeAxis = curAxis.xyz * invAxisLength;
-			float3 curScaledAxis = curAxis.xyz * curHalfLength;
-
-			float4 prevAxis = (prevSphere1 - prevSphere0) * 0.5f;
-			float3 prevConeCenter = (prevSphere1.xyz + prevSphere0.xyz) * 0.5f;
-
-			float prevSqrAxisLength = dot(prevAxis.xyz, prevAxis.xyz);
-			float prevSqrConeLength = prevSqrAxisLength - prevAxis.w * prevAxis.w;
-
-			float prevInvAxisLength = rsqrt(prevSqrAxisLength);
-			float prevInvConeLength = rsqrt(prevSqrConeLength);
-
-			float prevAxisLength = prevSqrAxisLength * prevInvAxisLength;
-			float prevConeRadius = (prevAxis.w + prevSphere0.w) * prevInvConeLength * prevAxisLength;
-
-			float3 prevConeAxis = prevAxis.xyz * prevInvAxisLength;
-
-			float prevHalfLength = prevAxisLength;
-
-			float3 deltaScaledAxis = curScaledAxis - prevAxis.xyz * prevHalfLength;
+			float3 curScaledAxis = curConeAxis.xyz * curAxisHalfLength;
+			float3 deltaScaledAxis = curScaledAxis - prevConeAxis * prevAxisHalfLength;
 
 			float oneMinusToi = 1.0f - toi;
 
 			// interpolate axis at toi
 			float3 axis = curScaledAxis - deltaScaledAxis * oneMinusToi;
 
-			float slope = prevConeSlope * oneMinusToi + curSlope * toi;
+			float slope = prevConeSlope * oneMinusToi + curConeSlope * toi;
 
-			float sqrHalfLength = dot(axis, axis); // axisX * axisX + axisY * axisY + axisZ * axisZ;
+			float sqrHalfLength = dot(axis, axis);
 			float invHalfLength = rsqrt(sqrHalfLength);
+			// distance along toi cone axis (from center)
 			float dotf = dot(pos, axis) * invHalfLength;
 
+			//point line distance
 			float sqrDistance = dot(pos, pos) - dotf * dotf;
 			float invDistance = sqrDistance > 0.0f ? rsqrt(sqrDistance) : 0.0f;
 
+			//offset base to take slope in to account
 			float base = dotf + slope * sqrDistance * invDistance;
 			float scale = base * invHalfLength;
+			// use invHalfLength to map base from [-HalfLength, +HalfLength]=inside to [-1, +1] =inside
 
 			if (abs(scale) < 1.0f)
 			{
@@ -1037,99 +1024,49 @@ uint32_t collideCapsules(float3 curPos, float3 prevPos, float alpha, float prevA
 
 				// reduce ccd impulse if (clamped) particle trajectory stays in cone skin,
 				// i.e. scale by exp2(-k) or 1/(1+k) with k = (tmin - toi) / (1 - toi)
-				float minusK = sqrtD / (minusA * oneMinusToi);
+				float minusK = (oneMinusToi > FLT_EPSILON) ? sqrtD / (minusA * oneMinusToi) : 0.0f;
 				oneMinusToi = oneMinusToi / (1.f - minusK);
 
-				curDelta += delta * oneMinusToi;
-
-				curDot = dot(curDelta, curAxis.xyz);
-				float curConeRadius = (curAxis.w + sphere0.w) * invConeLength * axisLength;
+				curPos += delta * oneMinusToi;
 
-				curRadius = max(curDot * curSlope + curConeRadius, 0.0f);    // Duplicate?
-				curSqrDistance = dot(curDelta, curDelta) - curDot * curDot;
+				curDot = dot(curPos, curConeAxis.xyz);
+				curRadius = max(curDot * curConeSlope + curConeRadius, 0.0f);
+				curSqrDistance = dot(curPos, curPos) - curDot * curDot;
 
-				curPos = coneCenter + curDelta;
+				//curPos is relative to coneCenter so we don't need to do this:
+				//curPos = coneCenter + curPos;
 			}
 		}
-
-		{
-			float3 delta = curPos - coneCenter;
-		
-			float deltaDotAxis = dot(delta, curConeAxis);
-			float radius = max(deltaDotAxis * curSlope + coneRadius, 0.0f);
-			float sqrDistance = dot(delta, delta) - deltaDotAxis * deltaDotAxis;
 		
-			if (sqrDistance > radius * radius)
+		{		
+			if (curSqrDistance > curRadius * curRadius)
 				continue;
 		
-			sqrDistance = max(sqrDistance, FLT_EPSILON);
-			float invDistance = rsqrt(sqrDistance);
+			curSqrDistance = max(curSqrDistance, FLT_EPSILON);
+			float invDistance = rsqrt(curSqrDistance);
 		
-			float base = deltaDotAxis + curSlope * sqrDistance * invDistance;
-			float halfLength = axisLength;
-		//	float halfLength = coneHalfLength;
+			float base = curDot + curConeSlope * curSqrDistance * invDistance;
+			float halfLength = curAxisHalfLength;
 		
 			if (abs(base) < halfLength)
 			{
-				delta = delta - base * curAxis;
+				float3 delta = curPos - base * curConeAxis;
 		
 				float sqrCosine = coneSqrCosine;
-				float scale = radius * invDistance * sqrCosine - sqrCosine;
+				float scale = curRadius * invDistance * sqrCosine - sqrCosine;
 		
-				outDelta += delta * scale;
+				outDelta += delta *scale;
 		
 				if (frictionEnabled)
 				{
-					// get previous sphere pos
-					float4 prevSphere0 = lerp(startSphere0, targetSphere0, prevAlpha);
-					float4 prevSphere1 = lerp(startSphere1, targetSphere1, prevAlpha);
-		
 					// interpolate velocity between the two spheres
-					float t = deltaDotAxis * 0.5f + 0.5f;
-					outVelocity += lerp(sphere0.xyz - prevSphere0.xyz, sphere1.xyz - prevSphere1.xyz, t);
+					float t = curDot * 0.5f + 0.5f;
+					outVelocity += lerp(curSphere0.xyz - prevSphere0.xyz, curSphere1.xyz - prevSphere1.xyz, t);
 				}
 		
 				++numCollisions;
 			}
 		}
-
-		// curPos inside cone (discrete collision)
-		bool hasContact = curRadius * curRadius > curSqrDistance;
-
-		if (!hasContact)
-			continue;
-
-		float invDistance = curSqrDistance > 0.0f ? rsqrt(curSqrDistance) : 0.0f;
-		float base = curDot + curSlope * curSqrDistance * invDistance;
-
-	//	float axisLength = sqrAxisLength * invAxisLength;
-
-	//	float halfLength = axisLength; // ?
-	//	float halfLength = coneHalfLength;
-		/*
-		if (abs(base) < halfLength)
-		{
-			float3 delta = curPos - base * curAxis.xyz;
-
-			float sine = axis.w * invAxisLength; // Remove?
-			float sqrCosine = 1.f - sine * sine;
-
-			float scale = curRadius * invDistance * coneSqrCosine - coneSqrCosine;
-
-
-	//		delta += de
-
-			outDelta += delta * scale;
-			
-			if (frictionEnabled)
-			{
-				// interpolate velocity between the two spheres
-				float t = curDot * 0.5f + 0.5f;
-				outVelocity += lerp(sphere0.xyz - prevSphere0.xyz, sphere1.xyz - prevSphere1.xyz, t);
-			}
-
-			++numCollisions;
-		}*/
 	}
 
 	// sphere collision
@@ -1147,8 +1084,8 @@ uint32_t collideCapsules(float3 curPos, float3 prevPos, float alpha, float prevA
 		float prevRadius = prevSphere.w;
 
 		{
-			float3 curDelta = curPos - sphere.xyz;
-			float3 prevDelta = prevPos - prevSphere.xyz;
+			float3 curDelta = curPosAbsolute - sphere.xyz;
+			float3 prevDelta = prevPosAbsolute - prevSphere.xyz;
 
 			float sqrDistance = FLT_EPSILON + dot(curDelta, curDelta);
 
@@ -1186,7 +1123,7 @@ uint32_t collideCapsules(float3 curPos, float3 prevPos, float alpha, float prevA
 				oneMinusToi = oneMinusToi / (1.f - minusK);
 
 				curDelta += delta * oneMinusToi;
-				curPos = sphere.xyz + curDelta;
+				curPosAbsolute = sphere.xyz + curDelta;
 
 				sqrDistance = FLT_EPSILON + dot(curDelta, curDelta);
 			}
@@ -1212,6 +1149,125 @@ uint32_t collideCapsules(float3 curPos, float3 prevPos, float alpha, float prevA
 	return numCollisions;
 }
 
+float3 barycentricLerp(float3 pA, float3 pB, float3 pC, float3 weights)
+{
+	float3 outP;
+	outP.x = pA.x * weights.x + pB.x * weights.y + pC.x * weights.z;
+	outP.y = pA.y * weights.x + pB.y * weights.y + pC.y * weights.z;
+	outP.z = pA.z * weights.x + pB.z * weights.y + pC.z * weights.z;
+	return outP;
+}
+
+void applyVirtualImpulse(IParticles particles, uint3 indices, float3 weights, float3 delta, float scale)
+//apply
+{
+	float4 pA = particles.get(indices.x);
+	float4 pB = particles.get(indices.y);
+	float4 pC = particles.get(indices.z);
+
+	delta *= scale;
+
+	pA.x += delta.x * weights.x;	pB.x += delta.x * weights.y;	pC.x += delta.x * weights.z;
+	pA.y += delta.y * weights.x;	pB.y += delta.y * weights.y;	pC.y += delta.y * weights.z;
+	pA.z += delta.z * weights.x;	pB.z += delta.z * weights.y;	pC.z += delta.z * weights.z;
+
+	//if(particle is not static) store new position
+	if(pA.w != 0.0f) particles.set(indices.x, pA);
+	if(pB.w != 0.0f) particles.set(indices.y, pB);
+	if(pC.w != 0.0f) particles.set(indices.z, pC);
+}
+
+void collideVirtualCapsules(IParticles curParticles, IParticles prevParticles, uint32_t threadIdx, float alpha, float prevAlpha)
+{
+	uint virtualParticleSetSizesCount = gClothData.mNumVirtualParticleSetSizes;
+	
+	if(virtualParticleSetSizesCount == 0)
+		return;
+
+	bool frictionEnabled = gClothData.mFrictionScale > 0.0f;
+	bool massScaleEnabled = gClothData.mCollisionMassScale > 0.0f;
+
+	if(gClothData.mEnableContinuousCollision)
+	{
+		//???
+	}
+
+	uint indicesEnd = 0;
+	for(uint vpSetSizeIt = 0; vpSetSizeIt < virtualParticleSetSizesCount; vpSetSizeIt++)
+	{
+		GroupMemoryBarrierWithGroupSync();
+		uint indicesIt = indicesEnd + threadIdx * 4;
+		for(indicesEnd += bVirtualParticleSetSizes[vpSetSizeIt]*4; indicesIt < indicesEnd; indicesIt += blockDim * 4)
+		{
+			uint4 indices;
+			{
+				uint4 tmp = bVirtualParticleIndices[indicesIt>>1];
+				if(indicesIt & 1 == 1)
+				{
+					indices.x = tmp.z & 0xffff;
+					indices.y = tmp.z >> 16;
+					indices.z = tmp.w & 0xffff;
+					indices.w = tmp.w >> 16;
+				}
+				else
+				{
+					indices.x = tmp.x & 0xffff;
+					indices.y = tmp.x >> 16;
+					indices.z = tmp.y & 0xffff;
+					indices.w = tmp.y >> 16;
+				}
+			}
+
+			float4 weights = bVirtualParticleWeights[indices.w];
+
+			float3 curPos = barycentricLerp(curParticles.get(indices.x).xyz, curParticles.get(indices.y).xyz, curParticles.get(indices.z).xyz, weights.xyz);
+
+			float3 delta, velocity;
+			uint32_t numCollisions = collideCapsules(curPos.xyz, alpha, prevAlpha, delta, velocity);
+			if(numCollisions > 0)
+			{
+				float scale = 1.0f / (float)numCollisions;
+				float wscale = weights.w * scale;
+
+				applyVirtualImpulse(curParticles, indices.xyz, weights.xyz, delta, scale);
+
+				if(frictionEnabled)
+				{
+					float3 prevPos = barycentricLerp(prevParticles.get(indices.x).xyz, prevParticles.get(indices.y).xyz, prevParticles.get(indices.z).xyz, weights.xyz);
+					
+					float3 frictionImpulse =
+						calcFrictionImpulse(prevPos.xyz, curPos.xyz, velocity, scale, delta);
+
+					applyVirtualImpulse(prevParticles, indices.xyz, weights.xyz, frictionImpulse, -weights.w);
+				}
+
+				if(massScaleEnabled)
+				{
+					float deltaLengthSq = dot(delta.xyz, delta.xyz) * scale * scale;
+					float invMassScale = 1.0f / (1.0f + gClothData.mCollisionMassScale * deltaLengthSq);
+
+					// not multiplying by weights[3] here because unlike applying velocity
+					// deltas where we want the interpolated position to obtain a particular
+					// value, we instead just require that the total change is equal to invMassScale
+					invMassScale = invMassScale - 1.0f;
+
+					float4 pA = curParticles.get(indices.x);
+					float4 pB = curParticles.get(indices.y);
+					float4 pC = curParticles.get(indices.z);
+
+					pA.w *= 1.0f + weights.x * invMassScale;
+					pB.w *= 1.0f + weights.y * invMassScale;
+					pC.w *= 1.0f + weights.z * invMassScale;
+
+					curParticles.set(indices.x,pA); 
+					curParticles.set(indices.y,pB); 
+					curParticles.set(indices.z,pC);
+				}
+			}
+		}
+	}
+}
+
 void collideContinuousCapsules(IParticles curParticles, IParticles prevParticles, uint32_t threadIdx, float alpha, float prevAlpha)
 {
 	bool frictionEnabled = gClothData.mFrictionScale > 0.0f;
@@ -1261,6 +1317,7 @@ void collideParticles(IParticles curParticles, IParticles prevParticles, uint32_
 		collideContinuousCapsules(curParticles, prevParticles, threadIdx, alpha, prevAlpha);
 	else
 		collideCapsules(curParticles, prevParticles, threadIdx, alpha, prevAlpha);
+	collideVirtualCapsules(curParticles, prevParticles, threadIdx, alpha, prevAlpha);
 }
 
 void constrainSeparation(IParticles curParticles, uint32_t threadIdx, float alpha)
diff --git a/NvCloth/src/scalar/SwCollisionHelpers.h b/NvCloth/src/scalar/SwCollisionHelpers.h
index 3ab756f..c86d939 100644
--- a/NvCloth/src/scalar/SwCollisionHelpers.h
+++ b/NvCloth/src/scalar/SwCollisionHelpers.h
@@ -69,13 +69,17 @@ struct Gather<Scalar4i>
 
 Gather<Scalar4i>::Gather(const Scalar4i& index)
 {
+	//index are grid positions
+
 	uint32_t mask = /* sGridSize */ 8 - 1;
 
+	// Get grid index (forced within range)
 	mIndex.u4[0] = index.u4[0] & mask;
 	mIndex.u4[1] = index.u4[1] & mask;
 	mIndex.u4[2] = index.u4[2] & mask;
 	mIndex.u4[3] = index.u4[3] & mask;
 
+	// true (filled with all ones = -1) when gridSize <= index || index < 0
 	mOutOfRange.i4[0] = index.u4[0] & ~mask ? 0 : -1;
 	mOutOfRange.i4[1] = index.u4[1] & ~mask ? 0 : -1;
 	mOutOfRange.i4[2] = index.u4[2] & ~mask ? 0 : -1;
@@ -84,6 +88,7 @@ Gather<Scalar4i>::Gather(const Scalar4i& index)
 
 Scalar4i Gather<Scalar4i>::operator()(const Scalar4i* ptr) const
 {
+	//ptr points to the cone/sphere grid
 	const int32_t* base = ptr->i4;
 	const int32_t* index = mIndex.i4;
 	const int32_t* mask = mOutOfRange.i4;
diff --git a/NvCloth/src/sse2/SwCollisionHelpers.h b/NvCloth/src/sse2/SwCollisionHelpers.h
index b759868..4882818 100644
--- a/NvCloth/src/sse2/SwCollisionHelpers.h
+++ b/NvCloth/src/sse2/SwCollisionHelpers.h
@@ -76,22 +76,36 @@ Simd4i horizontalOr(const Simd4i& mask)
 
 Gather<Simd4i>::Gather(const Simd4i& index)
 {
-	mSelectQ = _mm_srai_epi32(index << 29, 31);
-	mSelectD = _mm_srai_epi32(index << 30, 31);
-	mSelectW = _mm_srai_epi32(index << 31, 31);
-	mOutOfRange = (index ^ sIntSignBit) > sSignedMask;
+	//index are grid positions
+	//grid has 8 spaces, grid indices that are within rage use 3 bits (0x7 mask)
+	mSelectQ = _mm_srai_epi32(index << 29, 31); //expand bit 0x4 to the whole register
+	mSelectD = _mm_srai_epi32(index << 30, 31); //expand bit 0x2 to the whole register
+	mSelectW = _mm_srai_epi32(index << 31, 31); //expand the least significant bit (0x1) to the whole register
+	mOutOfRange = (index ^ sIntSignBit) > sSignedMask; //true if index is outside the grid = (index > 0x7 || index < 0x0)
 }
 
 Simd4i Gather<Simd4i>::operator()(const Simd4i* ptr) const
 {
+	//ptr points to the cone/sphere grid
+
 	// more efficient with _mm_shuffle_epi8 (SSSE3)
+
+	//lo contains the bottom 4 grid cells, heigh the top 4
 	Simd4i lo = ptr[0], hi = ptr[1];
+
+	//select correct cell in the grid using binary decision tree
+	//      m  =         select ?     a        :    b
+	//first bit (even/uneven cells)
 	Simd4i m01 = select(mSelectW, splat<1>(lo), splat<0>(lo));
 	Simd4i m23 = select(mSelectW, splat<3>(lo), splat<2>(lo));
 	Simd4i m45 = select(mSelectW, splat<1>(hi), splat<0>(hi));
 	Simd4i m67 = select(mSelectW, splat<3>(hi), splat<2>(hi));
+
+	//second bit
 	Simd4i m0123 = select(mSelectD, m23, m01);
 	Simd4i m4567 = select(mSelectD, m67, m45);
+
+	// third bit							force result zero if it is out of range
 	return select(mSelectQ, m4567, m0123) & ~mOutOfRange;
 }
author	Marijn Tamis <[email protected]>	2018-05-03 18:22:48 +0200
committer	Marijn Tamis <[email protected]>	2018-05-03 18:22:48 +0200
commit	ca32c59a58d37c1822e185a2d5f3d0d3e8943593 (patch)
tree	b06b9eec03f34344ef8fc31aa147b2714d3962ee /NvCloth/src
parent	Forced rename of platform folders in cmake dir. Git didn't pick this up before. (diff)
download	nvcloth-ca32c59a58d37c1822e185a2d5f3d0d3e8943593.tar.xz nvcloth-ca32c59a58d37c1822e185a2d5f3d0d3e8943593.zip