diff options
Diffstat (limited to 'NvCloth/src')
26 files changed, 404 insertions, 236 deletions
diff --git a/NvCloth/src/BoundingBox.h b/NvCloth/src/BoundingBox.h index 74bc0ff..ea84d52 100644 --- a/NvCloth/src/BoundingBox.h +++ b/NvCloth/src/BoundingBox.h @@ -37,26 +37,26 @@ namespace nv namespace cloth { -template <typename Simd4f> +template <typename T4f> struct BoundingBox { - Simd4f mLower; - Simd4f mUpper; + T4f mLower; + T4f mUpper; }; -template <typename Simd4f> -inline BoundingBox<Simd4f> loadBounds(const float* ptr) +template <typename T4f> +inline BoundingBox<T4f> loadBounds(const float* ptr) { - BoundingBox<Simd4f> result; + BoundingBox<T4f> result; result.mLower = load(ptr); result.mUpper = load(ptr + 3); return result; } -template <typename Simd4f> -inline BoundingBox<Simd4f> emptyBounds() +template <typename T4f> +inline BoundingBox<T4f> emptyBounds() { - BoundingBox<Simd4f> result; + BoundingBox<T4f> result; result.mLower = gSimd4fFloatMax; result.mUpper = -result.mLower; @@ -64,10 +64,10 @@ inline BoundingBox<Simd4f> emptyBounds() return result; } -template <typename Simd4f> -inline BoundingBox<Simd4f> expandBounds(const BoundingBox<Simd4f>& bounds, const Simd4f* pIt, const Simd4f* pEnd) +template <typename T4f> +inline BoundingBox<T4f> expandBounds(const BoundingBox<T4f>& bounds, const T4f* pIt, const T4f* pEnd) { - BoundingBox<Simd4f> result = bounds; + BoundingBox<T4f> result = bounds; for (; pIt != pEnd; ++pIt) { result.mLower = min(result.mLower, *pIt); @@ -76,26 +76,26 @@ inline BoundingBox<Simd4f> expandBounds(const BoundingBox<Simd4f>& bounds, const return result; } -template <typename Simd4f> -inline BoundingBox<Simd4f> expandBounds(const BoundingBox<Simd4f>& a, const BoundingBox<Simd4f>& b) +template <typename T4f> +inline BoundingBox<T4f> expandBounds(const BoundingBox<T4f>& a, const BoundingBox<T4f>& b) { - BoundingBox<Simd4f> result; + BoundingBox<T4f> result; result.mLower = min(a.mLower, b.mLower); result.mUpper = max(a.mUpper, b.mUpper); return result; } -template <typename Simd4f> -inline BoundingBox<Simd4f> 
intersectBounds(const BoundingBox<Simd4f>& a, const BoundingBox<Simd4f>& b) +template <typename T4f> +inline BoundingBox<T4f> intersectBounds(const BoundingBox<T4f>& a, const BoundingBox<T4f>& b) { - BoundingBox<Simd4f> result; + BoundingBox<T4f> result; result.mLower = max(a.mLower, b.mLower); result.mUpper = min(a.mUpper, b.mUpper); return result; } -template <typename Simd4f> -inline bool isEmptyBounds(const BoundingBox<Simd4f>& a) +template <typename T4f> +inline bool isEmptyBounds(const BoundingBox<T4f>& a) { return anyGreater(a.mLower, a.mUpper) != 0; } diff --git a/NvCloth/src/ClothClone.h b/NvCloth/src/ClothClone.h index 386fee6..7145da5 100644 --- a/NvCloth/src/ClothClone.h +++ b/NvCloth/src/ClothClone.h @@ -29,12 +29,12 @@ #pragma once -#include "../SwFactory.h" -#include "../SwFabric.h" -#include "../SwCloth.h" +#include "SwFactory.h" +#include "SwFabric.h" +#include "SwCloth.h" -#include "../ClothImpl.h" -#include "../ClothBase.h" +#include "ClothImpl.h" +#include "ClothBase.h" #include "NvCloth/Allocator.h" namespace nv diff --git a/NvCloth/src/ClothImpl.h b/NvCloth/src/ClothImpl.h index 4d7b28d..24f7732 100644 --- a/NvCloth/src/ClothImpl.h +++ b/NvCloth/src/ClothImpl.h @@ -1220,7 +1220,7 @@ inline float ClothImpl<T>::getLiftCoefficient() const template <typename T> inline void ClothImpl<T>::setFluidDensity(float fluidDensity) { - NV_CLOTH_ASSERT(fluidDensity < 0.f); + NV_CLOTH_ASSERT(fluidDensity > 0.f); if (fluidDensity == mFluidDensity) return; diff --git a/NvCloth/src/IterationState.h b/NvCloth/src/IterationState.h index 224e87e..e18b636 100644 --- a/NvCloth/src/IterationState.h +++ b/NvCloth/src/IterationState.h @@ -72,21 +72,21 @@ inline physx::PxQuat exp(const physx::PxVec3& v) return physx::PxQuat(v.x * scale, v.y * scale, v.z * scale, physx::PxCos(theta)); } -template <typename Simd4f, uint32_t N> -inline void assign(Simd4f (&columns)[N], const physx::PxMat44& matrix) +template <typename T4f, uint32_t N> +inline void assign(T4f (&columns)[N], 
const physx::PxMat44& matrix) { for (uint32_t i = 0; i < N; ++i) columns[i] = load(nv::cloth::array(matrix[i])); } -template <typename Simd4f> -inline Simd4f transform(const Simd4f (&columns)[3], const Simd4f& vec) +template <typename T4f> +inline T4f transform(const T4f (&columns)[3], const T4f& vec) { return splat<0>(vec) * columns[0] + splat<1>(vec) * columns[1] + splat<2>(vec) * columns[2]; } -template <typename Simd4f> -inline Simd4f transform(const Simd4f (&columns)[3], const Simd4f& translate, const Simd4f& vec) +template <typename T4f> +inline T4f transform(const T4f (&columns)[3], const T4f& translate, const T4f& vec) { return translate + splat<0>(vec) * columns[0] + splat<1>(vec) * columns[1] + splat<2>(vec) * columns[2]; } @@ -99,17 +99,17 @@ struct IterationStateFactory template <typename MyCloth> IterationStateFactory(MyCloth& cloth, float frameDt); - template <typename Simd4f, typename MyCloth> - IterationState<Simd4f> create(MyCloth const& cloth) const; + template <typename T4f, typename MyCloth> + IterationState<T4f> create(MyCloth const& cloth) const; - template <typename Simd4f> - static Simd4f lengthSqr(Simd4f const& v) + template <typename T4f> + static T4f lengthSqr(T4f const& v) { return dot3(v, v); } - template <typename Simd4f> - static physx::PxVec3 castToPxVec3(const Simd4f& v) + template <typename T4f> + static physx::PxVec3 castToPxVec3(const T4f& v) { return *reinterpret_cast<const physx::PxVec3*>(reinterpret_cast<const char*>(&v)); } @@ -123,7 +123,7 @@ struct IterationStateFactory }; /* solver iterations helper functor */ -template <typename Simd4f> +template <typename T4f> struct IterationState { // call after each iteration @@ -133,15 +133,15 @@ struct IterationState inline float getPreviousAlpha() const; public: - Simd4f mRotationMatrix[3]; // should rename to 'mRotation' + T4f mRotationMatrix[3]; // should rename to 'mRotation' - Simd4f mCurBias; // in local space - Simd4f mPrevBias; // in local space - Simd4f mWind; // delta 
position per iteration (wind velocity * mIterDt) + T4f mCurBias; // in local space + T4f mPrevBias; // in local space + T4f mWind; // delta position per iteration (wind velocity * mIterDt) - Simd4f mPrevMatrix[3]; - Simd4f mCurMatrix[3]; - Simd4f mDampScaleUpdate; + T4f mPrevMatrix[3]; + T4f mCurMatrix[3]; + T4f mDampScaleUpdate; // iteration counter uint32_t mRemainingIterations; @@ -157,14 +157,14 @@ struct IterationState } // namespace cloth -template <typename Simd4f> -inline float cloth::IterationState<Simd4f>::getCurrentAlpha() const +template <typename T4f> +inline float cloth::IterationState<T4f>::getCurrentAlpha() const { return getPreviousAlpha() + mInvNumIterations; } -template <typename Simd4f> -inline float cloth::IterationState<Simd4f>::getPreviousAlpha() const +template <typename T4f> +inline float cloth::IterationState<T4f>::getPreviousAlpha() const { return 1.0f - mRemainingIterations * mInvNumIterations; } @@ -232,36 +232,36 @@ If you change anything in this function, make sure that ClothCustomFloating and ClothInertia haven't regressed for any choice of solver frequency. 
*/ -template <typename Simd4f, typename MyCloth> -cloth::IterationState<Simd4f> cloth::IterationStateFactory::create(MyCloth const& cloth) const +template <typename T4f, typename MyCloth> +cloth::IterationState<T4f> cloth::IterationStateFactory::create(MyCloth const& cloth) const { - IterationState<Simd4f> result; + IterationState<T4f> result; result.mRemainingIterations = static_cast<uint32_t>(mNumIterations); result.mInvNumIterations = mInvNumIterations; result.mIterDt = mIterDt; - Simd4f curLinearVelocity = load(array(cloth.mLinearVelocity)); - Simd4f prevLinearVelocity = load(array(mPrevLinearVelocity)); + T4f curLinearVelocity = load(array(cloth.mLinearVelocity)); + T4f prevLinearVelocity = load(array(mPrevLinearVelocity)); - Simd4f iterDt = simd4f(mIterDt); - Simd4f dampExponent = simd4f(cloth.mStiffnessFrequency) * iterDt; + T4f iterDt = simd4f(mIterDt); + T4f dampExponent = simd4f(cloth.mStiffnessFrequency) * iterDt; - Simd4f translation = iterDt * curLinearVelocity; + T4f translation = iterDt * curLinearVelocity; // gravity delta per iteration - Simd4f gravity = load(array(cloth.mGravity)) * static_cast<Simd4f>(simd4f(sqr(mIterDtAverage))); + T4f gravity = load(array(cloth.mGravity)) * static_cast<T4f>(simd4f(sqr(mIterDtAverage))); // scale of local particle velocity per iteration - Simd4f dampScale = exp2(load(array(cloth.mLogDamping)) * dampExponent); + T4f dampScale = exp2(load(array(cloth.mLogDamping)) * dampExponent); // adjust for the change in time step during the first iteration - Simd4f firstDampScale = dampScale * simd4f(mIterDtRatio); + T4f firstDampScale = dampScale * simd4f(mIterDtRatio); // portion of negative frame velocity to transfer to particle - Simd4f linearDrag = (gSimd4fOne - exp2(load(array(cloth.mLinearLogDrag)) * dampExponent)) * translation; + T4f linearDrag = (gSimd4fOne - exp2(load(array(cloth.mLinearLogDrag)) * dampExponent)) * translation; // portion of frame acceleration to transfer to particle - Simd4f linearInertia = 
load(array(cloth.mLinearInertia)) * iterDt * (prevLinearVelocity - curLinearVelocity); + T4f linearInertia = load(array(cloth.mLinearInertia)) * iterDt * (prevLinearVelocity - curLinearVelocity); // for inertia, we want to violate newton physics to // match velocity and position as given by the user, which means: @@ -271,13 +271,13 @@ cloth::IterationState<Simd4f> cloth::IterationStateFactory::create(MyCloth const // specifically, the portion is alpha=(n+1)/2n and 1-alpha. float linearAlpha = (mNumIterations + 1) * 0.5f * mInvNumIterations; - Simd4f curLinearInertia = linearInertia * simd4f(linearAlpha); + T4f curLinearInertia = linearInertia * simd4f(linearAlpha); // rotate to local space (use mRotationMatrix temporarily to hold matrix) physx::PxMat44 invRotation = physx::PxMat44(mCurrentRotation.getConjugate()); assign(result.mRotationMatrix, invRotation); - Simd4f maskXYZ = simd4f(simd4i(~0, ~0, ~0, 0)); + T4f maskXYZ = simd4f(simd4i(~0, ~0, ~0, 0)); // Previously, we split the bias between previous and current position to // get correct disretized position and velocity. However, this made a @@ -286,23 +286,23 @@ cloth::IterationState<Simd4f> cloth::IterationStateFactory::create(MyCloth const // timesteps. Instead, we now apply the entire bias to current position // and accept a less noticeable error for a free falling cloth. 
- Simd4f bias = gravity - linearDrag; + T4f bias = gravity - linearDrag; result.mCurBias = transform(result.mRotationMatrix, curLinearInertia + bias) & maskXYZ; result.mPrevBias = transform(result.mRotationMatrix, linearInertia - curLinearInertia) & maskXYZ; - Simd4f wind = load(array(cloth.mWind)) * iterDt; // multiply with delta time here already so we don't have to do it inside the solver + T4f wind = load(array(cloth.mWind)) * iterDt; // multiply with delta time here already so we don't have to do it inside the solver result.mWind = transform(result.mRotationMatrix, translation - wind) & maskXYZ; result.mIsTurning = mPrevAngularVelocity.magnitudeSquared() + cloth.mAngularVelocity.magnitudeSquared() > 0.0f; if (result.mIsTurning) { - Simd4f curAngularVelocity = load(array(invRotation.rotate(cloth.mAngularVelocity))); - Simd4f prevAngularVelocity = load(array(invRotation.rotate(mPrevAngularVelocity))); + T4f curAngularVelocity = load(array(invRotation.rotate(cloth.mAngularVelocity))); + T4f prevAngularVelocity = load(array(invRotation.rotate(mPrevAngularVelocity))); // rotation for one iteration in local space - Simd4f curInvAngle = -iterDt * curAngularVelocity; - Simd4f prevInvAngle = -iterDt * prevAngularVelocity; + T4f curInvAngle = -iterDt * curAngularVelocity; + T4f prevInvAngle = -iterDt * prevAngularVelocity; physx::PxQuat curInvRotation = exp(castToPxVec3(curInvAngle)); physx::PxQuat prevInvRotation = exp(castToPxVec3(prevInvAngle)); @@ -312,17 +312,17 @@ cloth::IterationState<Simd4f> cloth::IterationStateFactory::create(MyCloth const assign(result.mRotationMatrix, curMatrix); - Simd4f angularDrag = gSimd4fOne - exp2(load(array(cloth.mAngularLogDrag)) * dampExponent); - Simd4f centrifugalInertia = load(array(cloth.mCentrifugalInertia)); - Simd4f angularInertia = load(array(cloth.mAngularInertia)); - Simd4f angularAcceleration = curAngularVelocity - prevAngularVelocity; + T4f angularDrag = gSimd4fOne - exp2(load(array(cloth.mAngularLogDrag)) * 
dampExponent); + T4f centrifugalInertia = load(array(cloth.mCentrifugalInertia)); + T4f angularInertia = load(array(cloth.mAngularInertia)); + T4f angularAcceleration = curAngularVelocity - prevAngularVelocity; - Simd4f epsilon = simd4f(sqrtf(FLT_MIN)); // requirement: sqr(epsilon) > 0 - Simd4f velocityLengthSqr = lengthSqr(curAngularVelocity) + epsilon; - Simd4f dragLengthSqr = lengthSqr(Simd4f(curAngularVelocity * angularDrag)) + epsilon; - Simd4f centrifugalLengthSqr = lengthSqr(Simd4f(curAngularVelocity * centrifugalInertia)) + epsilon; - Simd4f accelerationLengthSqr = lengthSqr(angularAcceleration) + epsilon; - Simd4f inertiaLengthSqr = lengthSqr(Simd4f(angularAcceleration * angularInertia)) + epsilon; + T4f epsilon = simd4f(sqrtf(FLT_MIN)); // requirement: sqr(epsilon) > 0 + T4f velocityLengthSqr = lengthSqr(curAngularVelocity) + epsilon; + T4f dragLengthSqr = lengthSqr(T4f(curAngularVelocity * angularDrag)) + epsilon; + T4f centrifugalLengthSqr = lengthSqr(T4f(curAngularVelocity * centrifugalInertia)) + epsilon; + T4f accelerationLengthSqr = lengthSqr(angularAcceleration) + epsilon; + T4f inertiaLengthSqr = lengthSqr(T4f(angularAcceleration * angularInertia)) + epsilon; float dragScale = array(rsqrt(velocityLengthSqr * dragLengthSqr) * dragLengthSqr)[0]; float inertiaScale = @@ -337,11 +337,11 @@ cloth::IterationState<Simd4f> cloth::IterationStateFactory::create(MyCloth const inertiaScale; // slightly better in ClothCustomFloating than curInvAngle alone - Simd4f centrifugalVelocity = (prevInvAngle + curInvAngle) * simd4f(0.5f); - const Simd4f data = lengthSqr(centrifugalVelocity); + T4f centrifugalVelocity = (prevInvAngle + curInvAngle) * simd4f(0.5f); + const T4f data = lengthSqr(centrifugalVelocity); float centrifugalSqrLength = array(data)[0] * centrifugalScale; - Simd4f coriolisVelocity = centrifugalVelocity * simd4f(centrifugalScale); + T4f coriolisVelocity = centrifugalVelocity * simd4f(centrifugalScale); physx::PxMat33 coriolisMatrix = 
physx::shdfnd::star(castToPxVec3(coriolisVelocity)); const float* dampScalePtr = array(firstDampScale); @@ -369,7 +369,7 @@ cloth::IterationState<Simd4f> cloth::IterationStateFactory::create(MyCloth const } else { - Simd4f minusOne = -static_cast<Simd4f>(gSimd4fOne); + T4f minusOne = -static_cast<T4f>(gSimd4fOne); result.mRotationMatrix[0] = minusOne; result.mPrevMatrix[0] = select(maskXYZ, firstDampScale, minusOne); } @@ -380,8 +380,8 @@ cloth::IterationState<Simd4f> cloth::IterationStateFactory::create(MyCloth const return result; } -template <typename Simd4f> -void cloth::IterationState<Simd4f>::update() +template <typename T4f> +void cloth::IterationState<T4f>::update() { if (mIsTurning) { diff --git a/NvCloth/src/NvSimd/NvSimdTypes.h b/NvCloth/src/NvSimd/NvSimdTypes.h index 0625332..dd94b40 100644 --- a/NvCloth/src/NvSimd/NvSimdTypes.h +++ b/NvCloth/src/NvSimd/NvSimdTypes.h @@ -104,7 +104,7 @@ void foo(const float* ptr) #define NV_SIMD_INLINE_ASSEMBLER 1 #endif -/*! \def NV_SIMD_USE_NAMESPACE +/*! \def NV_CLOTH_NO_SIMD_NAMESPACE * \brief Set to 1 to define the SIMD library types and functions inside the nvidia::simd namespace. * By default, the types and functions defined in this header live in the global namespace. * This is because MSVC (prior to version 12, Visual Studio 2013) does an inferior job at optimizing @@ -116,11 +116,11 @@ void foo(const float* ptr) * __m128i are wrapped into structs. Arguments need to be passed by reference in this mode. 
* \see NV_SIMD_VECTORCALL, Simd4fArg */ -#if defined NV_SIMD_USE_NAMESPACE&& NV_SIMD_USE_NAMESPACE +#ifndef NV_CLOTH_NO_SIMD_NAMESPACE #define NV_SIMD_NAMESPACE_BEGIN \ namespace nv \ { \ - namespace simd \ + namespace cloth \ { #define NV_SIMD_NAMESPACE_END \ } \ diff --git a/NvCloth/src/PointInterpolator.h b/NvCloth/src/PointInterpolator.h index b9db131..75e1dcf 100644 --- a/NvCloth/src/PointInterpolator.h +++ b/NvCloth/src/PointInterpolator.h @@ -37,7 +37,7 @@ namespace cloth { // acts as a poor mans random access iterator -template <typename Simd4f, typename BaseIterator> +template <typename T4f, typename BaseIterator> class LerpIterator { @@ -50,12 +50,12 @@ class LerpIterator } // return the interpolated point at a given index - inline Simd4f operator[](size_t index) const + inline T4f operator[](size_t index) const { return mStart[index] + (mTarget[index] - mStart[index]) * mAlpha; } - inline Simd4f operator*() const + inline T4f operator*() const { return (*this)[0]; } @@ -70,13 +70,13 @@ class LerpIterator private: // interpolation parameter - const Simd4f mAlpha; + const T4f mAlpha; BaseIterator mStart; BaseIterator mTarget; }; -template <typename Simd4f, size_t Stride> +template <typename T4f, size_t Stride> class UnalignedIterator { @@ -87,12 +87,12 @@ class UnalignedIterator { } - inline Simd4f operator[](size_t index) const + inline T4f operator[](size_t index) const { return load(mPointer + index * Stride); } - inline Simd4f operator*() const + inline T4f operator*() const { return (*this)[0]; } @@ -109,15 +109,15 @@ class UnalignedIterator }; // acts as an iterator but returns a constant -template <typename Simd4f> +template <typename T4f> class ConstantIterator { public: - ConstantIterator(const Simd4f& value) : mValue(value) + ConstantIterator(const T4f& value) : mValue(value) { } - inline Simd4f operator*() const + inline T4f operator*() const { return mValue; } @@ -129,20 +129,20 @@ class ConstantIterator private: ConstantIterator& operator = 
(const ConstantIterator&); - const Simd4f mValue; + const T4f mValue; }; // wraps an iterator with constant scale and bias -template <typename Simd4f, typename BaseIterator> +template <typename T4f, typename BaseIterator> class ScaleBiasIterator { public: - ScaleBiasIterator(BaseIterator base, const Simd4f& scale, const Simd4f& bias) + ScaleBiasIterator(BaseIterator base, const T4f& scale, const T4f& bias) : mScale(scale), mBias(bias), mBaseIterator(base) { } - inline Simd4f operator*() const + inline T4f operator*() const { return (*mBaseIterator) * mScale + mBias; } @@ -156,8 +156,8 @@ class ScaleBiasIterator private: ScaleBiasIterator& operator = (const ScaleBiasIterator&); - const Simd4f mScale; - const Simd4f mBias; + const T4f mScale; + const T4f mBias; BaseIterator mBaseIterator; }; diff --git a/NvCloth/src/SwCollision.cpp b/NvCloth/src/SwCollision.cpp index 89df8a5..0aa196d 100644 --- a/NvCloth/src/SwCollision.cpp +++ b/NvCloth/src/SwCollision.cpp @@ -40,6 +40,7 @@ using namespace nv; using namespace physx; +using namespace cloth; // the particle trajectory needs to penetrate more than 0.2 * radius to trigger continuous collision template <typename T4f> @@ -160,31 +161,41 @@ void generateCones(cloth::ConeData* dst, const cloth::SphereData* sourceSpheres, cloth::ConeData* cIt = dst; for (const cloth::IndexPair* iIt = capsuleIndices, *iEnd = iIt + numCones; iIt != iEnd; ++iIt, ++cIt) { + // w element contains sphere radii PxVec4 first = reinterpret_cast<const PxVec4&>(sourceSpheres[iIt->first]); PxVec4 second = reinterpret_cast<const PxVec4&>(sourceSpheres[iIt->second]); PxVec4 center = (second + first) * 0.5f; - PxVec4 axis = (second - first) * 0.5f; + PxVec4 axis = (second - first) * 0.5f; //half axis + //axis.w = half of radii difference - float sqrAxisLength = axis.x * axis.x + axis.y * axis.y + axis.z * axis.z; - float sqrConeLength = sqrAxisLength - cloth::sqr(axis.w); + // |Axis|^2 + float sqrAxisHalfLength = axis.x * axis.x + axis.y * axis.y + axis.z 
* axis.z; - float invAxisLength = 1 / sqrtf(sqrAxisLength); - float invConeLength = 1 / sqrtf(sqrConeLength); + // http://jwilson.coe.uga.edu/emt669/Student.Folders/Kertscher.Jeff/Essay.3/Tangents.html + // |Axis|^2 = |Cone|^2 + (sphere2Radius-sphere1Radius)^2 + float sqrConeHalfLength = sqrAxisHalfLength - cloth::sqr(axis.w); - if (sqrConeLength <= 0.0f) - invAxisLength = invConeLength = 0.0f; + float invAxisHalfLength = 1 / sqrtf(sqrAxisHalfLength); + float invConeHalfLength = 1 / sqrtf(sqrConeHalfLength); - float axisLength = sqrAxisLength * invAxisLength; - float slope = axis.w * invConeLength; + if (sqrConeHalfLength <= 0.0f) + invAxisHalfLength = invConeHalfLength = 0.0f; + + float axisHalfLength = sqrAxisHalfLength * invAxisHalfLength; + float slope = axis.w * invConeHalfLength; cIt->center = PxVec3(center.x, center.y, center.z ); - cIt->radius = (axis.w + first.w) * invConeLength * axisLength; - cIt->axis = PxVec3(axis.x, axis.y, axis.z) * invAxisLength; + cIt->radius = (axis.w + first.w) * invConeHalfLength * axisHalfLength; //cone radius in the center + cIt->axis = PxVec3(axis.x, axis.y, axis.z) * invAxisHalfLength; cIt->slope = slope; - cIt->sqrCosine = 1.0f - cloth::sqr(axis.w * invAxisLength); - cIt->halfLength = axisLength; + // cos()^2 = 1.0 - (radius difference / axis length)^2 + // cos()^2 = 1.0 - (opposite/hypotenuse)^2 + // cos()^2 = 1.0 - sin(angle between c2c1 and c2t1)^2 + // cos()^2 = 1.0 - sin(angle between axis and c2t1)^2 + cIt->sqrCosine = 1.0f - cloth::sqr(axis.w * invAxisHalfLength); + cIt->halfLength = axisHalfLength; uint32_t firstMask = 0x1u << iIt->first; cIt->firstMask = firstMask; @@ -407,12 +418,14 @@ void cloth::SwCollision<T4f>::buildSphereAcceleration(const SphereData* sIt) { static const int maxIndex = sGridSize - 1; + uint32_t mask = 0x1; //single bit mask for current sphere const SphereData* sEnd = sIt + mClothData.mNumSpheres; - for (uint32_t mask = 0x1; sIt != sEnd; ++sIt, mask <<= 1) + for (; sIt != sEnd; ++sIt, mask <<= 
1) { T4f sphere = loadAligned(array(sIt->center)); T4f radius = splat<3>(sphere); + //calculate the first and last cell index, for each axis, that contains the sphere T4i first = intFloor(max((sphere - radius) * mGridScale + mGridBias, gSimd4fZero)); T4i last = intFloor(min((sphere + radius) * mGridScale + mGridBias, sGridLength)); @@ -422,11 +435,14 @@ void cloth::SwCollision<T4f>::buildSphereAcceleration(const SphereData* sIt) uint32_t* firstIt = reinterpret_cast<uint32_t*>(mSphereGrid); uint32_t* lastIt = firstIt + 3 * sGridSize; + //loop through the 3 axes for (uint32_t i = 0; i < 3; ++i, firstIt += sGridSize, lastIt += sGridSize) { + //mark the sphere and everything to the right for (int j = firstIdx[i]; j <= maxIndex; ++j) firstIt[j] |= mask; + //mark the sphere and everything to the left for (int j = lastIdx[i]; j >= 0; --j) lastIt[j] |= mask; } @@ -469,17 +485,23 @@ void cloth::SwCollision<T4f>::mergeAcceleration(uint32_t* firstIt) template <typename T4f> bool cloth::SwCollision<T4f>::buildAcceleration() { - // determine sphere bbox + // determine single bounding box around all spheres BoundingBox<T4f> sphereBounds = expandBounds(emptyBounds<T4f>(), mCurData.mSpheres, mCurData.mSpheres + mClothData.mNumSpheres); + + // determine single bounding box around all particles BoundingBox<T4f> particleBounds = loadBounds<T4f>(mClothData.mCurBounds); + if (mClothData.mEnableContinuousCollision) { + // extend bounds to include movement from previous frame sphereBounds = expandBounds(sphereBounds, mPrevData.mSpheres, mPrevData.mSpheres + mClothData.mNumSpheres); particleBounds = expandBounds(particleBounds, loadBounds<T4f>(mClothData.mPrevBounds)); } BoundingBox<T4f> bounds = intersectBounds(sphereBounds, particleBounds); + + // no collision checks needed if the intersection between particle bounds and sphere bounds is empty T4f edgeLength = (bounds.mUpper - bounds.mLower) & ~static_cast<T4f>(sMaskW); if (!allGreaterEqual(edgeLength, gSimd4fZero)) return false; @@ 
-490,6 +512,7 @@ bool cloth::SwCollision<T4f>::buildAcceleration() const T4f expandedEdgeLength = max(expandedUpper - expandedLower, gSimd4fEpsilon); // make grid minimal thickness and strict upper bound of spheres + // grid maps bounds to 0-7 space (sGridLength =~= 8) mGridScale = sGridLength * recip<1>(expandedEdgeLength); mGridBias = -expandedLower * mGridScale; array(mGridBias)[3] = 1.0f; // needed for collideVirtualParticles() @@ -655,8 +678,8 @@ struct cloth::SwCollision<T4f>::ImpulseAccumulator mNumCollisions = mNumCollisions + (gSimd4fOne & mask); } - T4f mDeltaX, mDeltaY, mDeltaZ; - T4f mVelX, mVelY, mVelZ; + T4f mDeltaX, mDeltaY, mDeltaZ; //depenetration delta + T4f mVelX, mVelY, mVelZ; //frame offset of the collision shape (velocity * dt) T4f mNumCollisions; }; @@ -684,12 +707,15 @@ FORCE_INLINE void cloth::SwCollision<T4f>::collideSpheres(const T4i& sphereMask, T4f sqrDistance = gSimd4fEpsilon + deltaX * deltaX + deltaY * deltaY + deltaZ * deltaZ; T4f negativeScale = gSimd4fOne - rsqrt(sqrDistance) * splat<3>(sphere); + // negativeScale = 1 - radius/|position-sphere| T4f contactMask; if (!anyGreater(gSimd4fZero, negativeScale, contactMask)) continue; accum.subtract(deltaX, deltaY, deltaZ, negativeScale, contactMask); + // -= delta * negativeScale + // = delta - delta * radius/|position-sphere| if (frictionEnabled) { @@ -730,10 +756,13 @@ cloth::SwCollision<T4f>::collideCones(const T4f* __restrict positions, ImpulseAc T4f center = loadAligned(centerPtr, offset); + // offset from center of cone to particle + // delta = pos - center T4f deltaX = positions[0] - splat<0>(center); T4f deltaY = positions[1] - splat<1>(center); T4f deltaZ = positions[2] - splat<2>(center); + //axis of the cone T4f axis = loadAligned(axisPtr, offset); T4f axisX = splat<0>(axis); @@ -741,12 +770,16 @@ cloth::SwCollision<T4f>::collideCones(const T4f* __restrict positions, ImpulseAc T4f axisZ = splat<2>(axis); T4f slope = splat<3>(axis); + // project delta onto axis T4f dot = 
deltaX * axisX + deltaY * axisY + deltaZ * axisZ; + // interpolate radius T4f radius = dot * slope + splat<3>(center); // set radius to zero if cone is culled radius = max(radius, gSimd4fZero) & ~culled; + // distance to axis + // sqrDistance = |delta|^2 - |dot|^2 T4f sqrDistance = deltaX * deltaX + deltaY * deltaY + deltaZ * deltaZ - dot * dot; T4i auxiliary = loadAligned(auxiliaryPtr, offset); @@ -765,6 +798,8 @@ cloth::SwCollision<T4f>::collideCones(const T4f* __restrict positions, ImpulseAc sqrDistance = max(sqrDistance, gSimd4fEpsilon); T4f invDistance = rsqrt(sqrDistance); + + //offset base to take slope in to account T4f base = dot + slope * sqrDistance * invDistance; // force left/rightMask to false if not inside cone @@ -780,6 +815,7 @@ cloth::SwCollision<T4f>::collideCones(const T4f* __restrict positions, ImpulseAc shapeMask.mSpheres = shapeMask.mSpheres & ~(firstMask & ~leftMask); shapeMask.mSpheres = shapeMask.mSpheres & ~(secondMask & ~rightMask); + //contact normal direction deltaX = deltaX - base * axisX; deltaY = deltaY - base * axisY; deltaZ = deltaZ - base * axisZ; @@ -1173,7 +1209,9 @@ PX_INLINE void calculateFrictionImpulse(const T4f& deltaX, const T4f& deltaY, co T4f ny = deltaY * rcpDelta; T4f nz = deltaZ * rcpDelta; - // calculate relative velocity scaled by number of collisions + // calculate relative velocity + // velXYZ is scaled by one over the number of collisions since all collisions accumulate into + // that variable during collision detection T4f rvx = curPos[0] - prevPos[0] - velX * scale; T4f rvy = curPos[1] - prevPos[1] - velY * scale; T4f rvz = curPos[2] - prevPos[2] - velZ * scale; @@ -1186,7 +1224,7 @@ PX_INLINE void calculateFrictionImpulse(const T4f& deltaX, const T4f& deltaY, co T4f rvty = rvy - rvn * ny; T4f rvtz = rvz - rvn * nz; - // calculate magnitude of vt + // calculate magnitude of relative tangential velocity T4f rcpVt = rsqrt(rvtx * rvtx + rvty * rvty + rvtz * rvtz + gSimd4fEpsilon); // magnitude of friction impulse 
(cannot be greater than -vt) @@ -1206,7 +1244,7 @@ void cloth::SwCollision<T4f>::collideParticles() const T4f massScale = simd4f(mClothData.mCollisionMassScale); const bool frictionEnabled = mClothData.mFrictionScale > 0.0f; - const T4f frictionScale = simd4f(mClothData.mFrictionScale); + const T4f frictionScale = simd4f(mClothData.mFrictionScale); //parameter set by user T4f curPos[4]; T4f prevPos[4]; @@ -1214,16 +1252,20 @@ void cloth::SwCollision<T4f>::collideParticles() float* __restrict prevIt = mClothData.mPrevParticles; float* __restrict pIt = mClothData.mCurParticles; float* __restrict pEnd = pIt + mClothData.mNumParticles * 4; + //loop over particles 4 at a time for (; pIt < pEnd; pIt += 16, prevIt += 16) { curPos[0] = loadAligned(pIt, 0); curPos[1] = loadAligned(pIt, 16); curPos[2] = loadAligned(pIt, 32); curPos[3] = loadAligned(pIt, 48); - transpose(curPos[0], curPos[1], curPos[2], curPos[3]); + transpose(curPos[0], curPos[1], curPos[2], curPos[3]); //group values by axis in simd structure ImpulseAccumulator accum; + + //first collide cones T4i sphereMask = collideCones(curPos, accum); + //pass on hit mask to ignore sphere parts that are inside the cones collideSpheres(sphereMask, curPos, accum); T4f mask; @@ -1267,6 +1309,7 @@ void cloth::SwCollision<T4f>::collideParticles() curPos[3] = select(mask, curPos[3] * scale, curPos[3]); } + //apply average de-penetration delta curPos[0] = curPos[0] + accum.mDeltaX * invNumCollisions; curPos[1] = curPos[1] + accum.mDeltaY * invNumCollisions; curPos[2] = curPos[2] + accum.mDeltaZ * invNumCollisions; diff --git a/NvCloth/src/SwInterCollision.cpp b/NvCloth/src/SwInterCollision.cpp index b9b494f..50be414 100644 --- a/NvCloth/src/SwInterCollision.cpp +++ b/NvCloth/src/SwInterCollision.cpp @@ -39,6 +39,7 @@ using namespace nv; using namespace physx; +using namespace cloth; namespace { diff --git a/NvCloth/src/SwSelfCollision.cpp b/NvCloth/src/SwSelfCollision.cpp index 095943d..ec5a166 100644 --- 
a/NvCloth/src/SwSelfCollision.cpp +++ b/NvCloth/src/SwSelfCollision.cpp @@ -37,23 +37,24 @@ #endif using namespace nv; +using namespace cloth; namespace { -const Simd4fTupleFactory sMaskXYZ = simd4f(simd4i(~0, ~0, ~0, 0)); - // returns sorted indices, output needs to be at least 2*(last - first) + 1024 void radixSort(const uint32_t* first, const uint32_t* last, uint16_t* out) { + // this sort uses a radix (bin) size of 256, requiring 4 bins to sort the 32 bit keys uint16_t n = uint16_t(last - first); uint16_t* buffer = out + 2 * n; uint16_t* __restrict histograms[] = { buffer, buffer + 256, buffer + 512, buffer + 768 }; + //zero the buffer memory used for the 4 buckets memset(buffer, 0, 1024 * sizeof(uint16_t)); - // build 3 histograms in one pass + // build 4 histograms in one pass for (const uint32_t* __restrict it = first; it != last; ++it) { uint32_t key = *it; @@ -64,7 +65,7 @@ void radixSort(const uint32_t* first, const uint32_t* last, uint16_t* out) } // convert histograms to offset tables in-place - uint16_t sums[4] = {}; + uint16_t sums[4] = {0, 0, 0, 0}; for (uint32_t i = 0; i < 256; ++i) { uint16_t temp0 = uint16_t(histograms[0][i] + sums[0]); @@ -133,6 +134,7 @@ bool isSelfCollisionEnabled(const cloth::SwCloth& cloth) return std::min(cloth.mSelfCollisionDistance, -cloth.mSelfCollisionLogStiffness) > 0.0f; } +// align x to a 2 byte boundary inline uint32_t align2(uint32_t x) { return (x + 1) & ~1; @@ -146,7 +148,7 @@ cloth::SwSelfCollision<T4f>::SwSelfCollision(cloth::SwClothData& clothData, clot { mCollisionDistance = simd4f(mClothData.mSelfCollisionDistance); mCollisionSquareDistance = mCollisionDistance * mCollisionDistance; - mStiffness = sMaskXYZ & static_cast<T4f>(simd4f(mClothData.mSelfCollisionStiffness)); + mStiffness = gSimd4fMaskXYZ & static_cast<T4f>(simd4f(mClothData.mSelfCollisionStiffness)); } template <typename T4f> @@ -170,11 +172,12 @@ void cloth::SwSelfCollision<T4f>::operator()() uint32_t hashAxis0 = (sweepAxis + 1) % 3; uint32_t 
hashAxis1 = (sweepAxis + 2) % 3; - // reserve 0, 127, and 65535 for sentinel + // reserve 0, 255, and 65535 for sentinel T4f cellSize = max(mCollisionDistance, simd4f(1.0f / 253) * edgeLength); array(cellSize)[sweepAxis] = array(edgeLength)[sweepAxis] / 65533; T4f one = gSimd4fOne; + // +1 for sentinel 0 offset T4f gridSize = simd4f(254.0f); array(gridSize)[sweepAxis] = 65534.0f; @@ -194,6 +197,7 @@ void cloth::SwSelfCollision<T4f>::operator()() // create keys for (uint32_t i = 0; i < numIndices; ++i) { + // use all particles when no self collision indices are set uint32_t index = indices ? indices[i] : i; // grid coordinate @@ -207,28 +211,32 @@ void cloth::SwSelfCollision<T4f>::operator()() keys[i] = uint32_t(ptr[sweepAxis] | (ptr[hashAxis0] << 16) | (ptr[hashAxis1] << 24)); } - // compute sorted keys indices + // compute sorted key indices radixSort(keys, keys + numIndices, sortedIndices); // snoop histogram: offset of first index with 8 msb > 1 (0 is sentinel) - uint16_t firstColumnSize = sortedIndices[2 * numIndices + 769]; + // sortedIndices[2 * numIndices + 768 + 1] is actually histograms[3]+1 from radixSort + uint16_t firstColumnSize = sortedIndices[2 * numIndices + 768 + 1]; - // sort keys + // sort keys using the sortedIndices for (uint32_t i = 0; i < numIndices; ++i) sortedKeys[i] = keys[sortedIndices[i]]; sortedKeys[numIndices] = uint32_t(-1); // sentinel + // do user provided index array indirection here if we have one + // so we don't need to keep branching for this condition later if (indices) { // sort indices (into no-longer-needed keys array) - const uint16_t* __restrict permutation = sortedIndices; + // the keys array is no longer used so we can reuse it to store indices[sortedIndices[i]] + const uint16_t* __restrict oldSortedIndices = sortedIndices; sortedIndices = reinterpret_cast<uint16_t*>(keys); for (uint32_t i = 0; i < numIndices; ++i) - sortedIndices[i] = uint16_t(indices[permutation[i]]); + sortedIndices[i] = 
uint16_t(indices[oldSortedIndices[i]]); } // calculate the number of buckets we need to search forward - const Simd4i data = intFloor(gridScale * mCollisionDistance); + const Simd4i data = intFloor(gridScale * mCollisionDistance); //equal to or larger than floor(mCollisionDistance) uint32_t collisionDistance = 2 + static_cast<uint32_t>(array(data)[sweepAxis]); // collide particles @@ -310,7 +318,7 @@ void cloth::SwSelfCollision<T4f>::collideParticles(T4f& pos0, T4f& pos1, const T T4f ratio = mCollisionDistance * rsqrt(distSqr); T4f scale = mStiffness * recip(gSimd4fEpsilon + w0 + w1); - T4f delta = (scale * (diff - diff * ratio)) & sMaskXYZ; + T4f delta = (scale * (diff - diff * ratio)) & gSimd4fMaskXYZ; pos0 = pos0 + delta * w0; pos1 = pos1 - delta * w1; @@ -325,42 +333,71 @@ template <bool useRestParticles> void cloth::SwSelfCollision<T4f>::collideParticles(const uint32_t* keys, uint16_t firstColumnSize, const uint16_t* indices, uint32_t collisionDistance) { + //keys is an array of bucket keys for the particles + //indices is an array of particle indices + //collisionDistance is the number of buckets along the sweep axis we need to search after the current one + T4f* __restrict particles = reinterpret_cast<T4f*>(mClothData.mCurParticles); T4f* __restrict restParticles = useRestParticles ? 
reinterpret_cast<T4f*>(mClothData.mRestPositions) : particles; - const uint32_t bucketMask = uint16_t(-1); + //16 lsb's are for the bucket + const uint32_t bucketMask = 0x0000ffff; + // offsets for cells (not along the sweep axis) + // [1] [3]-[1] [3] [1]+[3] const uint32_t keyOffsets[] = { 0, 0x00010000, 0x00ff0000, 0x01000000, 0x01010000 }; const uint32_t* __restrict kFirst[5]; const uint32_t* __restrict kLast[5]; + /* + We use 5 first/last pairs to search the following cells + ===================== + | | | | | | + ===================== + | | | 0 | 1 | | + ===================== + | | 2 | 3 | 4 | | + ===================== + | | | | | | + ===================== + With 0 as the origin. + This way collisions won't be double reported. + */ + { // optimization: scan forward iterator starting points once instead of 9 times const uint32_t* __restrict kIt = keys; uint32_t key = *kIt; + //clamp first/lastKey to bucket uint32_t firstKey = key - std::min(collisionDistance, key & bucketMask); uint32_t lastKey = std::min(key + collisionDistance, key | bucketMask); + //sweep 0 kFirst[0] = kIt; + //find next key in keys that is past lastKey while (*kIt < lastKey) ++kIt; kLast[0] = kIt; + //sweep 1...4 for (uint32_t k = 1; k < 5; ++k) { + // scan forward start point for (uint32_t n = firstKey + keyOffsets[k]; *kIt < n;) ++kIt; kFirst[k] = kIt; + // scan forward end point for (uint32_t n = lastKey + keyOffsets[k]; *kIt < n;) ++kIt; kLast[k] = kIt; - // jump forward once to second column - kIt = keys + firstColumnSize; + // jump forward once to second column to go from cell offset 1 to 2 quickly + if(firstColumnSize) + kIt = keys + firstColumnSize; firstColumnSize = 0; } } @@ -371,7 +408,8 @@ void cloth::SwSelfCollision<T4f>::collideParticles(const uint32_t* keys, uint16_ const uint16_t* __restrict jIt; const uint16_t* __restrict jEnd; - for (; iIt != iEnd; ++iIt, ++kFirst[0]) + //loop through all indices + for (; iIt < iEnd; ++iIt, ++kFirst[0]) { NV_CLOTH_ASSERT(*iIt < 
mClothData.mNumParticles); @@ -390,8 +428,8 @@ void cloth::SwSelfCollision<T4f>::collideParticles(const uint32_t* keys, uint16_ ++kLast[0]; // process potential colliders of same cell - jEnd = indices + (kLast[0] - keys); - for (jIt = iIt + 1; jIt != jEnd; ++jIt) + jEnd = indices + (kLast[0] - keys); //calculate index from key pointer + for (jIt = iIt + 1; jIt < jEnd; ++jIt) collideParticles<useRestParticles>(particle, particles[*jIt], restParticle, restParticles[*jIt]); // process neighbor cells @@ -407,7 +445,7 @@ void cloth::SwSelfCollision<T4f>::collideParticles(const uint32_t* keys, uint16_ // process potential colliders jEnd = indices + (kLast[k] - keys); - for (jIt = indices + (kFirst[k] - keys); jIt != jEnd; ++jIt) + for (jIt = indices + (kFirst[k] - keys); jIt < jEnd; ++jIt) collideParticles<useRestParticles>(particle, particles[*jIt], restParticle, restParticles[*jIt]); } diff --git a/NvCloth/src/SwSolver.cpp b/NvCloth/src/SwSolver.cpp index c7437e1..f0f9152 100644 --- a/NvCloth/src/SwSolver.cpp +++ b/NvCloth/src/SwSolver.cpp @@ -50,7 +50,7 @@ bool neonSolverKernel(SwCloth const&, SwClothData&, SwKernelAllocator&, Iteratio } using namespace nv; - +using namespace cloth; #if NV_SIMD_SIMD typedef Simd4f Simd4fType; #else @@ -93,12 +93,17 @@ void sortTasks(shdfnd::Array<T, cloth::NonTrackingAllocator>& tasks) void cloth::SwSolver::addCloth(Cloth* cloth) { - SwCloth& swCloth = *static_cast<SwCloth*>(cloth); - - mSimulatedCloths.pushBack(SimulatedCloth(swCloth, this)); + addClothAppend(cloth); sortTasks(mSimulatedCloths); +} - mCloths.pushBack(&swCloth); +void cloth::SwSolver::addCloths(Range<Cloth*> cloths) +{ + for (uint32_t i = 0; i < cloths.size(); ++i) + { + addClothAppend(*(cloths.begin() + i)); + } + sortTasks(mSimulatedCloths); } void cloth::SwSolver::removeCloth(Cloth* cloth) @@ -221,6 +226,16 @@ void cloth::SwSolver::interCollision() collider(); } +void cloth::SwSolver::addClothAppend(Cloth* cloth) +{ + SwCloth& swCloth = 
*static_cast<SwCloth*>(cloth); + NV_CLOTH_ASSERT(mCloths.find(&swCloth) == mCloths.end()); + + mSimulatedCloths.pushBack(SimulatedCloth(swCloth, this)); + + mCloths.pushBack(&swCloth); +} + void cloth::SwSolver::beginFrame() const { mSimulateProfileEventData = NV_CLOTH_PROFILE_START_CROSSTHREAD("cloth::SwSolver::simulate", 0); @@ -287,9 +302,14 @@ void cloth::SwSolver::SimulatedCloth::Simulate() // construct kernel functor and execute #if NV_ANDROID - // if (!neonSolverKernel(cloth, data, allocator, factory)) -#endif + if (!neonSolverKernel(*mCloth, data, allocator, factory)) + { + //NV_CLOTH_LOG_WARNING("No NEON CPU support detected. Falling back to scalar types."); + SwSolverKernel<Scalar4f>(*mCloth, data, allocator, factory)(); + } +#else SwSolverKernel<Simd4fType>(*mCloth, data, allocator, factory)(); +#endif data.reconcile(*mCloth); // update cloth } diff --git a/NvCloth/src/SwSolver.h b/NvCloth/src/SwSolver.h index c7b177b..ad58a7c 100644 --- a/NvCloth/src/SwSolver.h +++ b/NvCloth/src/SwSolver.h @@ -64,6 +64,7 @@ class SwSolver : public Solver virtual ~SwSolver(); virtual void addCloth(Cloth*) override; + virtual void addCloths(Range<Cloth*> cloths) override; virtual void removeCloth(Cloth*) override; virtual int getNumCloths() const override; virtual Cloth * const * getClothList() const override; @@ -112,6 +113,10 @@ class SwSolver : public Solver } private: + // add cloth helper functions + void addClothAppend(Cloth* cloth); + + // simulate helper functions void beginFrame() const; void endFrame() const; diff --git a/NvCloth/src/SwSolverKernel.cpp b/NvCloth/src/SwSolverKernel.cpp index eec7956..2181b1e 100644 --- a/NvCloth/src/SwSolverKernel.cpp +++ b/NvCloth/src/SwSolverKernel.cpp @@ -103,6 +103,7 @@ const uint32_t sAvxSupport = getAvxSupport(); // 0: no AVX, 1: AVX, 2: AVX+FMA #endif using namespace nv; +using namespace cloth; namespace { @@ -209,11 +210,14 @@ void constrainMotion(T4f* __restrict curIt, const T4f* __restrict curEnd, const T4f isPositive; 
if (anyGreater(slack, gSimd4fZero, isPositive)) { - // set invMass to zero if radius is zero + // set invMass to zero if radius is zero (xyz will be unchanged) + // curPos.w = radius > 0 ? curPos.w : 0 + // the first three components are compared against -FLT_MAX which is always true curPos0 = curPos0 & (splat<0>(radius) > sMinusFloatMaxXYZ); curPos1 = curPos1 & (splat<1>(radius) > sMinusFloatMaxXYZ); curPos2 = curPos2 & (splat<2>(radius) > sMinusFloatMaxXYZ); curPos3 = curPos3 & ((radius) > sMinusFloatMaxXYZ); + // we don't have to splat the last one as the 4th element is already in the right place slack = slack * stiffness & isPositive; @@ -367,7 +371,7 @@ void solveConstraints(float* __restrict posIt, const float* __restrict rIt, cons } } -#if PX_WINDOWS_FAMILY +#if PX_WINDOWS_FAMILY && NV_SIMD_SSE2 #include "sse2/SwSolveConstraints.h" #endif diff --git a/NvCloth/src/cuda/CuCloth.cpp b/NvCloth/src/cuda/CuCloth.cpp index 3e6175b..4131b04 100644 --- a/NvCloth/src/cuda/CuCloth.cpp +++ b/NvCloth/src/cuda/CuCloth.cpp @@ -423,7 +423,7 @@ void CuCloth::clearParticleAccelerations() { CuContextLock contextLock(mFactory); CuDeviceVector<PxVec4>(mFactory.mContext).swap(mParticleAccelerations); - CuHostVector<PxVec4, CU_MEMHOSTALLOC_DEVICEMAP>::Type().swap(mParticleAccelerationsHostCopy); + CuHostVector<PxVec4, CU_MEMHOSTALLOC_DEVICEMAP>::Type(mFactory.mContext).swap(mParticleAccelerationsHostCopy); wakeUp(); } diff --git a/NvCloth/src/cuda/CuClothData.h b/NvCloth/src/cuda/CuClothData.h index dd836fd..5f2d7c6 100644 --- a/NvCloth/src/cuda/CuClothData.h +++ b/NvCloth/src/cuda/CuClothData.h @@ -32,7 +32,7 @@ #include <foundation/Px.h> #ifndef __CUDACC__ -#include "simd.h" +#include "Simd.h" #endif namespace nv diff --git a/NvCloth/src/cuda/CuFabric.cpp b/NvCloth/src/cuda/CuFabric.cpp index 9bc20db..957f912 100644 --- a/NvCloth/src/cuda/CuFabric.cpp +++ b/NvCloth/src/cuda/CuFabric.cpp @@ -31,6 +31,7 @@ #include "CuContextLock.h" #include "CuFactory.h" #include <PsUtilities.h> 
+#include <limits.h> using namespace physx; diff --git a/NvCloth/src/cuda/CuPinnedAllocator.h b/NvCloth/src/cuda/CuPinnedAllocator.h index 8b1787b..9939324 100644 --- a/NvCloth/src/cuda/CuPinnedAllocator.h +++ b/NvCloth/src/cuda/CuPinnedAllocator.h @@ -29,6 +29,8 @@ #pragma once +#include <utility> + #include "CuCheckSuccess.h" #include "NvCloth/Allocator.h" @@ -102,7 +104,7 @@ public: void destroy(T* ptr) { - core::unused(ptr); + PX_UNUSED(ptr); ptr->~T(); } @@ -122,13 +124,13 @@ bool operator!=(const CuHostAllocator<T1, Flag1>&, const CuHostAllocator<T2, Fla return false; } -//Use CuHostVectorImpl instead of physx::shdfnd::Array<T, typename CuHostAllocator<T, Flags>> +//Use CuHostVectorImpl instead of physx::shdfnd::Array<T, CuHostAllocator<T, Flags>> //This entire class is just to make sure that the mDevicePtr from the CuHostAllocator is properly swapped together with mData template <typename T, unsigned Flags = 0> -class CuHostVectorImpl : public physx::shdfnd::Array<T, typename CuHostAllocator<T, Flags>> +class CuHostVectorImpl : public physx::shdfnd::Array<T, CuHostAllocator<T, Flags>> { - typedef physx::shdfnd::Array<T, typename CuHostAllocator<T, Flags>> Super; - typedef typename CuHostAllocator<T, Flags> Alloc; + typedef physx::shdfnd::Array<T, CuHostAllocator<T, Flags>> Super; + typedef CuHostAllocator<T, Flags> Alloc; public: explicit CuHostVectorImpl(const physx::PxEMPTY v):Super(v){} PX_INLINE explicit CuHostVectorImpl(const Alloc& alloc = Alloc()):Super(alloc){} @@ -142,10 +144,10 @@ public: PX_INLINE explicit CuHostVectorImpl(const T* first, const T* last, const Alloc& alloc = Alloc()):Super(first,last,alloc){} - void swap(physx::shdfnd::Array<T, typename CuHostAllocator<T, Flags>>& other) + void swap(CuHostVectorImpl<T, Flags>& other) { - PX_ASSERT(mContext == other.mContext); - physx::shdfnd::swap(mDevicePtr, other.mDevicePtr); + NV_CLOTH_ASSERT(this->mContext == other.mContext); + physx::shdfnd::swap(this->mDevicePtr, other.mDevicePtr); 
Super::swap(other); } }; diff --git a/NvCloth/src/cuda/CuSolver.cpp b/NvCloth/src/cuda/CuSolver.cpp index f0e328f..7ef1d32 100644 --- a/NvCloth/src/cuda/CuSolver.cpp +++ b/NvCloth/src/cuda/CuSolver.cpp @@ -302,6 +302,28 @@ cloth::CuSolver::~CuSolver() mFactory.mSolverCount--; } +void cloth::CuSolver::addClothAppend(Cloth* cloth) +{ + CuCloth& cuCloth = *static_cast<CuCloth*>(cloth); + + NV_CLOTH_ASSERT(mCloths.find(&cuCloth) == mCloths.end()); + + mCloths.pushBack(&cuCloth); + // trigger update of mClothData array + cuCloth.notifyChanged(); +} + +void cloth::CuSolver::addClothUpdateData() +{ + CuContextLock contextLock(mFactory); + + // resize containers and update kernel data + mClothDataHostCopy.resize(mCloths.size()); + mClothData.resize(mCloths.size()); + mFrameData.resize(mCloths.size()); + updateKernelData(); +} + void cloth::CuSolver::updateKernelData() { mKernelDataHost.mClothIndex = mClothIndex.get(); @@ -326,24 +348,17 @@ struct ClothSimCostGreater void cloth::CuSolver::addCloth(Cloth* cloth) { - CuCloth& cuCloth = *static_cast<CuCloth*>(cloth); - - NV_CLOTH_ASSERT(mCloths.find(&cuCloth) == mCloths.end()); - - mCloths.pushBack(&cuCloth); - // trigger update of mClothData array - cuCloth.notifyChanged(); - - // sort cloth instances by size - shdfnd::sort(mCloths.begin(), mCloths.size(), ClothSimCostGreater(), NonTrackingAllocator()); - - CuContextLock contextLock(mFactory); + addClothAppend(cloth); + addClothUpdateData(); +} - // resize containers and update kernel data - mClothDataHostCopy.resize(mCloths.size()); - mClothData.resize(mCloths.size()); - mFrameData.resize(mCloths.size()); - updateKernelData(); +void cloth::CuSolver::addCloths(Range<Cloth*> cloths) +{ + for (uint32_t i = 0; i < cloths.size(); ++i) + { + addClothAppend(*(cloths.begin() + i)); + } + addClothUpdateData(); } void cloth::CuSolver::removeCloth(Cloth* cloth) @@ -401,7 +416,8 @@ void cloth::CuSolver::endSimulation() int cloth::CuSolver::getSimulationChunkCount() const { - return 1; + 
// 0 chunks when no cloth present in the solver, 1 otherwise + return getNumCloths() != 0; } void cloth::CuSolver::beginFrame() diff --git a/NvCloth/src/cuda/CuSolver.h b/NvCloth/src/cuda/CuSolver.h index 0406e00..b4c6d6b 100644 --- a/NvCloth/src/cuda/CuSolver.h +++ b/NvCloth/src/cuda/CuSolver.h @@ -58,6 +58,7 @@ public: ~CuSolver(); virtual void addCloth(Cloth*) override; + virtual void addCloths(Range<Cloth*> cloths) override; virtual void removeCloth(Cloth*) override; virtual int getNumCloths() const override; virtual Cloth * const * getClothList() const override; @@ -103,6 +104,10 @@ public: } private: + // add cloth helper functions + void addClothAppend(Cloth* cloth); + void addClothUpdateData(); + void updateKernelData(); // context needs to be acquired // simulate helper functions diff --git a/NvCloth/src/dx/DxBatchedVector.h b/NvCloth/src/dx/DxBatchedVector.h index 2c5e313..76b9b22 100644 --- a/NvCloth/src/dx/DxBatchedVector.h +++ b/NvCloth/src/dx/DxBatchedVector.h @@ -297,7 +297,7 @@ class DxBatchedVector void swap(DxBatchedVector<T>& other) { - PX_ASSERT(&mStorage == &other.mStorage); + NV_CLOTH_ASSERT(&mStorage == &other.mStorage); physx::shdfnd::swap(mOffset, other.mOffset); physx::shdfnd::swap(mSize, other.mSize); physx::shdfnd::swap(mCapacity, other.mCapacity); diff --git a/NvCloth/src/dx/DxClothData.h b/NvCloth/src/dx/DxClothData.h index f91d37d..4da9be2 100644 --- a/NvCloth/src/dx/DxClothData.h +++ b/NvCloth/src/dx/DxClothData.h @@ -31,7 +31,7 @@ #pragma once #include <foundation/Px.h> -#include "simd.h" +#include "Simd.h" namespace nv { diff --git a/NvCloth/src/dx/DxFactory.cpp b/NvCloth/src/dx/DxFactory.cpp index fbf0c51..91f5125 100644 --- a/NvCloth/src/dx/DxFactory.cpp +++ b/NvCloth/src/dx/DxFactory.cpp @@ -251,15 +251,15 @@ void cloth::DxFactory::extractFabricData(const Fabric& fabric, Range<uint32_t> p void cloth::DxFactory::extractCollisionData(const Cloth& cloth, Range<PxVec4> spheres, Range<uint32_t> capsules, Range<PxVec4> planes, 
Range<uint32_t> convexes, Range<PxVec3> triangles) const { - PX_ASSERT(&cloth.getFactory() == this); + NV_CLOTH_ASSERT(&cloth.getFactory() == this); const DxCloth& dxCloth = static_cast<const DxCloth&>(cloth); - PX_ASSERT(spheres.empty() || spheres.size() == dxCloth.mStartCollisionSpheres.size()); - PX_ASSERT(capsules.empty() || capsules.size() == dxCloth.mCapsuleIndices.size() * 2); - PX_ASSERT(planes.empty() || planes.size() == dxCloth.mStartCollisionPlanes.size()); - PX_ASSERT(convexes.empty() || convexes.size() == dxCloth.mConvexMasks.size()); - PX_ASSERT(triangles.empty() || triangles.size() == dxCloth.mStartCollisionTriangles.size()); + NV_CLOTH_ASSERT(spheres.empty() || spheres.size() == dxCloth.mStartCollisionSpheres.size()); + NV_CLOTH_ASSERT(capsules.empty() || capsules.size() == dxCloth.mCapsuleIndices.size() * 2); + NV_CLOTH_ASSERT(planes.empty() || planes.size() == dxCloth.mStartCollisionPlanes.size()); + NV_CLOTH_ASSERT(convexes.empty() || convexes.size() == dxCloth.mConvexMasks.size()); + NV_CLOTH_ASSERT(triangles.empty() || triangles.size() == dxCloth.mStartCollisionTriangles.size()); // collision spheres are in pinned memory, so memcpy directly if (!dxCloth.mStartCollisionSpheres.empty() && !spheres.empty()) @@ -296,13 +296,13 @@ void cloth::DxFactory::extractFabricData(const Fabric& fabric, Range<uint32_t> p void cloth::DxFactory::extractMotionConstraints(const Cloth& cloth, Range<PxVec4> destConstraints) const { - PX_ASSERT(&cloth.getFactory() == this); + NV_CLOTH_ASSERT(&cloth.getFactory() == this); const DxCloth& dxCloth = static_cast<const DxCloth&>(cloth); if (dxCloth.mMotionConstraints.mHostCopy.size()) { - PX_ASSERT(destConstraints.size() == dxCloth.mMotionConstraints.mHostCopy.size()); + NV_CLOTH_ASSERT(destConstraints.size() == dxCloth.mMotionConstraints.mHostCopy.size()); memcpy(destConstraints.begin(), dxCloth.mMotionConstraints.mHostCopy.begin(), sizeof(PxVec4) * dxCloth.mMotionConstraints.mHostCopy.size()); @@ -315,20 +315,20 @@ void 
cloth::DxFactory::extractFabricData(const Fabric& fabric, Range<uint32_t> p ? dxCloth.mMotionConstraints.mTarget : dxCloth.mMotionConstraints.mStart; - PX_ASSERT(destConstraints.size() == srcConstraints.size()); + NV_CLOTH_ASSERT(destConstraints.size() == srcConstraints.size()); copyToHost(destConstraints.begin(), srcConstraints.buffer(), 0, destConstraints.size() * sizeof(PxVec4)); } } void cloth::DxFactory::extractSeparationConstraints(const Cloth& cloth, Range<PxVec4> destConstraints) const { - PX_ASSERT(&cloth.getFactory() == this); + NV_CLOTH_ASSERT(&cloth.getFactory() == this); const DxCloth& dxCloth = static_cast<const DxCloth&>(cloth); if (dxCloth.mSeparationConstraints.mHostCopy.size()) { - PX_ASSERT(destConstraints.size() == dxCloth.mSeparationConstraints.mHostCopy.size()); + NV_CLOTH_ASSERT(destConstraints.size() == dxCloth.mSeparationConstraints.mHostCopy.size()); memcpy(destConstraints.begin(), dxCloth.mSeparationConstraints.mHostCopy.begin(), sizeof(PxVec4) * dxCloth.mSeparationConstraints.mHostCopy.size()); @@ -341,7 +341,7 @@ void cloth::DxFactory::extractFabricData(const Fabric& fabric, Range<uint32_t> p ? 
dxCloth.mSeparationConstraints.mTarget : dxCloth.mSeparationConstraints.mStart; - PX_ASSERT(destConstraints.size() == srcConstraints.size()); + NV_CLOTH_ASSERT(destConstraints.size() == srcConstraints.size()); copyToHost(destConstraints.begin(), srcConstraints.buffer(), 0, destConstraints.size() * sizeof(PxVec4)); } @@ -350,12 +350,12 @@ void cloth::DxFactory::extractFabricData(const Fabric& fabric, Range<uint32_t> p void cloth::DxFactory::extractParticleAccelerations(const Cloth& cloth, Range<PxVec4> destAccelerations) const { /* - PX_ASSERT(&cloth.getFactory() == this); + NV_CLOTH_ASSERT(&cloth.getFactory() == this); const DxCloth& dxCloth = static_cast<const DxClothImpl&>(cloth).mCloth; if (dxCloth.mParticleAccelerationsHostCopy.size()) { - PX_ASSERT(dxCloth.mParticleAccelerationsHostCopy.size()); + NV_CLOTH_ASSERT(dxCloth.mParticleAccelerationsHostCopy.size()); memcpy(destAccelerations.begin(), dxCloth.mParticleAccelerationsHostCopy.begin(), sizeof(PxVec4) * dxCloth.mParticleAccelerationsHostCopy.size()); @@ -366,20 +366,20 @@ void cloth::DxFactory::extractFabricData(const Fabric& fabric, Range<uint32_t> p DxBatchedVector<PxVec4> const& srcAccelerations = dxCloth.mParticleAccelerations; - PX_ASSERT(destAccelerations.size() == srcAccelerations.size()); + NV_CLOTH_ASSERT(destAccelerations.size() == srcAccelerations.size()); copyToHost(destAccelerations.begin(), srcAccelerations.buffer(), 0, destAccelerations.size() * sizeof(PxVec4)); } */ PX_UNUSED(&cloth); PX_UNUSED(&destAccelerations); - PX_ASSERT(0); + NV_CLOTH_ASSERT(0); } void cloth::DxFactory::extractVirtualParticles(const Cloth& cloth, Range<uint32_t[4]> destIndices, Range<PxVec3> destWeights) const { - PX_ASSERT(&cloth.getFactory() == this); + NV_CLOTH_ASSERT(&cloth.getFactory() == this); DxContextLock contextLock(*this); @@ -400,7 +400,7 @@ void cloth::DxFactory::extractFabricData(const Fabric& fabric, Range<uint32_t> p for (; srcIt != srcEnd; ++srcIt, ++destIt) *destIt = reinterpret_cast<const 
PxVec3&>(*srcIt); - PX_ASSERT(destIt <= destWeights.end()); + NV_CLOTH_ASSERT(destIt <= destWeights.end()); } if (destIndices.size() > 0) @@ -418,14 +418,14 @@ void cloth::DxFactory::extractFabricData(const Fabric& fabric, Range<uint32_t> p for (; srcIt != srcEnd; ++srcIt, ++destIt) *destIt = Vec4u(*srcIt); - PX_ASSERT(&array(*destIt) <= destIndices.end()); + NV_CLOTH_ASSERT(&array(*destIt) <= destIndices.end()); } } void cloth::DxFactory::extractSelfCollisionIndices(const Cloth& cloth, Range<uint32_t> destIndices) const { const DxCloth& dxCloth = static_cast<const DxCloth&>(cloth); - PX_ASSERT(destIndices.size() == dxCloth.mSelfCollisionIndices.size()); + NV_CLOTH_ASSERT(destIndices.size() == dxCloth.mSelfCollisionIndices.size()); intrinsics::memCopy(destIndices.begin(), dxCloth.mSelfCollisionIndicesHost.begin(), destIndices.size() * sizeof(uint32_t)); } @@ -433,7 +433,7 @@ void cloth::DxFactory::extractFabricData(const Fabric& fabric, Range<uint32_t> p void cloth::DxFactory::extractRestPositions(const Cloth& cloth, Range<PxVec4> destRestPositions) const { const DxCloth& dxCloth = static_cast<const DxCloth&>(cloth); - PX_ASSERT(destRestPositions.size() == dxCloth.mRestPositions.size()); + NV_CLOTH_ASSERT(destRestPositions.size() == dxCloth.mRestPositions.size()); intrinsics::memCopy(destRestPositions.begin(), DxCloth::MappedVec4fVectorType(const_cast<DxCloth&>(dxCloth).mRestPositions).begin(), destRestPositions.size() * sizeof(PxVec4)); } diff --git a/NvCloth/src/dx/DxSolver.cpp b/NvCloth/src/dx/DxSolver.cpp index ab030d5..66a8d8f 100644 --- a/NvCloth/src/dx/DxSolver.cpp +++ b/NvCloth/src/dx/DxSolver.cpp @@ -113,26 +113,9 @@ struct ClothSimCostGreater void cloth::DxSolver::addCloth(Cloth* cloth) { - DxCloth& dxCloth = static_cast<DxCloth&>(*cloth); - - NV_CLOTH_ASSERT(mCloths.find(&dxCloth) == mCloths.end()); - - mCloths.pushBack(&dxCloth); - // trigger update of mClothData array - dxCloth.notifyChanged(); - - // sort cloth instances by size - 
shdfnd::sort(mCloths.begin(), mCloths.size(), ClothSimCostGreater(), NonTrackingAllocator()); - - DxContextLock contextLock(mFactory); - - // resize containers and update kernel data - mClothDataHostCopy.resize(mCloths.size()); - mClothData.resize(mCloths.size()); - mFrameDataHostCopy.resize(mCloths.size()); - - // lazy compilation of compute shader - mComputeError |= mFactory.mSolverKernelComputeShader == nullptr; + addClothAppend(cloth); + addClothUpdateData(); + #if 0 if (!mSortComputeShader && !mComputeError) { @@ -197,7 +180,7 @@ void cloth::DxSolver::addCloth(Cloth* cloth) { uint32_t key = sortElems[i] & ~0xffff; uint32_t keyRef = _SortElemsRef[i] & ~0xffff; - PX_ASSERT(key == keyRef); + NV_CLOTH_ASSERT(key == keyRef); } _SortElemsHostCopy.unmap(); } @@ -208,6 +191,15 @@ void cloth::DxSolver::addCloth(Cloth* cloth) #endif } +void cloth::DxSolver::addCloths(Range<Cloth*> cloths) +{ + for (uint32_t i = 0; i < cloths.size(); ++i) + { + addClothAppend(*(cloths.begin() + i)); + } + addClothUpdateData(); +} + void cloth::DxSolver::removeCloth(Cloth* cloth) { DxCloth& dxCloth = static_cast<DxCloth&>(*cloth); @@ -232,10 +224,9 @@ int cloth::DxSolver::getNumCloths() const } cloth::Cloth * const * cloth::DxSolver::getClothList() const { - if(getNumCloths()) + if (getNumCloths() != 0) return reinterpret_cast<Cloth* const*>(&mCloths[0]); - else - return nullptr; + return nullptr; } bool cloth::DxSolver::beginSimulation(float dt) @@ -260,7 +251,34 @@ void cloth::DxSolver::endSimulation() } int cloth::DxSolver::getSimulationChunkCount() const { - return 1; + // 0 chunks when no cloth present in the solver, 1 otherwise + return getNumCloths() != 0; +} + +void cloth::DxSolver::addClothAppend(Cloth* cloth) +{ + DxCloth& dxCloth = static_cast<DxCloth&>(*cloth); + NV_CLOTH_ASSERT(mCloths.find(&dxCloth) == mCloths.end()); + + mCloths.pushBack(&dxCloth); + // trigger update of mClothData array + dxCloth.notifyChanged(); +} + +void cloth::DxSolver::addClothUpdateData() +{ + // 
sort cloth instances by size + shdfnd::sort(mCloths.begin(), mCloths.size(), ClothSimCostGreater(), NonTrackingAllocator()); + + DxContextLock contextLock(mFactory); + + // resize containers and update kernel data + mClothDataHostCopy.resize(mCloths.size()); + mClothData.resize(mCloths.size()); + mFrameDataHostCopy.resize(mCloths.size()); + + // lazy compilation of compute shader + mComputeError |= mFactory.mSolverKernelComputeShader == nullptr; } void cloth::DxSolver::beginFrame() diff --git a/NvCloth/src/dx/DxSolver.h b/NvCloth/src/dx/DxSolver.h index 09f523a..07d77dc 100644 --- a/NvCloth/src/dx/DxSolver.h +++ b/NvCloth/src/dx/DxSolver.h @@ -56,6 +56,7 @@ class DxSolver : private DxContextLock, public Solver ~DxSolver(); virtual void addCloth(Cloth*) override; + virtual void addCloths(Range<Cloth*> cloths) override; virtual void removeCloth(Cloth*) override; virtual int getNumCloths() const override; virtual Cloth * const * getClothList() const override; @@ -101,6 +102,10 @@ class DxSolver : private DxContextLock, public Solver } private: + // add cloth helper functions + void addClothAppend(Cloth* cloth); + void addClothUpdateData(); + // simulate helper functions void beginFrame(); void executeKernel(); diff --git a/NvCloth/src/neon/NeonSolverKernel.cpp b/NvCloth/src/neon/NeonSolverKernel.cpp index 4d6de68..3e16b6f 100644 --- a/NvCloth/src/neon/NeonSolverKernel.cpp +++ b/NvCloth/src/neon/NeonSolverKernel.cpp @@ -35,15 +35,19 @@ #include <cpu-features.h> -namespace physx +namespace +{ + const bool sNeonSupport = ANDROID_CPU_ARM_FEATURE_NEON & android_getCpuFeatures(); +} + +namespace nv { namespace cloth { bool neonSolverKernel(SwCloth const& cloth, SwClothData& data, SwKernelAllocator& allocator, - IterationStateFactory& factory, PxProfileZone* profileZone) + IterationStateFactory& factory) { - return ANDROID_CPU_ARM_FEATURE_NEON & android_getCpuFeatures() && - (SwSolverKernel<Simd4f>(cloth, data, allocator, factory, profileZone)(), true); + return sNeonSupport 
&& (SwSolverKernel<Simd4f>(cloth, data, allocator, factory)(), true); } } } diff --git a/NvCloth/src/scalar/SwCollisionHelpers.h b/NvCloth/src/scalar/SwCollisionHelpers.h index af21812..3ab756f 100644 --- a/NvCloth/src/scalar/SwCollisionHelpers.h +++ b/NvCloth/src/scalar/SwCollisionHelpers.h @@ -29,6 +29,8 @@ #pragma once +#include "PsMathUtils.h" + namespace nv { namespace cloth @@ -46,6 +48,7 @@ uint32_t findBitSet(uint32_t mask) inline Scalar4i intFloor(const Scalar4f& v) { + using physx::shdfnd::floor; return Scalar4i(int(floor(v.f4[0])), int(floor(v.f4[1])), int(floor(v.f4[2])), int(floor(v.f4[3]))); } diff --git a/NvCloth/src/sse2/SwCollisionHelpers.h b/NvCloth/src/sse2/SwCollisionHelpers.h index c80ba1d..b759868 100644 --- a/NvCloth/src/sse2/SwCollisionHelpers.h +++ b/NvCloth/src/sse2/SwCollisionHelpers.h @@ -63,12 +63,15 @@ Simd4i intFloor(const Simd4f& v) { Simd4i i = _mm_cvttps_epi32(v); return _mm_sub_epi32(i, _mm_srli_epi32(simd4i(v), 31)); + //Simd4i i = truncate(v); + //return i - (simd4i(v) >> 31); } Simd4i horizontalOr(const Simd4i& mask) { Simd4i tmp = mask | _mm_shuffle_epi32(mask, 0xb1); // w z y x -> z w x y return tmp | _mm_shuffle_epi32(tmp, 0x4e); // w z y x -> y x w z +// return splat<0>(mask) | splat<1>(mask) | splat<2>(mask) | splat<3>(mask); } Gather<Simd4i>::Gather(const Simd4i& index) |