From 3dfe2108cfab31ba3ee5527e217d0d8e99a51162 Mon Sep 17 00:00:00 2001
From: git perforce import user <a@b>
Date: Tue, 25 Oct 2016 12:29:14 -0600
Subject: Initial commit: PhysX 3.4.0 Update @ 21294896 APEX 1.4.0 Update @
 21275617

[CL 21300167]
---
 PhysX_3.4/Source/LowLevelCloth/src/SwCollision.cpp | 1935 ++++++++++++++++++++
 1 file changed, 1935 insertions(+)
 create mode 100644 PhysX_3.4/Source/LowLevelCloth/src/SwCollision.cpp

(limited to 'PhysX_3.4/Source/LowLevelCloth/src/SwCollision.cpp')
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/SwCollision.cpp b/PhysX_3.4/Source/LowLevelCloth/src/SwCollision.cpp
new file mode 100644
index 00000000..e505289f
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/SwCollision.cpp
@@ -0,0 +1,1935 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#include "foundation/PxProfiler.h"
+#include "foundation/PxAssert.h"
+#include "SwCollision.h"
+#include "SwCloth.h"
+#include "SwClothData.h"
+#include "IterationState.h"
+#include "BoundingBox.h"
+#include "PointInterpolator.h"
+#include "SwCollisionHelpers.h"
+#include <cstring> // for memset
+
+using namespace physx;
+
+// the particle trajectory needs to penetrate more than 0.2 * radius to trigger continuous collision
+template <typename Simd4f>
+const Simd4f cloth::SwCollision<Simd4f>::sSkeletonWidth = simd4f(cloth::sqr(1 - 0.2f) - 1);
+
+#if NV_SIMD_SSE2
+const Simd4i cloth::Gather<Simd4i>::sIntSignBit = simd4i(0x80000000);
+const Simd4i cloth::Gather<Simd4i>::sSignedMask = sIntSignBit | simd4i(0x7);
+#elif NV_SIMD_NEON
+const Simd4i cloth::Gather<Simd4i>::sPack = simd4i(0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c);
+const Simd4i cloth::Gather<Simd4i>::sOffset = simd4i(0x03020100);
+const Simd4i cloth::Gather<Simd4i>::sShift = simd4i(2);
+const Simd4i cloth::Gather<Simd4i>::sMask = simd4i(7);
+#endif
+
+namespace
+{
+const Simd4fTupleFactory sMaskX = simd4f(simd4i(~0, 0, 0, 0));
+const Simd4fTupleFactory sMaskZ = simd4f(simd4i(0, 0, ~0, 0));
+const Simd4fTupleFactory sMaskW = simd4f(simd4i(0, 0, 0, ~0));
+const Simd4fTupleFactory gSimd4fOneXYZ = simd4f(1.0f, 1.0f, 1.0f, 0.0f);
+const Simd4fScalarFactory sGridLength = simd4f(8 - 1e-3f); // sGridSize
+const Simd4fScalarFactory sGridExpand = simd4f(1e-4f);
+const Simd4fTupleFactory sMinusFloatMaxXYZ = simd4f(-FLT_MAX, -FLT_MAX, -FLT_MAX, 0.0f);
+
+#if PX_PROFILE || PX_DEBUG
+template <typename Simd4f>
+uint32_t horizontalSum(const Simd4f& x)
+{
+	const float* p = array(x);
+	return uint32_t(0.5f + p[0] + p[1] + p[2] + p[3]);
+}
+#endif
+
+// 7 elements are written to ptr!
+template <typename Simd4f>
+void storeBounds(float* ptr, const cloth::BoundingBox<Simd4f>& bounds)
+{
+	store(ptr, bounds.mLower);
+	store(ptr + 3, bounds.mUpper);
+}
+}
+
+struct cloth::SphereData
+{
+	PxVec3 center;
+	float radius;
+};
+
+struct cloth::ConeData
+{
+	PxVec3 center;
+	float radius; // cone radius at center
+	PxVec3 axis;
+	float slope; // tan(alpha)
+
+	float sqrCosine; // cos^2(alpha)
+	float halfLength;
+
+	uint32_t firstMask;
+	uint32_t bothMask;
+};
+
+struct cloth::TriangleData
+{
+	PxVec3 base;
+	float edge0DotEdge1;
+
+	PxVec3 edge0;
+	float edge0SqrLength;
+
+	PxVec3 edge1;
+	float edge1SqrLength;
+
+	PxVec3 normal;
+	float padding;
+
+	float det;
+	float denom;
+
+	float edge0InvSqrLength;
+	float edge1InvSqrLength;
+};
+
+namespace physx
+{
+namespace cloth
+{
+template <typename Simd4f>
+BoundingBox<Simd4f> expandBounds(const BoundingBox<Simd4f>& bbox, const SphereData* sIt, const SphereData* sEnd)
+{
+	BoundingBox<Simd4f> result = bbox;
+	for(; sIt != sEnd; ++sIt)
+	{
+		Simd4f p = loadAligned(array(sIt->center));
+		Simd4f r = splat<3>(p);
+		result.mLower = min(result.mLower, p - r);
+		result.mUpper = max(result.mUpper, p + r);
+	}
+	return result;
+}
+}
+}
+
+namespace
+{
+template <typename Simd4f, typename SrcIterator>
+void generateSpheres(Simd4f* dIt, const SrcIterator& src, uint32_t count)
+{
+	// have to copy out iterator to ensure alignment is maintained
+	for(SrcIterator sIt = src; 0 < count--; ++sIt, ++dIt)
+		*dIt = max(sMinusFloatMaxXYZ, *sIt); // clamp radius to 0
+}
+
+void generateCones(cloth::ConeData* dst, const cloth::SphereData* sourceSpheres, const cloth::IndexPair* capsuleIndices,
+                   uint32_t numCones)
+{
+	cloth::ConeData* cIt = dst;
+	for(const cloth::IndexPair* iIt = capsuleIndices, *iEnd = iIt + numCones; iIt != iEnd; ++iIt, ++cIt)
+	{
+		PxVec4 first = reinterpret_cast<const PxVec4&>(sourceSpheres[iIt->first]);
+		PxVec4 second = reinterpret_cast<const PxVec4&>(sourceSpheres[iIt->second]);
+
+		PxVec4 center = (second + first) * 0.5f;
+		PxVec4 axis = (second - first) * 0.5f;
+
+		float sqrAxisLength = axis.x * axis.x + axis.y * axis.y + axis.z * axis.z;
+		float sqrConeLength = sqrAxisLength - cloth::sqr(axis.w);
+
+		float invAxisLength = 1 / sqrtf(sqrAxisLength);
+		float invConeLength = 1 / sqrtf(sqrConeLength);
+
+		if(sqrConeLength <= 0.0f)
+			invAxisLength = invConeLength = 0.0f;
+
+		float axisLength = sqrAxisLength * invAxisLength;
+		float slope = axis.w * invConeLength;
+
+		cIt->center = PxVec3(center.x, center.y, center.z);
+		cIt->radius = (axis.w + first.w) * invConeLength * axisLength;
+		cIt->axis = PxVec3(axis.x, axis.y, axis.z) * invAxisLength;
+		cIt->slope = slope;
+
+		cIt->sqrCosine = 1.0f - cloth::sqr(axis.w * invAxisLength);
+		cIt->halfLength = axisLength;
+
+		uint32_t firstMask = 0x1u << iIt->first;
+		cIt->firstMask = firstMask;
+		cIt->bothMask = firstMask | 0x1u << iIt->second;
+	}
+}
+
+template <typename Simd4f, typename SrcIterator>
+void generatePlanes(Simd4f* dIt, const SrcIterator& src, uint32_t count)
+{
+	// have to copy out iterator to ensure alignment is maintained
+	for(SrcIterator sIt = src; 0 < count--; ++sIt, ++dIt)
+		*dIt = *sIt;
+}
+
+template <typename Simd4f, typename SrcIterator>
+void generateTriangles(cloth::TriangleData* dIt, const SrcIterator& src, uint32_t count)
+{
+	// have to copy out iterator to ensure alignment is maintained
+	for(SrcIterator sIt = src; 0 < count--; ++dIt)
+	{
+		Simd4f p0 = *sIt;
+		++sIt;
+		Simd4f p1 = *sIt;
+		++sIt;
+		Simd4f p2 = *sIt;
+		++sIt;
+
+		Simd4f edge0 = p1 - p0;
+		Simd4f edge1 = p2 - p0;
+		Simd4f normal = cross3(edge0, edge1);
+
+		Simd4f edge0SqrLength = dot3(edge0, edge0);
+		Simd4f edge1SqrLength = dot3(edge1, edge1);
+		Simd4f edge0DotEdge1 = dot3(edge0, edge1);
+		Simd4f normalInvLength = rsqrt(dot3(normal, normal));
+
+		Simd4f det = edge0SqrLength * edge1SqrLength - edge0DotEdge1 * edge0DotEdge1;
+		Simd4f denom = edge0SqrLength + edge1SqrLength - edge0DotEdge1 - edge0DotEdge1;
+
+		// there are definitely faster ways...
+		Simd4f aux = select(sMaskX, det, denom);
+		aux = select(sMaskZ, edge0SqrLength, aux);
+		aux = select(sMaskW, edge1SqrLength, aux);
+
+		storeAligned(&dIt->base.x, select(sMaskW, edge0DotEdge1, p0));
+		storeAligned(&dIt->edge0.x, select(sMaskW, edge0SqrLength, edge0));
+		storeAligned(&dIt->edge1.x, select(sMaskW, edge1SqrLength, edge1));
+		storeAligned(&dIt->normal.x, normal * normalInvLength);
+		storeAligned(&dIt->det, recip<1>(aux));
+	}
+}
+
+} // namespace
+
+template <typename Simd4f>
+cloth::SwCollision<Simd4f>::CollisionData::CollisionData()
+: mSpheres(0), mCones(0)
+{
+}
+
+template <typename Simd4f>
+cloth::SwCollision<Simd4f>::SwCollision(SwClothData& clothData, SwKernelAllocator& alloc)
+: mClothData(clothData), mAllocator(alloc)
+{
+	allocate(mCurData);
+
+	if(mClothData.mEnableContinuousCollision || mClothData.mFrictionScale > 0.0f)
+	{
+		allocate(mPrevData);
+
+		generateSpheres(reinterpret_cast<Simd4f*>(mPrevData.mSpheres),
+		                reinterpret_cast<const Simd4f*>(clothData.mStartCollisionSpheres), clothData.mNumSpheres);
+
+		generateCones(mPrevData.mCones, mPrevData.mSpheres, clothData.mCapsuleIndices, clothData.mNumCapsules);
+	}
+}
+
+template <typename Simd4f>
+cloth::SwCollision<Simd4f>::~SwCollision()
+{
+	deallocate(mCurData);
+	deallocate(mPrevData);
+}
+
+template <typename Simd4f>
+void cloth::SwCollision<Simd4f>::operator()(const IterationState<Simd4f>& state)
+{
+	mNumCollisions = 0;
+
+	collideConvexes(state);  // discrete convex collision, no friction
+	collideTriangles(state); // discrete triangle collision, no friction
+
+	computeBounds();
+
+	if(!mClothData.mNumSpheres)
+		return;
+
+	bool lastIteration = state.mRemainingIterations == 1;
+
+	const Simd4f* targetSpheres = reinterpret_cast<const Simd4f*>(mClothData.mTargetCollisionSpheres);
+
+	// generate sphere and cone collision data
+	if(!lastIteration)
+	{
+		// interpolate spheres
+		LerpIterator<Simd4f, const Simd4f*> pIter(reinterpret_cast<const Simd4f*>(mClothData.mStartCollisionSpheres),
+		                                          targetSpheres, state.getCurrentAlpha());
+		generateSpheres(reinterpret_cast<Simd4f*>(mCurData.mSpheres), pIter, mClothData.mNumSpheres);
+	}
+	else
+	{
+		// otherwise use the target spheres directly
+		generateSpheres(reinterpret_cast<Simd4f*>(mCurData.mSpheres), targetSpheres, mClothData.mNumSpheres);
+	}
+
+	// generate cones even if test below fails because
+	// continuous collision might need it in next iteration
+	generateCones(mCurData.mCones, mCurData.mSpheres, mClothData.mCapsuleIndices, mClothData.mNumCapsules);
+
+	if(buildAcceleration())
+	{
+		if(mClothData.mEnableContinuousCollision)
+			collideContinuousParticles();
+
+		mergeAcceleration(reinterpret_cast<uint32_t*>(mSphereGrid));
+		mergeAcceleration(reinterpret_cast<uint32_t*>(mConeGrid));
+
+		if(!mClothData.mEnableContinuousCollision)
+			collideParticles();
+
+		collideVirtualParticles();
+	}
+
+	if(mPrevData.mSpheres)
+		shdfnd::swap(mCurData, mPrevData);
+}
+
+template <typename Simd4f>
+size_t cloth::SwCollision<Simd4f>::estimateTemporaryMemory(const SwCloth& cloth)
+{
+	size_t numTriangles = cloth.mStartCollisionTriangles.size();
+	size_t numPlanes = cloth.mStartCollisionPlanes.size();
+
+	const size_t kTriangleDataSize = sizeof(TriangleData) * numTriangles;
+	const size_t kPlaneDataSize = sizeof(PxVec4) * numPlanes * 2;
+
+	return PxMax(kTriangleDataSize, kPlaneDataSize);
+}
+
+template <typename Simd4f>
+size_t cloth::SwCollision<Simd4f>::estimatePersistentMemory(const SwCloth& cloth)
+{
+	size_t numCapsules = cloth.mCapsuleIndices.size();
+	size_t numSpheres = cloth.mStartCollisionSpheres.size();
+
+	size_t sphereDataSize = sizeof(SphereData) * numSpheres * 2;
+	size_t coneDataSize = sizeof(ConeData) * numCapsules * 2;
+
+	return sphereDataSize + coneDataSize;
+}
+
+template <typename Simd4f>
+void cloth::SwCollision<Simd4f>::allocate(CollisionData& data)
+{
+	data.mSpheres = static_cast<SphereData*>(mAllocator.allocate(sizeof(SphereData) * mClothData.mNumSpheres));
+
+	data.mCones = static_cast<ConeData*>(mAllocator.allocate(sizeof(ConeData) * mClothData.mNumCapsules));
+}
+
+template <typename Simd4f>
+void cloth::SwCollision<Simd4f>::deallocate(const CollisionData& data)
+{
+	mAllocator.deallocate(data.mSpheres);
+	mAllocator.deallocate(data.mCones);
+}
+
+template <typename Simd4f>
+void cloth::SwCollision<Simd4f>::computeBounds()
+{
+	PX_PROFILE_ZONE("cloth::SwSolverKernel::computeBounds", 0);
+
+	Simd4f* prevIt = reinterpret_cast<Simd4f*>(mClothData.mPrevParticles);
+	Simd4f* curIt = reinterpret_cast<Simd4f*>(mClothData.mCurParticles);
+	Simd4f* curEnd = curIt + mClothData.mNumParticles;
+	Simd4f floatMaxXYZ = -static_cast<Simd4f>(sMinusFloatMaxXYZ);
+
+	Simd4f lower = simd4f(FLT_MAX), upper = -lower;
+	for(; curIt < curEnd; ++curIt, ++prevIt)
+	{
+		Simd4f current = *curIt;
+		lower = min(lower, current);
+		upper = max(upper, current);
+		// if(current.w > 0) current.w = previous.w
+		*curIt = select(current > floatMaxXYZ, *prevIt, current);
+	}
+
+	BoundingBox<Simd4f> curBounds;
+	curBounds.mLower = lower;
+	curBounds.mUpper = upper;
+
+	// don't change this order, storeBounds writes 7 floats
+	BoundingBox<Simd4f> prevBounds = loadBounds<Simd4f>(mClothData.mCurBounds);
+	storeBounds(mClothData.mCurBounds, curBounds);
+	storeBounds(mClothData.mPrevBounds, prevBounds);
+}
+
+namespace
+{
+template <typename Simd4i>
+Simd4i andNotIsZero(const Simd4i& left, const Simd4i& right)
+{
+	return (left & ~right) == gSimd4iZero;
+}
+}
+
+// build per-axis mask arrays of spheres on the right/left of grid cell
+template <typename Simd4f>
+void cloth::SwCollision<Simd4f>::buildSphereAcceleration(const SphereData* sIt)
+{
+	static const int maxIndex = sGridSize - 1;
+
+	const SphereData* sEnd = sIt + mClothData.mNumSpheres;
+	for(uint32_t mask = 0x1; sIt != sEnd; ++sIt, mask <<= 1)
+	{
+		Simd4f sphere = loadAligned(array(sIt->center));
+		Simd4f radius = splat<3>(sphere);
+
+		Simd4i first = intFloor(max((sphere - radius) * mGridScale + mGridBias, gSimd4fZero));
+		Simd4i last = intFloor(min((sphere + radius) * mGridScale + mGridBias, sGridLength));
+
+		const int* firstIdx = array(first);
+		const int* lastIdx = array(last);
+
+		uint32_t* firstIt = reinterpret_cast<uint32_t*>(mSphereGrid);
+		uint32_t* lastIt = firstIt + 3 * sGridSize;
+
+		for(uint32_t i = 0; i < 3; ++i, firstIt += sGridSize, lastIt += sGridSize)
+		{
+			for(int j = firstIdx[i]; j <= maxIndex; ++j)
+				firstIt[j] |= mask;
+
+			for(int j = lastIdx[i]; j >= 0; --j)
+				lastIt[j] |= mask;
+		}
+	}
+}
+
+// generate cone masks from sphere masks
+template <typename Simd4f>
+void cloth::SwCollision<Simd4f>::buildConeAcceleration()
+{
+	const ConeData* coneIt = mCurData.mCones;
+	const ConeData* coneEnd = coneIt + mClothData.mNumCapsules;
+	for(uint32_t coneMask = 0x1; coneIt != coneEnd; ++coneIt, coneMask <<= 1)
+	{
+		if(coneIt->radius == 0.0f)
+			continue;
+
+		uint32_t spheresMask = coneIt->bothMask;
+
+		uint32_t* sphereIt = reinterpret_cast<uint32_t*>(mSphereGrid);
+		uint32_t* sphereEnd = sphereIt + 6 * sGridSize;
+		uint32_t* gridIt = reinterpret_cast<uint32_t*>(mConeGrid);
+		for(; sphereIt != sphereEnd; ++sphereIt, ++gridIt)
+			if(*sphereIt & spheresMask)
+				*gridIt |= coneMask;
+	}
+}
+
+// convert right/left mask arrays into single overlap array
+template <typename Simd4f>
+void cloth::SwCollision<Simd4f>::mergeAcceleration(uint32_t* firstIt)
+{
+	uint32_t* firstEnd = firstIt + 3 * sGridSize;
+	uint32_t* lastIt = firstEnd;
+	for(; firstIt != firstEnd; ++firstIt, ++lastIt)
+		*firstIt &= *lastIt;
+}
+
+// build mask of spheres/cones touching a regular grid along each axis
+template <typename Simd4f>
+bool cloth::SwCollision<Simd4f>::buildAcceleration()
+{
+	// determine sphere bbox
+	BoundingBox<Simd4f> sphereBounds =
+	    expandBounds(emptyBounds<Simd4f>(), mCurData.mSpheres, mCurData.mSpheres + mClothData.mNumSpheres);
+	BoundingBox<Simd4f> particleBounds = loadBounds<Simd4f>(mClothData.mCurBounds);
+	if(mClothData.mEnableContinuousCollision)
+	{
+		sphereBounds = expandBounds(sphereBounds, mPrevData.mSpheres, mPrevData.mSpheres + mClothData.mNumSpheres);
+		particleBounds = expandBounds(particleBounds, loadBounds<Simd4f>(mClothData.mPrevBounds));
+	}
+
+	BoundingBox<Simd4f> bounds = intersectBounds(sphereBounds, particleBounds);
+	Simd4f edgeLength = (bounds.mUpper - bounds.mLower) & ~static_cast<Simd4f>(sMaskW);
+	if(!allGreaterEqual(edgeLength, gSimd4fZero))
+		return false;
+
+	// calculate an expanded bounds to account for numerical inaccuracy
+	const Simd4f expandedLower = bounds.mLower - abs(bounds.mLower) * sGridExpand;
+	const Simd4f expandedUpper = bounds.mUpper + abs(bounds.mUpper) * sGridExpand;
+	const Simd4f expandedEdgeLength = max(expandedUpper - expandedLower, gSimd4fEpsilon);
+
+	// make grid minimal thickness and strict upper bound of spheres
+	mGridScale = sGridLength * recip<1>(expandedEdgeLength);
+	mGridBias = -expandedLower * mGridScale;
+	array(mGridBias)[3] = 1.0f; // needed for collideVirtualParticles()
+
+	PX_ASSERT(allTrue(((bounds.mLower * mGridScale + mGridBias) >= simd4f(0.0f)) | sMaskW));
+	PX_ASSERT(allTrue(((bounds.mUpper * mGridScale + mGridBias) < simd4f(8.0f)) | sMaskW));
+
+	memset(mSphereGrid, 0, sizeof(uint32_t) * 6 * (sGridSize));
+	if(mClothData.mEnableContinuousCollision)
+		buildSphereAcceleration(mPrevData.mSpheres);
+	buildSphereAcceleration(mCurData.mSpheres);
+
+	memset(mConeGrid, 0, sizeof(uint32_t) * 6 * (sGridSize));
+	buildConeAcceleration();
+
+	return true;
+}
+
+#ifdef _MSC_VER
+#define FORCE_INLINE __forceinline
+#else
+#define FORCE_INLINE inline __attribute__((always_inline))
+#endif
+
+template <typename Simd4f>
+FORCE_INLINE typename cloth::SwCollision<Simd4f>::ShapeMask& cloth::SwCollision<Simd4f>::ShapeMask::
+operator=(const ShapeMask& right)
+{
+	mCones = right.mCones;
+	mSpheres = right.mSpheres;
+	return *this;
+}
+
+template <typename Simd4f>
+FORCE_INLINE typename cloth::SwCollision<Simd4f>::ShapeMask& cloth::SwCollision<Simd4f>::ShapeMask::
+operator&=(const ShapeMask& right)
+{
+	mCones = mCones & right.mCones;
+	mSpheres = mSpheres & right.mSpheres;
+	return *this;
+}
+
+template <typename Simd4f>
+FORCE_INLINE typename cloth::SwCollision<Simd4f>::ShapeMask
+cloth::SwCollision<Simd4f>::getShapeMask(const Simd4f& position, const Simd4i* __restrict sphereGrid,
+                                         const Simd4i* __restrict coneGrid)
+{
+	Gather<Simd4i> gather(intFloor(position));
+
+	ShapeMask result;
+	result.mCones = gather(coneGrid);
+	result.mSpheres = gather(sphereGrid);
+	return result;
+}
+
+// lookup acceleration structure and return mask of potential intersectors
+template <typename Simd4f>
+FORCE_INLINE typename cloth::SwCollision<Simd4f>::ShapeMask
+cloth::SwCollision<Simd4f>::getShapeMask(const Simd4f* __restrict positions) const
+{
+	Simd4f posX = positions[0] * splat<0>(mGridScale) + splat<0>(mGridBias);
+	Simd4f posY = positions[1] * splat<1>(mGridScale) + splat<1>(mGridBias);
+	Simd4f posZ = positions[2] * splat<2>(mGridScale) + splat<2>(mGridBias);
+
+	ShapeMask result = getShapeMask(posX, mSphereGrid, mConeGrid);
+	result &= getShapeMask(posY, mSphereGrid + 2, mConeGrid + 2);
+	result &= getShapeMask(posZ, mSphereGrid + 4, mConeGrid + 4);
+
+	return result;
+}
+
+// lookup acceleration structure and return mask of potential intersectors
+template <typename Simd4f>
+FORCE_INLINE typename cloth::SwCollision<Simd4f>::ShapeMask
+cloth::SwCollision<Simd4f>::getShapeMask(const Simd4f* __restrict prevPos, const Simd4f* __restrict curPos) const
+{
+	Simd4f scaleX = splat<0>(mGridScale);
+	Simd4f scaleY = splat<1>(mGridScale);
+	Simd4f scaleZ = splat<2>(mGridScale);
+
+	Simd4f biasX = splat<0>(mGridBias);
+	Simd4f biasY = splat<1>(mGridBias);
+	Simd4f biasZ = splat<2>(mGridBias);
+
+	Simd4f prevX = prevPos[0] * scaleX + biasX;
+	Simd4f prevY = prevPos[1] * scaleY + biasY;
+	Simd4f prevZ = prevPos[2] * scaleZ + biasZ;
+
+	Simd4f curX = curPos[0] * scaleX + biasX;
+	Simd4f curY = curPos[1] * scaleY + biasY;
+	Simd4f curZ = curPos[2] * scaleZ + biasZ;
+
+	Simd4f maxX = min(max(prevX, curX), sGridLength);
+	Simd4f maxY = min(max(prevY, curY), sGridLength);
+	Simd4f maxZ = min(max(prevZ, curZ), sGridLength);
+
+	ShapeMask result = getShapeMask(maxX, mSphereGrid, mConeGrid);
+	result &= getShapeMask(maxY, mSphereGrid + 2, mConeGrid + 2);
+	result &= getShapeMask(maxZ, mSphereGrid + 4, mConeGrid + 4);
+
+	Simd4f zero = gSimd4fZero;
+	Simd4f minX = max(min(prevX, curX), zero);
+	Simd4f minY = max(min(prevY, curY), zero);
+	Simd4f minZ = max(min(prevZ, curZ), zero);
+
+	result &= getShapeMask(minX, mSphereGrid + 6, mConeGrid + 6);
+	result &= getShapeMask(minY, mSphereGrid + 8, mConeGrid + 8);
+	result &= getShapeMask(minZ, mSphereGrid + 10, mConeGrid + 10);
+
+	return result;
+}
+
+template <typename Simd4f>
+struct cloth::SwCollision<Simd4f>::ImpulseAccumulator
+{
+	ImpulseAccumulator()
+	: mDeltaX(gSimd4fZero)
+	, mDeltaY(mDeltaX)
+	, mDeltaZ(mDeltaX)
+	, mVelX(mDeltaX)
+	, mVelY(mDeltaX)
+	, mVelZ(mDeltaX)
+	, mNumCollisions(gSimd4fEpsilon)
+	{
+	}
+
+	void add(const Simd4f& x, const Simd4f& y, const Simd4f& z, const Simd4f& scale, const Simd4f& mask)
+	{
+		PX_ASSERT(allTrue((mask & x) == (mask & x)));
+		PX_ASSERT(allTrue((mask & y) == (mask & y)));
+		PX_ASSERT(allTrue((mask & z) == (mask & z)));
+		PX_ASSERT(allTrue((mask & scale) == (mask & scale)));
+
+		Simd4f maskedScale = scale & mask;
+		mDeltaX = mDeltaX + x * maskedScale;
+		mDeltaY = mDeltaY + y * maskedScale;
+		mDeltaZ = mDeltaZ + z * maskedScale;
+		mNumCollisions = mNumCollisions + (gSimd4fOne & mask);
+	}
+
+	void addVelocity(const Simd4f& vx, const Simd4f& vy, const Simd4f& vz, const Simd4f& mask)
+	{
+		PX_ASSERT(allTrue((mask & vx) == (mask & vx)));
+		PX_ASSERT(allTrue((mask & vy) == (mask & vy)));
+		PX_ASSERT(allTrue((mask & vz) == (mask & vz)));
+
+		mVelX = mVelX + (vx & mask);
+		mVelY = mVelY + (vy & mask);
+		mVelZ = mVelZ + (vz & mask);
+	}
+
+	void subtract(const Simd4f& x, const Simd4f& y, const Simd4f& z, const Simd4f& scale, const Simd4f& mask)
+	{
+		PX_ASSERT(allTrue((mask & x) == (mask & x)));
+		PX_ASSERT(allTrue((mask & y) == (mask & y)));
+		PX_ASSERT(allTrue((mask & z) == (mask & z)));
+		PX_ASSERT(allTrue((mask & scale) == (mask & scale)));
+
+		Simd4f maskedScale = scale & mask;
+		mDeltaX = mDeltaX - x * maskedScale;
+		mDeltaY = mDeltaY - y * maskedScale;
+		mDeltaZ = mDeltaZ - z * maskedScale;
+		mNumCollisions = mNumCollisions + (gSimd4fOne & mask);
+	}
+
+	Simd4f mDeltaX, mDeltaY, mDeltaZ;
+	Simd4f mVelX, mVelY, mVelZ;
+	Simd4f mNumCollisions;
+};
+
+template <typename Simd4f>
+FORCE_INLINE void cloth::SwCollision<Simd4f>::collideSpheres(const Simd4i& sphereMask, const Simd4f* positions,
+                                                             ImpulseAccumulator& accum) const
+{
+	const float* __restrict spherePtr = array(mCurData.mSpheres->center);
+
+	bool frictionEnabled = mClothData.mFrictionScale > 0.0f;
+
+	Simd4i mask4 = horizontalOr(sphereMask);
+	uint32_t mask = uint32_t(array(mask4)[0]);
+	while(mask)
+	{
+		uint32_t test = mask - 1;
+		uint32_t offset = findBitSet(mask & ~test) * sizeof(SphereData);
+		mask = mask & test;
+
+		Simd4f sphere = loadAligned(spherePtr, offset);
+
+		Simd4f deltaX = positions[0] - splat<0>(sphere);
+		Simd4f deltaY = positions[1] - splat<1>(sphere);
+		Simd4f deltaZ = positions[2] - splat<2>(sphere);
+
+		Simd4f sqrDistance = gSimd4fEpsilon + deltaX * deltaX + deltaY * deltaY + deltaZ * deltaZ;
+		Simd4f negativeScale = gSimd4fOne - rsqrt(sqrDistance) * splat<3>(sphere);
+
+		Simd4f contactMask;
+		if(!anyGreater(gSimd4fZero, negativeScale, contactMask))
+			continue;
+
+		accum.subtract(deltaX, deltaY, deltaZ, negativeScale, contactMask);
+
+		if(frictionEnabled)
+		{
+			// load previous sphere pos
+			const float* __restrict prevSpherePtr = array(mPrevData.mSpheres->center);
+
+			Simd4f prevSphere = loadAligned(prevSpherePtr, offset);
+			Simd4f velocity = sphere - prevSphere;
+
+			accum.addVelocity(splat<0>(velocity), splat<1>(velocity), splat<2>(velocity), contactMask);
+		}
+	}
+}
+
+template <typename Simd4f>
+FORCE_INLINE typename cloth::SwCollision<Simd4f>::Simd4i
+cloth::SwCollision<Simd4f>::collideCones(const Simd4f* __restrict positions, ImpulseAccumulator& accum) const
+{
+	const float* __restrict centerPtr = array(mCurData.mCones->center);
+	const float* __restrict axisPtr = array(mCurData.mCones->axis);
+	const int32_t* __restrict auxiliaryPtr = reinterpret_cast<const int32_t*>(&mCurData.mCones->sqrCosine);
+
+	bool frictionEnabled = mClothData.mFrictionScale > 0.0f;
+
+	ShapeMask shapeMask = getShapeMask(positions);
+	Simd4i mask4 = horizontalOr(shapeMask.mCones);
+	uint32_t mask = uint32_t(array(mask4)[0]);
+	while(mask)
+	{
+		uint32_t test = mask - 1;
+		uint32_t coneIndex = findBitSet(mask & ~test);
+		uint32_t offset = coneIndex * sizeof(ConeData);
+		mask = mask & test;
+
+		Simd4i test4 = mask4 - gSimd4iOne;
+		Simd4f culled = simd4f(andNotIsZero(shapeMask.mCones, test4));
+		mask4 = mask4 & test4;
+
+		Simd4f center = loadAligned(centerPtr, offset);
+
+		Simd4f deltaX = positions[0] - splat<0>(center);
+		Simd4f deltaY = positions[1] - splat<1>(center);
+		Simd4f deltaZ = positions[2] - splat<2>(center);
+
+		Simd4f axis = loadAligned(axisPtr, offset);
+
+		Simd4f axisX = splat<0>(axis);
+		Simd4f axisY = splat<1>(axis);
+		Simd4f axisZ = splat<2>(axis);
+		Simd4f slope = splat<3>(axis);
+
+		Simd4f dot = deltaX * axisX + deltaY * axisY + deltaZ * axisZ;
+		Simd4f radius = dot * slope + splat<3>(center);
+
+		// set radius to zero if cone is culled
+		radius = max(radius, gSimd4fZero) & ~culled;
+
+		Simd4f sqrDistance = deltaX * deltaX + deltaY * deltaY + deltaZ * deltaZ - dot * dot;
+
+		Simd4i auxiliary = loadAligned(auxiliaryPtr, offset);
+		Simd4i bothMask = splat<3>(auxiliary);
+
+		Simd4f contactMask;
+		if(!anyGreater(radius * radius, sqrDistance, contactMask))
+		{
+			// cone only culled when spheres culled, ok to clear those too
+			shapeMask.mSpheres = shapeMask.mSpheres & ~bothMask;
+			continue;
+		}
+
+		// clamp to a small positive epsilon to avoid numerical error
+		// making sqrDistance negative when point lies on the cone axis
+		sqrDistance = max(sqrDistance, gSimd4fEpsilon);
+
+		Simd4f invDistance = rsqrt(sqrDistance);
+		Simd4f base = dot + slope * sqrDistance * invDistance;
+
+		// force left/rightMask to false if not inside cone
+		base = base & contactMask;
+
+		Simd4f halfLength = splat<1>(simd4f(auxiliary));
+		Simd4i leftMask = simd4i(base < -halfLength);
+		Simd4i rightMask = simd4i(base > halfLength);
+
+		// we use both mask because of the early out above.
+		Simd4i firstMask = splat<2>(auxiliary);
+		Simd4i secondMask = firstMask ^ bothMask;
+		shapeMask.mSpheres = shapeMask.mSpheres & ~(firstMask & ~leftMask);
+		shapeMask.mSpheres = shapeMask.mSpheres & ~(secondMask & ~rightMask);
+
+		deltaX = deltaX - base * axisX;
+		deltaY = deltaY - base * axisY;
+		deltaZ = deltaZ - base * axisZ;
+
+		Simd4f sqrCosine = splat<0>(simd4f(auxiliary));
+		Simd4f scale = radius * invDistance * sqrCosine - sqrCosine;
+
+		contactMask = contactMask & ~simd4f(leftMask | rightMask);
+
+		if(!anyTrue(contactMask))
+			continue;
+
+		accum.add(deltaX, deltaY, deltaZ, scale, contactMask);
+
+		if(frictionEnabled)
+		{
+			uint32_t s0 = mClothData.mCapsuleIndices[coneIndex].first;
+			uint32_t s1 = mClothData.mCapsuleIndices[coneIndex].second;
+
+			float* prevSpheres = reinterpret_cast<float*>(mPrevData.mSpheres);
+			float* curSpheres = reinterpret_cast<float*>(mCurData.mSpheres);
+
+			// todo: could pre-compute sphere velocities or it might be
+			// faster to compute cur/prev sphere positions directly
+			Simd4f s0p0 = loadAligned(prevSpheres, s0 * sizeof(SphereData));
+			Simd4f s0p1 = loadAligned(curSpheres, s0 * sizeof(SphereData));
+
+			Simd4f s1p0 = loadAligned(prevSpheres, s1 * sizeof(SphereData));
+			Simd4f s1p1 = loadAligned(curSpheres, s1 * sizeof(SphereData));
+
+			Simd4f v0 = s0p1 - s0p0;
+			Simd4f v1 = s1p1 - s1p0;
+			Simd4f vd = v1 - v0;
+
+			// dot is in the range -1 to 1, scale and bias to 0 to 1
+			dot = dot * gSimd4fHalf + gSimd4fHalf;
+
+			// interpolate velocity at contact points
+			Simd4f vx = splat<0>(v0) + dot * splat<0>(vd);
+			Simd4f vy = splat<1>(v0) + dot * splat<1>(vd);
+			Simd4f vz = splat<2>(v0) + dot * splat<2>(vd);
+
+			accum.addVelocity(vx, vy, vz, contactMask);
+		}
+	}
+
+	return shapeMask.mSpheres;
+}
+
+template <typename Simd4f>
+FORCE_INLINE void cloth::SwCollision<Simd4f>::collideSpheres(const Simd4i& sphereMask, const Simd4f* __restrict prevPos,
+                                                             Simd4f* __restrict curPos, ImpulseAccumulator& accum) const
+{
+	const float* __restrict prevSpheres = array(mPrevData.mSpheres->center);
+	const float* __restrict curSpheres = array(mCurData.mSpheres->center);
+
+	bool frictionEnabled = mClothData.mFrictionScale > 0.0f;
+
+	Simd4i mask4 = horizontalOr(sphereMask);
+	uint32_t mask = uint32_t(array(mask4)[0]);
+	while(mask)
+	{
+		uint32_t test = mask - 1;
+		uint32_t offset = findBitSet(mask & ~test) * sizeof(SphereData);
+		mask = mask & test;
+
+		Simd4f prevSphere = loadAligned(prevSpheres, offset);
+		Simd4f prevX = prevPos[0] - splat<0>(prevSphere);
+		Simd4f prevY = prevPos[1] - splat<1>(prevSphere);
+		Simd4f prevZ = prevPos[2] - splat<2>(prevSphere);
+		Simd4f prevRadius = splat<3>(prevSphere);
+
+		Simd4f curSphere = loadAligned(curSpheres, offset);
+		Simd4f curX = curPos[0] - splat<0>(curSphere);
+		Simd4f curY = curPos[1] - splat<1>(curSphere);
+		Simd4f curZ = curPos[2] - splat<2>(curSphere);
+		Simd4f curRadius = splat<3>(curSphere);
+
+		Simd4f sqrDistance = gSimd4fEpsilon + curX * curX + curY * curY + curZ * curZ;
+
+		Simd4f dotPrevPrev = prevX * prevX + prevY * prevY + prevZ * prevZ - prevRadius * prevRadius;
+		Simd4f dotPrevCur = prevX * curX + prevY * curY + prevZ * curZ - prevRadius * curRadius;
+		Simd4f dotCurCur = sqrDistance - curRadius * curRadius;
+
+		Simd4f discriminant = dotPrevCur * dotPrevCur - dotCurCur * dotPrevPrev;
+		Simd4f sqrtD = sqrt(discriminant);
+		Simd4f halfB = dotPrevCur - dotPrevPrev;
+		Simd4f minusA = dotPrevCur - dotCurCur + halfB;
+
+		// time of impact or 0 if prevPos inside sphere
+		Simd4f toi = recip(minusA) * min(gSimd4fZero, halfB + sqrtD);
+		Simd4f collisionMask = (toi < gSimd4fOne) & (halfB < sqrtD);
+
+		// skip continuous collision if the (un-clamped) particle
+		// trajectory only touches the outer skin of the cone.
+		Simd4f rMin = prevRadius + halfB * minusA * (curRadius - prevRadius);
+		collisionMask = collisionMask & (discriminant > minusA * rMin * rMin * sSkeletonWidth);
+
+		// a is negative when one sphere is contained in the other,
+		// which is already handled by discrete collision.
+		collisionMask = collisionMask & (minusA < -static_cast<Simd4f>(gSimd4fEpsilon));
+
+		if(!allEqual(collisionMask, gSimd4fZero))
+		{
+			Simd4f deltaX = prevX - curX;
+			Simd4f deltaY = prevY - curY;
+			Simd4f deltaZ = prevZ - curZ;
+
+			Simd4f oneMinusToi = (gSimd4fOne - toi) & collisionMask;
+
+			// reduce ccd impulse if (clamped) particle trajectory stays in sphere skin,
+			// i.e. scale by exp2(-k) or 1/(1+k) with k = (tmin - toi) / (1 - toi)
+			Simd4f minusK = sqrtD * recip(minusA * oneMinusToi) & (oneMinusToi > gSimd4fEpsilon);
+			oneMinusToi = oneMinusToi * recip(gSimd4fOne - minusK);
+
+			curX = curX + deltaX * oneMinusToi;
+			curY = curY + deltaY * oneMinusToi;
+			curZ = curZ + deltaZ * oneMinusToi;
+
+			curPos[0] = splat<0>(curSphere) + curX;
+			curPos[1] = splat<1>(curSphere) + curY;
+			curPos[2] = splat<2>(curSphere) + curZ;
+
+			sqrDistance = gSimd4fEpsilon + curX * curX + curY * curY + curZ * curZ;
+		}
+
+		Simd4f negativeScale = gSimd4fOne - rsqrt(sqrDistance) * curRadius;
+
+		Simd4f contactMask;
+		if(!anyGreater(gSimd4fZero, negativeScale, contactMask))
+			continue;
+
+		accum.subtract(curX, curY, curZ, negativeScale, contactMask);
+
+		if(frictionEnabled)
+		{
+			Simd4f velocity = curSphere - prevSphere;
+			accum.addVelocity(splat<0>(velocity), splat<1>(velocity), splat<2>(velocity), contactMask);
+		}
+	}
+}
+
+template <typename Simd4f>
+FORCE_INLINE typename cloth::SwCollision<Simd4f>::Simd4i
+cloth::SwCollision<Simd4f>::collideCones(const Simd4f* __restrict prevPos, Simd4f* __restrict curPos,
+                                         ImpulseAccumulator& accum) const
+{
+	const float* __restrict prevCenterPtr = array(mPrevData.mCones->center);
+	const float* __restrict prevAxisPtr = array(mPrevData.mCones->axis);
+	const int32_t* __restrict prevAuxiliaryPtr = reinterpret_cast<const int32_t*>(&mPrevData.mCones->sqrCosine);
+
+	const float* __restrict curCenterPtr = array(mCurData.mCones->center);
+	const float* __restrict curAxisPtr = array(mCurData.mCones->axis);
+	const int32_t* __restrict curAuxiliaryPtr = reinterpret_cast<const int32_t*>(&mCurData.mCones->sqrCosine);
+
+	bool frictionEnabled = mClothData.mFrictionScale > 0.0f;
+
+	ShapeMask shapeMask = getShapeMask(prevPos, curPos);
+	Simd4i mask4 = horizontalOr(shapeMask.mCones);
+	uint32_t mask = uint32_t(array(mask4)[0]);
+	while(mask)
+	{
+		uint32_t test = mask - 1;
+		uint32_t coneIndex = findBitSet(mask & ~test);
+		uint32_t offset = coneIndex * sizeof(ConeData);
+		mask = mask & test;
+
+		Simd4i test4 = mask4 - gSimd4iOne;
+		Simd4f culled = simd4f(andNotIsZero(shapeMask.mCones, test4));
+		mask4 = mask4 & test4;
+
+		Simd4f prevCenter = loadAligned(prevCenterPtr, offset);
+		Simd4f prevAxis = loadAligned(prevAxisPtr, offset);
+		Simd4f prevAxisX = splat<0>(prevAxis);
+		Simd4f prevAxisY = splat<1>(prevAxis);
+		Simd4f prevAxisZ = splat<2>(prevAxis);
+		Simd4f prevSlope = splat<3>(prevAxis);
+
+		Simd4f prevX = prevPos[0] - splat<0>(prevCenter);
+		Simd4f prevY = prevPos[1] - splat<1>(prevCenter);
+		Simd4f prevZ = prevPos[2] - splat<2>(prevCenter);
+		Simd4f prevT = prevY * prevAxisZ - prevZ * prevAxisY;
+		Simd4f prevU = prevZ * prevAxisX - prevX * prevAxisZ;
+		Simd4f prevV = prevX * prevAxisY - prevY * prevAxisX;
+		Simd4f prevDot = prevX * prevAxisX + prevY * prevAxisY + prevZ * prevAxisZ;
+		Simd4f prevRadius = prevDot * prevSlope + splat<3>(prevCenter);
+
+		Simd4f curCenter = loadAligned(curCenterPtr, offset);
+		Simd4f curAxis = loadAligned(curAxisPtr, offset);
+		Simd4f curAxisX = splat<0>(curAxis);
+		Simd4f curAxisY = splat<1>(curAxis);
+		Simd4f curAxisZ = splat<2>(curAxis);
+		Simd4f curSlope = splat<3>(curAxis);
+		Simd4i curAuxiliary = loadAligned(curAuxiliaryPtr, offset);
+
+		Simd4f curX = curPos[0] - splat<0>(curCenter);
+		Simd4f curY = curPos[1] - splat<1>(curCenter);
+		Simd4f curZ = curPos[2] - splat<2>(curCenter);
+		Simd4f curT = curY * curAxisZ - curZ * curAxisY;
+		Simd4f curU = curZ * curAxisX - curX * curAxisZ;
+		Simd4f curV = curX * curAxisY - curY * curAxisX;
+		Simd4f curDot = curX * curAxisX + curY * curAxisY + curZ * curAxisZ;
+		Simd4f curRadius = curDot * curSlope + splat<3>(curCenter);
+
+		Simd4f curSqrDistance = gSimd4fEpsilon + curT * curT + curU * curU + curV * curV;
+
+		// set radius to zero if cone is culled
+		prevRadius = max(prevRadius, gSimd4fZero) & ~culled;
+		curRadius = max(curRadius, gSimd4fZero) & ~culled;
+
+		Simd4f dotPrevPrev = prevT * prevT + prevU * prevU + prevV * prevV - prevRadius * prevRadius;
+		Simd4f dotPrevCur = prevT * curT + prevU * curU + prevV * curV - prevRadius * curRadius;
+		Simd4f dotCurCur = curSqrDistance - curRadius * curRadius;
+
+		Simd4f discriminant = dotPrevCur * dotPrevCur - dotCurCur * dotPrevPrev;
+		Simd4f sqrtD = sqrt(discriminant);
+		Simd4f halfB = dotPrevCur - dotPrevPrev;
+		Simd4f minusA = dotPrevCur - dotCurCur + halfB;
+
+		// time of impact or 0 if prevPos inside cone
+		Simd4f toi = recip(minusA) * min(gSimd4fZero, halfB + sqrtD);
+		Simd4f collisionMask = (toi < gSimd4fOne) & (halfB < sqrtD);
+
+		// skip continuous collision if the (un-clamped) particle
+		// trajectory only touches the outer skin of the cone.
+		Simd4f rMin = prevRadius + halfB * minusA * (curRadius - prevRadius);
+		collisionMask = collisionMask & (discriminant > minusA * rMin * rMin * sSkeletonWidth);
+
+		// a is negative when one cone is contained in the other,
+		// which is already handled by discrete collision.
+		collisionMask = collisionMask & (minusA < -static_cast<Simd4f>(gSimd4fEpsilon));
+
+		// test if any particle hits infinite cone (and 0<time of impact<1)
+		if(!allEqual(collisionMask, gSimd4fZero))
+		{
+			Simd4f deltaX = prevX - curX;
+			Simd4f deltaY = prevY - curY;
+			Simd4f deltaZ = prevZ - curZ;
+
+			// interpolate delta at toi
+			Simd4f posX = prevX - deltaX * toi;
+			Simd4f posY = prevY - deltaY * toi;
+			Simd4f posZ = prevZ - deltaZ * toi;
+
+			Simd4f curScaledAxis = curAxis * splat<1>(simd4f(curAuxiliary));
+			Simd4i prevAuxiliary = loadAligned(prevAuxiliaryPtr, offset);
+			Simd4f deltaScaledAxis = curScaledAxis - prevAxis * splat<1>(simd4f(prevAuxiliary));
+
+			Simd4f oneMinusToi = gSimd4fOne - toi;
+
+			// interpolate axis at toi
+			Simd4f axisX = splat<0>(curScaledAxis) - splat<0>(deltaScaledAxis) * oneMinusToi;
+			Simd4f axisY = splat<1>(curScaledAxis) - splat<1>(deltaScaledAxis) * oneMinusToi;
+			Simd4f axisZ = splat<2>(curScaledAxis) - splat<2>(deltaScaledAxis) * oneMinusToi;
+			Simd4f slope = (prevSlope * oneMinusToi + curSlope * toi);
+
+			Simd4f sqrHalfLength = axisX * axisX + axisY * axisY + axisZ * axisZ;
+			Simd4f invHalfLength = rsqrt(sqrHalfLength);
+			Simd4f dot = (posX * axisX + posY * axisY + posZ * axisZ) * invHalfLength;
+
+			Simd4f sqrDistance = posX * posX + posY * posY + posZ * posZ - dot * dot;
+			Simd4f invDistance = rsqrt(sqrDistance) & (sqrDistance > gSimd4fZero);
+
+			Simd4f base = dot + slope * sqrDistance * invDistance;
+			Simd4f scale = base * invHalfLength & collisionMask;
+
+			Simd4f cullMask = (abs(scale) < gSimd4fOne) & collisionMask;
+
+			// test if any impact position is in cone section
+			if(!allEqual(cullMask, gSimd4fZero))
+			{
+				deltaX = deltaX + splat<0>(deltaScaledAxis) * scale;
+				deltaY = deltaY + splat<1>(deltaScaledAxis) * scale;
+				deltaZ = deltaZ + splat<2>(deltaScaledAxis) * scale;
+
+				oneMinusToi = oneMinusToi & cullMask;
+
+				// reduce ccd impulse if (clamped) particle trajectory stays in cone skin,
+				// i.e. scale by exp2(-k) or 1/(1+k) with k = (tmin - toi) / (1 - toi)
+				// oneMinusToi = oneMinusToi * recip(gSimd4fOne - sqrtD * recip(minusA * oneMinusToi));
+				Simd4f minusK = sqrtD * recip(minusA * oneMinusToi) & (oneMinusToi > gSimd4fEpsilon);
+				oneMinusToi = oneMinusToi * recip(gSimd4fOne - minusK);
+
+				curX = curX + deltaX * oneMinusToi;
+				curY = curY + deltaY * oneMinusToi;
+				curZ = curZ + deltaZ * oneMinusToi;
+
+				curDot = curX * curAxisX + curY * curAxisY + curZ * curAxisZ;
+				curRadius = curDot * curSlope + splat<3>(curCenter);
+				curRadius = max(curRadius, gSimd4fZero) & ~culled;
+				curSqrDistance = curX * curX + curY * curY + curZ * curZ - curDot * curDot;
+
+				curPos[0] = splat<0>(curCenter) + curX;
+				curPos[1] = splat<1>(curCenter) + curY;
+				curPos[2] = splat<2>(curCenter) + curZ;
+			}
+		}
+
+		// curPos inside cone (discrete collision)
+		Simd4f contactMask;
+		int anyContact = anyGreater(curRadius * curRadius, curSqrDistance, contactMask);
+
+		Simd4i bothMask = splat<3>(curAuxiliary);
+
+		// instead of culling continuous collision for ~collisionMask, and discrete
+		// collision for ~contactMask, disable both if ~collisionMask & ~contactMask
+		Simd4i cullMask = bothMask & ~simd4i(collisionMask | contactMask);
+		shapeMask.mSpheres = shapeMask.mSpheres & ~cullMask;
+
+		if(!anyContact)
+			continue;
+
+		Simd4f invDistance = rsqrt(curSqrDistance) & (curSqrDistance > gSimd4fZero);
+		Simd4f base = curDot + curSlope * curSqrDistance * invDistance;
+
+		Simd4f halfLength = splat<1>(simd4f(curAuxiliary));
+		Simd4i leftMask = simd4i(base < -halfLength);
+		Simd4i rightMask = simd4i(base > halfLength);
+
+		// can only skip continuous sphere collision if post-ccd position
+		// is on code side *and* particle had cone-ccd collision.
+		Simd4i firstMask = splat<2>(curAuxiliary);
+		Simd4i secondMask = firstMask ^ bothMask;
+		cullMask = (firstMask & ~leftMask) | (secondMask & ~rightMask);
+		shapeMask.mSpheres = shapeMask.mSpheres & ~(cullMask & simd4i(collisionMask));
+
+		Simd4f deltaX = curX - base * curAxisX;
+		Simd4f deltaY = curY - base * curAxisY;
+		Simd4f deltaZ = curZ - base * curAxisZ;
+
+		Simd4f sqrCosine = splat<0>(simd4f(curAuxiliary));
+		Simd4f scale = curRadius * invDistance * sqrCosine - sqrCosine;
+
+		contactMask = contactMask & ~simd4f(leftMask | rightMask);
+
+		if(!anyTrue(contactMask))
+			continue;
+
+		accum.add(deltaX, deltaY, deltaZ, scale, contactMask);
+
+		if(frictionEnabled)
+		{
+			uint32_t s0 = mClothData.mCapsuleIndices[coneIndex].first;
+			uint32_t s1 = mClothData.mCapsuleIndices[coneIndex].second;
+
+			float* prevSpheres = reinterpret_cast<float*>(mPrevData.mSpheres);
+			float* curSpheres = reinterpret_cast<float*>(mCurData.mSpheres);
+
+			// todo: could pre-compute sphere velocities or it might be
+			// faster to compute cur/prev sphere positions directly
+			Simd4f s0p0 = loadAligned(prevSpheres, s0 * sizeof(SphereData));
+			Simd4f s0p1 = loadAligned(curSpheres, s0 * sizeof(SphereData));
+
+			Simd4f s1p0 = loadAligned(prevSpheres, s1 * sizeof(SphereData));
+			Simd4f s1p1 = loadAligned(curSpheres, s1 * sizeof(SphereData));
+
+			Simd4f v0 = s0p1 - s0p0;
+			Simd4f v1 = s1p1 - s1p0;
+			Simd4f vd = v1 - v0;
+
+			// dot is in the range -1 to 1, scale and bias to 0 to 1
+			curDot = curDot * gSimd4fHalf + gSimd4fHalf;
+
+			// interpolate velocity at contact points
+			Simd4f vx = splat<0>(v0) + curDot * splat<0>(vd);
+			Simd4f vy = splat<1>(v0) + curDot * splat<1>(vd);
+			Simd4f vz = splat<2>(v0) + curDot * splat<2>(vd);
+
+			accum.addVelocity(vx, vy, vz, contactMask);
+		}
+	}
+
+	return shapeMask.mSpheres;
+}
+
+namespace
+{
+
+template <typename Simd4f>
+PX_INLINE void calculateFrictionImpulse(const Simd4f& deltaX, const Simd4f& deltaY, const Simd4f& deltaZ,
+                                        const Simd4f& velX, const Simd4f& velY, const Simd4f& velZ,
+                                        const Simd4f* curPos, const Simd4f* prevPos, const Simd4f& scale,
+                                        const Simd4f& coefficient, const Simd4f& mask, Simd4f* impulse)
+{
+	// calculate collision normal
+	Simd4f deltaSq = deltaX * deltaX + deltaY * deltaY + deltaZ * deltaZ;
+
+	Simd4f rcpDelta = rsqrt(deltaSq + gSimd4fEpsilon);
+
+	Simd4f nx = deltaX * rcpDelta;
+	Simd4f ny = deltaY * rcpDelta;
+	Simd4f nz = deltaZ * rcpDelta;
+
+	// calculate relative velocity scaled by number of collisions
+	Simd4f rvx = curPos[0] - prevPos[0] - velX * scale;
+	Simd4f rvy = curPos[1] - prevPos[1] - velY * scale;
+	Simd4f rvz = curPos[2] - prevPos[2] - velZ * scale;
+
+	// calculate magnitude of relative normal velocity
+	Simd4f rvn = rvx * nx + rvy * ny + rvz * nz;
+
+	// calculate relative tangential velocity
+	Simd4f rvtx = rvx - rvn * nx;
+	Simd4f rvty = rvy - rvn * ny;
+	Simd4f rvtz = rvz - rvn * nz;
+
+	// calculate magnitude of vt
+	Simd4f rcpVt = rsqrt(rvtx * rvtx + rvty * rvty + rvtz * rvtz + gSimd4fEpsilon);
+
+	// magnitude of friction impulse (cannot be greater than -vt)
+	Simd4f j = max(-coefficient * deltaSq * rcpDelta * rcpVt, gSimd4fMinusOne) & mask;
+
+	impulse[0] = rvtx * j;
+	impulse[1] = rvty * j;
+	impulse[2] = rvtz * j;
+}
+
+} // anonymous namespace
+
+template <typename Simd4f>
+void cloth::SwCollision<Simd4f>::collideParticles()
+{
+	const bool massScalingEnabled = mClothData.mCollisionMassScale > 0.0f;
+	const Simd4f massScale = simd4f(mClothData.mCollisionMassScale);
+
+	const bool frictionEnabled = mClothData.mFrictionScale > 0.0f;
+	const Simd4f frictionScale = simd4f(mClothData.mFrictionScale);
+
+	Simd4f curPos[4];
+	Simd4f prevPos[4];
+
+	float* __restrict prevIt = mClothData.mPrevParticles;
+	float* __restrict pIt = mClothData.mCurParticles;
+	float* __restrict pEnd = pIt + mClothData.mNumParticles * 4;
+	for(; pIt < pEnd; pIt += 16, prevIt += 16)
+	{
+		curPos[0] = loadAligned(pIt, 0);
+		curPos[1] = loadAligned(pIt, 16);
+		curPos[2] = loadAligned(pIt, 32);
+		curPos[3] = loadAligned(pIt, 48);
+		transpose(curPos[0], curPos[1], curPos[2], curPos[3]);
+
+		ImpulseAccumulator accum;
+		Simd4i sphereMask = collideCones(curPos, accum);
+		collideSpheres(sphereMask, curPos, accum);
+
+		Simd4f mask;
+		if(!anyGreater(accum.mNumCollisions, gSimd4fEpsilon, mask))
+			continue;
+
+		Simd4f invNumCollisions = recip(accum.mNumCollisions);
+
+		if(frictionEnabled)
+		{
+			prevPos[0] = loadAligned(prevIt, 0);
+			prevPos[1] = loadAligned(prevIt, 16);
+			prevPos[2] = loadAligned(prevIt, 32);
+			prevPos[3] = loadAligned(prevIt, 48);
+			transpose(prevPos[0], prevPos[1], prevPos[2], prevPos[3]);
+
+			Simd4f frictionImpulse[3];
+			calculateFrictionImpulse(accum.mDeltaX, accum.mDeltaY, accum.mDeltaZ, accum.mVelX, accum.mVelY, accum.mVelZ,
+			                         curPos, prevPos, invNumCollisions, frictionScale, mask, frictionImpulse);
+
+			prevPos[0] = prevPos[0] - frictionImpulse[0];
+			prevPos[1] = prevPos[1] - frictionImpulse[1];
+			prevPos[2] = prevPos[2] - frictionImpulse[2];
+
+			transpose(prevPos[0], prevPos[1], prevPos[2], prevPos[3]);
+			storeAligned(prevIt, 0, prevPos[0]);
+			storeAligned(prevIt, 16, prevPos[1]);
+			storeAligned(prevIt, 32, prevPos[2]);
+			storeAligned(prevIt, 48, prevPos[3]);
+		}
+
+		if(massScalingEnabled)
+		{
+			// calculate the inverse mass scale based on the collision impulse magnitude
+			Simd4f dSq = invNumCollisions * invNumCollisions *
+			             (accum.mDeltaX * accum.mDeltaX + accum.mDeltaY * accum.mDeltaY + accum.mDeltaZ * accum.mDeltaZ);
+
+			Simd4f scale = recip(gSimd4fOne + massScale * dSq);
+
+			// scale invmass
+			curPos[3] = select(mask, curPos[3] * scale, curPos[3]);
+		}
+
+		curPos[0] = curPos[0] + accum.mDeltaX * invNumCollisions;
+		curPos[1] = curPos[1] + accum.mDeltaY * invNumCollisions;
+		curPos[2] = curPos[2] + accum.mDeltaZ * invNumCollisions;
+
+		transpose(curPos[0], curPos[1], curPos[2], curPos[3]);
+		storeAligned(pIt, 0, curPos[0]);
+		storeAligned(pIt, 16, curPos[1]);
+		storeAligned(pIt, 32, curPos[2]);
+		storeAligned(pIt, 48, curPos[3]);
+
+#if PX_PROFILE || PX_DEBUG
+		mNumCollisions += horizontalSum(accum.mNumCollisions);
+#endif
+	}
+}
+
+template <typename Simd4f>
+void cloth::SwCollision<Simd4f>::collideVirtualParticles()
+{
+	const bool massScalingEnabled = mClothData.mCollisionMassScale > 0.0f;
+	const Simd4f massScale = simd4f(mClothData.mCollisionMassScale);
+
+	const bool frictionEnabled = mClothData.mFrictionScale > 0.0f;
+	const Simd4f frictionScale = simd4f(mClothData.mFrictionScale);
+
+	Simd4f curPos[3];
+
+	const float* __restrict weights = mClothData.mVirtualParticleWeights;
+	float* __restrict particles = mClothData.mCurParticles;
+	float* __restrict prevParticles = mClothData.mPrevParticles;
+
+	// move dummy particles outside of collision range
+	Simd4f* __restrict dummy = mClothData.mNumParticles + reinterpret_cast<Simd4f*>(mClothData.mCurParticles);
+	Simd4f invGridScale = recip(mGridScale) & (mGridScale > gSimd4fEpsilon);
+	dummy[0] = dummy[1] = dummy[2] = invGridScale * mGridBias - invGridScale;
+
+	const uint16_t* __restrict vpIt = mClothData.mVirtualParticlesBegin;
+	const uint16_t* __restrict vpEnd = mClothData.mVirtualParticlesEnd;
+	for(; vpIt != vpEnd; vpIt += 16)
+	{
+		// load 12 particles and 4 weights
+		Simd4f p0v0 = loadAligned(particles, vpIt[0] * sizeof(PxVec4));
+		Simd4f p0v1 = loadAligned(particles, vpIt[1] * sizeof(PxVec4));
+		Simd4f p0v2 = loadAligned(particles, vpIt[2] * sizeof(PxVec4));
+		Simd4f w0 = loadAligned(weights, vpIt[3] * sizeof(PxVec4));
+
+		Simd4f p1v0 = loadAligned(particles, vpIt[4] * sizeof(PxVec4));
+		Simd4f p1v1 = loadAligned(particles, vpIt[5] * sizeof(PxVec4));
+		Simd4f p1v2 = loadAligned(particles, vpIt[6] * sizeof(PxVec4));
+		Simd4f w1 = loadAligned(weights, vpIt[7] * sizeof(PxVec4));
+
+		Simd4f p2v0 = loadAligned(particles, vpIt[8] * sizeof(PxVec4));
+		Simd4f p2v1 = loadAligned(particles, vpIt[9] * sizeof(PxVec4));
+		Simd4f p2v2 = loadAligned(particles, vpIt[10] * sizeof(PxVec4));
+		Simd4f w2 = loadAligned(weights, vpIt[11] * sizeof(PxVec4));
+
+		Simd4f p3v1 = loadAligned(particles, vpIt[13] * sizeof(PxVec4));
+		Simd4f p3v0 = loadAligned(particles, vpIt[12] * sizeof(PxVec4));
+		Simd4f p3v2 = loadAligned(particles, vpIt[14] * sizeof(PxVec4));
+		Simd4f w3 = loadAligned(weights, vpIt[15] * sizeof(PxVec4));
+
+		// interpolate particles and transpose
+		Simd4f px = p0v0 * splat<0>(w0) + p0v1 * splat<1>(w0) + p0v2 * splat<2>(w0);
+		Simd4f py = p1v0 * splat<0>(w1) + p1v1 * splat<1>(w1) + p1v2 * splat<2>(w1);
+		Simd4f pz = p2v0 * splat<0>(w2) + p2v1 * splat<1>(w2) + p2v2 * splat<2>(w2);
+		Simd4f pw = p3v0 * splat<0>(w3) + p3v1 * splat<1>(w3) + p3v2 * splat<2>(w3);
+		transpose(px, py, pz, pw);
+
+		curPos[0] = px;
+		curPos[1] = py;
+		curPos[2] = pz;
+
+		ImpulseAccumulator accum;
+		Simd4i sphereMask = collideCones(curPos, accum);
+		collideSpheres(sphereMask, curPos, accum);
+
+		Simd4f mask;
+		if(!anyGreater(accum.mNumCollisions, gSimd4fEpsilon, mask))
+			continue;
+
+		Simd4f invNumCollisions = recip(accum.mNumCollisions);
+
+		// displacement and transpose back
+		Simd4f d0 = accum.mDeltaX * invNumCollisions;
+		Simd4f d1 = accum.mDeltaY * invNumCollisions;
+		Simd4f d2 = accum.mDeltaZ * invNumCollisions;
+		Simd4f d3 = gSimd4fZero;
+		transpose(d0, d1, d2, d3);
+
+		// scale weights by 1/dot(w,w)
+		Simd4f rw0 = w0 * splat<3>(w0);
+		Simd4f rw1 = w1 * splat<3>(w1);
+		Simd4f rw2 = w2 * splat<3>(w2);
+		Simd4f rw3 = w3 * splat<3>(w3);
+
+		if(frictionEnabled)
+		{
+			Simd4f q0v0 = loadAligned(prevParticles, vpIt[0] * sizeof(PxVec4));
+			Simd4f q0v1 = loadAligned(prevParticles, vpIt[1] * sizeof(PxVec4));
+			Simd4f q0v2 = loadAligned(prevParticles, vpIt[2] * sizeof(PxVec4));
+
+			Simd4f q1v0 = loadAligned(prevParticles, vpIt[4] * sizeof(PxVec4));
+			Simd4f q1v1 = loadAligned(prevParticles, vpIt[5] * sizeof(PxVec4));
+			Simd4f q1v2 = loadAligned(prevParticles, vpIt[6] * sizeof(PxVec4));
+
+			Simd4f q2v0 = loadAligned(prevParticles, vpIt[8] * sizeof(PxVec4));
+			Simd4f q2v1 = loadAligned(prevParticles, vpIt[9] * sizeof(PxVec4));
+			Simd4f q2v2 = loadAligned(prevParticles, vpIt[10] * sizeof(PxVec4));
+
+			Simd4f q3v0 = loadAligned(prevParticles, vpIt[12] * sizeof(PxVec4));
+			Simd4f q3v1 = loadAligned(prevParticles, vpIt[13] * sizeof(PxVec4));
+			Simd4f q3v2 = loadAligned(prevParticles, vpIt[14] * sizeof(PxVec4));
+
+			// calculate previous interpolated positions
+			Simd4f qx = q0v0 * splat<0>(w0) + q0v1 * splat<1>(w0) + q0v2 * splat<2>(w0);
+			Simd4f qy = q1v0 * splat<0>(w1) + q1v1 * splat<1>(w1) + q1v2 * splat<2>(w1);
+			Simd4f qz = q2v0 * splat<0>(w2) + q2v1 * splat<1>(w2) + q2v2 * splat<2>(w2);
+			Simd4f qw = q3v0 * splat<0>(w3) + q3v1 * splat<1>(w3) + q3v2 * splat<2>(w3);
+			transpose(qx, qy, qz, qw);
+
+			Simd4f prevPos[3] = { qx, qy, qz };
+			Simd4f frictionImpulse[4];
+			frictionImpulse[3] = gSimd4fZero;
+
+			calculateFrictionImpulse(accum.mDeltaX, accum.mDeltaY, accum.mDeltaZ, accum.mVelX, accum.mVelY, accum.mVelZ,
+			                         curPos, prevPos, invNumCollisions, frictionScale, mask, frictionImpulse);
+
+			transpose(frictionImpulse[0], frictionImpulse[1], frictionImpulse[2], frictionImpulse[3]);
+
+			q0v0 = q0v0 - (splat<0>(rw0) * frictionImpulse[0]);
+			q0v1 = q0v1 - (splat<1>(rw0) * frictionImpulse[0]);
+			q0v2 = q0v2 - (splat<2>(rw0) * frictionImpulse[0]);
+
+			q1v0 = q1v0 - (splat<0>(rw1) * frictionImpulse[1]);
+			q1v1 = q1v1 - (splat<1>(rw1) * frictionImpulse[1]);
+			q1v2 = q1v2 - (splat<2>(rw1) * frictionImpulse[1]);
+
+			q2v0 = q2v0 - (splat<0>(rw2) * frictionImpulse[2]);
+			q2v1 = q2v1 - (splat<1>(rw2) * frictionImpulse[2]);
+			q2v2 = q2v2 - (splat<2>(rw2) * frictionImpulse[2]);
+
+			q3v0 = q3v0 - (splat<0>(rw3) * frictionImpulse[3]);
+			q3v1 = q3v1 - (splat<1>(rw3) * frictionImpulse[3]);
+			q3v2 = q3v2 - (splat<2>(rw3) * frictionImpulse[3]);
+
+			// write back prev particles
+			storeAligned(prevParticles, vpIt[0] * sizeof(PxVec4), q0v0);
+			storeAligned(prevParticles, vpIt[1] * sizeof(PxVec4), q0v1);
+			storeAligned(prevParticles, vpIt[2] * sizeof(PxVec4), q0v2);
+
+			storeAligned(prevParticles, vpIt[4] * sizeof(PxVec4), q1v0);
+			storeAligned(prevParticles, vpIt[5] * sizeof(PxVec4), q1v1);
+			storeAligned(prevParticles, vpIt[6] * sizeof(PxVec4), q1v2);
+
+			storeAligned(prevParticles, vpIt[8] * sizeof(PxVec4), q2v0);
+			storeAligned(prevParticles, vpIt[9] * sizeof(PxVec4), q2v1);
+			storeAligned(prevParticles, vpIt[10] * sizeof(PxVec4), q2v2);
+
+			storeAligned(prevParticles, vpIt[12] * sizeof(PxVec4), q3v0);
+			storeAligned(prevParticles, vpIt[13] * sizeof(PxVec4), q3v1);
+			storeAligned(prevParticles, vpIt[14] * sizeof(PxVec4), q3v2);
+		}
+
+		if(massScalingEnabled)
+		{
+			// calculate the inverse mass scale based on the collision impulse
+			Simd4f dSq = invNumCollisions * invNumCollisions *
+			             (accum.mDeltaX * accum.mDeltaX + accum.mDeltaY * accum.mDeltaY + accum.mDeltaZ * accum.mDeltaZ);
+
+			Simd4f weightScale = recip(gSimd4fOne + massScale * dSq);
+
+			weightScale = weightScale - gSimd4fOne;
+			Simd4f s0 = gSimd4fOne + splat<0>(weightScale) * (w0 & splat<0>(mask));
+			Simd4f s1 = gSimd4fOne + splat<1>(weightScale) * (w1 & splat<1>(mask));
+			Simd4f s2 = gSimd4fOne + splat<2>(weightScale) * (w2 & splat<2>(mask));
+			Simd4f s3 = gSimd4fOne + splat<3>(weightScale) * (w3 & splat<3>(mask));
+
+			p0v0 = p0v0 * (gSimd4fOneXYZ | (splat<0>(s0) & sMaskW));
+			p0v1 = p0v1 * (gSimd4fOneXYZ | (splat<1>(s0) & sMaskW));
+			p0v2 = p0v2 * (gSimd4fOneXYZ | (splat<2>(s0) & sMaskW));
+
+			p1v0 = p1v0 * (gSimd4fOneXYZ | (splat<0>(s1) & sMaskW));
+			p1v1 = p1v1 * (gSimd4fOneXYZ | (splat<1>(s1) & sMaskW));
+			p1v2 = p1v2 * (gSimd4fOneXYZ | (splat<2>(s1) & sMaskW));
+
+			p2v0 = p2v0 * (gSimd4fOneXYZ | (splat<0>(s2) & sMaskW));
+			p2v1 = p2v1 * (gSimd4fOneXYZ | (splat<1>(s2) & sMaskW));
+			p2v2 = p2v2 * (gSimd4fOneXYZ | (splat<2>(s2) & sMaskW));
+
+			p3v0 = p3v0 * (gSimd4fOneXYZ | (splat<0>(s3) & sMaskW));
+			p3v1 = p3v1 * (gSimd4fOneXYZ | (splat<1>(s3) & sMaskW));
+			p3v2 = p3v2 * (gSimd4fOneXYZ | (splat<2>(s3) & sMaskW));
+		}
+
+		p0v0 = p0v0 + (splat<0>(rw0) * d0);
+		p0v1 = p0v1 + (splat<1>(rw0) * d0);
+		p0v2 = p0v2 + (splat<2>(rw0) * d0);
+
+		p1v0 = p1v0 + (splat<0>(rw1) * d1);
+		p1v1 = p1v1 + (splat<1>(rw1) * d1);
+		p1v2 = p1v2 + (splat<2>(rw1) * d1);
+
+		p2v0 = p2v0 + (splat<0>(rw2) * d2);
+		p2v1 = p2v1 + (splat<1>(rw2) * d2);
+		p2v2 = p2v2 + (splat<2>(rw2) * d2);
+
+		p3v0 = p3v0 + (splat<0>(rw3) * d3);
+		p3v1 = p3v1 + (splat<1>(rw3) * d3);
+		p3v2 = p3v2 + (splat<2>(rw3) * d3);
+
+		// write back particles
+		storeAligned(particles, vpIt[0] * sizeof(PxVec4), p0v0);
+		storeAligned(particles, vpIt[1] * sizeof(PxVec4), p0v1);
+		storeAligned(particles, vpIt[2] * sizeof(PxVec4), p0v2);
+
+		storeAligned(particles, vpIt[4] * sizeof(PxVec4), p1v0);
+		storeAligned(particles, vpIt[5] * sizeof(PxVec4), p1v1);
+		storeAligned(particles, vpIt[6] * sizeof(PxVec4), p1v2);
+
+		storeAligned(particles, vpIt[8] * sizeof(PxVec4), p2v0);
+		storeAligned(particles, vpIt[9] * sizeof(PxVec4), p2v1);
+		storeAligned(particles, vpIt[10] * sizeof(PxVec4), p2v2);
+
+		storeAligned(particles, vpIt[12] * sizeof(PxVec4), p3v0);
+		storeAligned(particles, vpIt[13] * sizeof(PxVec4), p3v1);
+		storeAligned(particles, vpIt[14] * sizeof(PxVec4), p3v2);
+
+#if PX_PROFILE || PX_DEBUG
+		mNumCollisions += horizontalSum(accum.mNumCollisions);
+#endif
+	}
+}
+
+template <typename Simd4f>
+void cloth::SwCollision<Simd4f>::collideContinuousParticles()
+{
+	Simd4f curPos[4];
+	Simd4f prevPos[4];
+
+	const bool massScalingEnabled = mClothData.mCollisionMassScale > 0.0f;
+	const Simd4f massScale = simd4f(mClothData.mCollisionMassScale);
+
+	const bool frictionEnabled = mClothData.mFrictionScale > 0.0f;
+	const Simd4f frictionScale = simd4f(mClothData.mFrictionScale);
+
+	float* __restrict prevIt = mClothData.mPrevParticles;
+	float* __restrict curIt = mClothData.mCurParticles;
+	float* __restrict curEnd = curIt + mClothData.mNumParticles * 4;
+
+	for(; curIt < curEnd; curIt += 16, prevIt += 16)
+	{
+		prevPos[0] = loadAligned(prevIt, 0);
+		prevPos[1] = loadAligned(prevIt, 16);
+		prevPos[2] = loadAligned(prevIt, 32);
+		prevPos[3] = loadAligned(prevIt, 48);
+		transpose(prevPos[0], prevPos[1], prevPos[2], prevPos[3]);
+
+		curPos[0] = loadAligned(curIt, 0);
+		curPos[1] = loadAligned(curIt, 16);
+		curPos[2] = loadAligned(curIt, 32);
+		curPos[3] = loadAligned(curIt, 48);
+		transpose(curPos[0], curPos[1], curPos[2], curPos[3]);
+
+		ImpulseAccumulator accum;
+		Simd4i sphereMask = collideCones(prevPos, curPos, accum);
+		collideSpheres(sphereMask, prevPos, curPos, accum);
+
+		Simd4f mask;
+		if(!anyGreater(accum.mNumCollisions, gSimd4fEpsilon, mask))
+			continue;
+
+		Simd4f invNumCollisions = recip(accum.mNumCollisions);
+
+		if(frictionEnabled)
+		{
+			Simd4f frictionImpulse[3];
+			calculateFrictionImpulse(accum.mDeltaX, accum.mDeltaY, accum.mDeltaZ, accum.mVelX, accum.mVelY, accum.mVelZ,
+			                         curPos, prevPos, invNumCollisions, frictionScale, mask, frictionImpulse);
+
+			prevPos[0] = prevPos[0] - frictionImpulse[0];
+			prevPos[1] = prevPos[1] - frictionImpulse[1];
+			prevPos[2] = prevPos[2] - frictionImpulse[2];
+
+			transpose(prevPos[0], prevPos[1], prevPos[2], prevPos[3]);
+			storeAligned(prevIt, 0, prevPos[0]);
+			storeAligned(prevIt, 16, prevPos[1]);
+			storeAligned(prevIt, 32, prevPos[2]);
+			storeAligned(prevIt, 48, prevPos[3]);
+		}
+
+		if(massScalingEnabled)
+		{
+			// calculate the inverse mass scale based on the collision impulse magnitude
+			Simd4f dSq = invNumCollisions * invNumCollisions *
+			             (accum.mDeltaX * accum.mDeltaX + accum.mDeltaY * accum.mDeltaY + accum.mDeltaZ * accum.mDeltaZ);
+
+			Simd4f weightScale = recip(gSimd4fOne + massScale * dSq);
+
+			// scale invmass
+			curPos[3] = select(mask, curPos[3] * weightScale, curPos[3]);
+		}
+
+		curPos[0] = curPos[0] + accum.mDeltaX * invNumCollisions;
+		curPos[1] = curPos[1] + accum.mDeltaY * invNumCollisions;
+		curPos[2] = curPos[2] + accum.mDeltaZ * invNumCollisions;
+
+		transpose(curPos[0], curPos[1], curPos[2], curPos[3]);
+		storeAligned(curIt, 0, curPos[0]);
+		storeAligned(curIt, 16, curPos[1]);
+		storeAligned(curIt, 32, curPos[2]);
+		storeAligned(curIt, 48, curPos[3]);
+
+#if PX_PROFILE || PX_DEBUG
+		mNumCollisions += horizontalSum(accum.mNumCollisions);
+#endif
+	}
+}
+
+template <typename Simd4f>
+void cloth::SwCollision<Simd4f>::collideConvexes(const IterationState<Simd4f>& state)
+{
+	if(!mClothData.mNumConvexes)
+		return;
+
+	// times 2 for plane equation result buffer
+	Simd4f* planes = static_cast<Simd4f*>(mAllocator.allocate(sizeof(Simd4f) * mClothData.mNumPlanes * 2));
+
+	const Simd4f* targetPlanes = reinterpret_cast<const Simd4f*>(mClothData.mTargetCollisionPlanes);
+
+	// generate plane collision data
+	if(state.mRemainingIterations != 1)
+	{
+		// interpolate planes
+		LerpIterator<Simd4f, const Simd4f*> planeIter(reinterpret_cast<const Simd4f*>(mClothData.mStartCollisionPlanes),
+		                                              targetPlanes, state.getCurrentAlpha());
+
+		// todo: normalize plane equations
+		generatePlanes(planes, planeIter, mClothData.mNumPlanes);
+	}
+	else
+	{
+		// otherwise use the target planes directly
+		generatePlanes(planes, targetPlanes, mClothData.mNumPlanes);
+	}
+
+	Simd4f curPos[4], prevPos[4];
+
+	const bool frictionEnabled = mClothData.mFrictionScale > 0.0f;
+	const Simd4f frictionScale = simd4f(mClothData.mFrictionScale);
+
+	float* __restrict curIt = mClothData.mCurParticles;
+	float* __restrict curEnd = curIt + mClothData.mNumParticles * 4;
+	float* __restrict prevIt = mClothData.mPrevParticles;
+	for(; curIt < curEnd; curIt += 16, prevIt += 16)
+	{
+		curPos[0] = loadAligned(curIt, 0);
+		curPos[1] = loadAligned(curIt, 16);
+		curPos[2] = loadAligned(curIt, 32);
+		curPos[3] = loadAligned(curIt, 48);
+		transpose(curPos[0], curPos[1], curPos[2], curPos[3]);
+
+		ImpulseAccumulator accum;
+		collideConvexes(planes, curPos, accum);
+
+		Simd4f mask;
+		if(!anyGreater(accum.mNumCollisions, gSimd4fEpsilon, mask))
+			continue;
+
+		Simd4f invNumCollisions = recip(accum.mNumCollisions);
+
+		if(frictionEnabled)
+		{
+			prevPos[0] = loadAligned(prevIt, 0);
+			prevPos[1] = loadAligned(prevIt, 16);
+			prevPos[2] = loadAligned(prevIt, 32);
+			prevPos[3] = loadAligned(prevIt, 48);
+			transpose(prevPos[0], prevPos[1], prevPos[2], prevPos[3]);
+
+			Simd4f frictionImpulse[3];
+			calculateFrictionImpulse(accum.mDeltaX, accum.mDeltaY, accum.mDeltaZ, accum.mVelX, accum.mVelY, accum.mVelZ,
+			                         curPos, prevPos, invNumCollisions, frictionScale, mask, frictionImpulse);
+
+			prevPos[0] = prevPos[0] - frictionImpulse[0];
+			prevPos[1] = prevPos[1] - frictionImpulse[1];
+			prevPos[2] = prevPos[2] - frictionImpulse[2];
+
+			transpose(prevPos[0], prevPos[1], prevPos[2], prevPos[3]);
+			storeAligned(prevIt, 0, prevPos[0]);
+			storeAligned(prevIt, 16, prevPos[1]);
+			storeAligned(prevIt, 32, prevPos[2]);
+			storeAligned(prevIt, 48, prevPos[3]);
+		}
+
+		curPos[0] = curPos[0] + accum.mDeltaX * invNumCollisions;
+		curPos[1] = curPos[1] + accum.mDeltaY * invNumCollisions;
+		curPos[2] = curPos[2] + accum.mDeltaZ * invNumCollisions;
+
+		transpose(curPos[0], curPos[1], curPos[2], curPos[3]);
+		storeAligned(curIt, 0, curPos[0]);
+		storeAligned(curIt, 16, curPos[1]);
+		storeAligned(curIt, 32, curPos[2]);
+		storeAligned(curIt, 48, curPos[3]);
+
+#if PX_PROFILE || PX_DEBUG
+		mNumCollisions += horizontalSum(accum.mNumCollisions);
+#endif
+	}
+
+	mAllocator.deallocate(planes);
+}
+
+template <typename Simd4f>
+void cloth::SwCollision<Simd4f>::collideConvexes(const Simd4f* __restrict planes, Simd4f* __restrict curPos,
+                                                 ImpulseAccumulator& accum)
+{
+	Simd4i result = gSimd4iZero;
+	Simd4i mask4 = gSimd4iOne;
+
+	const Simd4f* __restrict pIt, *pEnd = planes + mClothData.mNumPlanes;
+	Simd4f* __restrict dIt = const_cast<Simd4f*>(pEnd);
+	for(pIt = planes; pIt != pEnd; ++pIt, ++dIt)
+	{
+		*dIt = splat<3>(*pIt) + curPos[2] * splat<2>(*pIt) + curPos[1] * splat<1>(*pIt) + curPos[0] * splat<0>(*pIt);
+		result = result | (mask4 & simd4i(*dIt < gSimd4fZero));
+		mask4 = mask4 << 1; // todo: shift by Simd4i on consoles
+	}
+
+	if(allEqual(result, gSimd4iZero))
+		return;
+
+	const uint32_t* __restrict cIt = mClothData.mConvexMasks;
+	const uint32_t* __restrict cEnd = cIt + mClothData.mNumConvexes;
+	for(; cIt != cEnd; ++cIt)
+	{
+		uint32_t mask = *cIt;
+		mask4 = simd4i(int(mask));
+		if(!anyEqual(mask4 & result, mask4, mask4))
+			continue;
+
+		uint32_t test = mask - 1;
+		uint32_t planeIndex = findBitSet(mask & ~test);
+		Simd4f plane = planes[planeIndex];
+		Simd4f planeX = splat<0>(plane);
+		Simd4f planeY = splat<1>(plane);
+		Simd4f planeZ = splat<2>(plane);
+		Simd4f planeD = pEnd[planeIndex];
+		while(mask &= test)
+		{
+			test = mask - 1;
+			planeIndex = findBitSet(mask & ~test);
+			plane = planes[planeIndex];
+			Simd4f dist = pEnd[planeIndex];
+			Simd4f closer = dist > planeD;
+			planeX = select(closer, splat<0>(plane), planeX);
+			planeY = select(closer, splat<1>(plane), planeY);
+			planeZ = select(closer, splat<2>(plane), planeZ);
+			planeD = max(dist, planeD);
+		}
+
+		accum.subtract(planeX, planeY, planeZ, planeD, simd4f(mask4));
+	}
+}
+
+template <typename Simd4f>
+void cloth::SwCollision<Simd4f>::collideTriangles(const IterationState<Simd4f>& state)
+{
+	if(!mClothData.mNumCollisionTriangles)
+		return;
+
+	TriangleData* triangles =
+	    static_cast<TriangleData*>(mAllocator.allocate(sizeof(TriangleData) * mClothData.mNumCollisionTriangles));
+
+	UnalignedIterator<Simd4f, 3> targetTriangles(mClothData.mTargetCollisionTriangles);
+
+	// generate triangle collision data
+	if(state.mRemainingIterations != 1)
+	{
+		// interpolate triangles
+		LerpIterator<Simd4f, UnalignedIterator<Simd4f, 3> > triangleIter(mClothData.mStartCollisionTriangles,
+		                                                                 targetTriangles, state.getCurrentAlpha());
+
+		generateTriangles<Simd4f>(triangles, triangleIter, mClothData.mNumCollisionTriangles);
+	}
+	else
+	{
+		// otherwise use the target triangles directly
+		generateTriangles<Simd4f>(triangles, targetTriangles, mClothData.mNumCollisionTriangles);
+	}
+
+	Simd4f positions[4];
+
+	float* __restrict pIt = mClothData.mCurParticles;
+	float* __restrict pEnd = pIt + mClothData.mNumParticles * 4;
+	for(; pIt < pEnd; pIt += 16)
+	{
+		positions[0] = loadAligned(pIt, 0);
+		positions[1] = loadAligned(pIt, 16);
+		positions[2] = loadAligned(pIt, 32);
+		positions[3] = loadAligned(pIt, 48);
+		transpose(positions[0], positions[1], positions[2], positions[3]);
+
+		ImpulseAccumulator accum;
+		collideTriangles(triangles, positions, accum);
+
+		Simd4f mask;
+		if(!anyGreater(accum.mNumCollisions, gSimd4fEpsilon, mask))
+			continue;
+
+		Simd4f invNumCollisions = recip(accum.mNumCollisions);
+
+		positions[0] = positions[0] + accum.mDeltaX * invNumCollisions;
+		positions[1] = positions[1] + accum.mDeltaY * invNumCollisions;
+		positions[2] = positions[2] + accum.mDeltaZ * invNumCollisions;
+
+		transpose(positions[0], positions[1], positions[2], positions[3]);
+		storeAligned(pIt, 0, positions[0]);
+		storeAligned(pIt, 16, positions[1]);
+		storeAligned(pIt, 32, positions[2]);
+		storeAligned(pIt, 48, positions[3]);
+
+#if PX_PROFILE || PX_DEBUG
+		mNumCollisions += horizontalSum(accum.mNumCollisions);
+#endif
+	}
+
+	mAllocator.deallocate(triangles);
+}
+
+template <typename Simd4f>
+void cloth::SwCollision<Simd4f>::collideTriangles(const TriangleData* __restrict triangles, Simd4f* __restrict curPos,
+                                                  ImpulseAccumulator& accum)
+{
+	Simd4f normalX, normalY, normalZ, normalD;
+	normalX = normalY = normalZ = normalD = gSimd4fZero;
+	Simd4f minSqrLength = gSimd4fFloatMax;
+
+	const TriangleData* __restrict tIt, *tEnd = triangles + mClothData.mNumCollisionTriangles;
+	for(tIt = triangles; tIt != tEnd; ++tIt)
+	{
+		Simd4f base = loadAligned(&tIt->base.x);
+		Simd4f edge0 = loadAligned(&tIt->edge0.x);
+		Simd4f edge1 = loadAligned(&tIt->edge1.x);
+		Simd4f normal = loadAligned(&tIt->normal.x);
+		Simd4f aux = loadAligned(&tIt->det);
+
+		Simd4f dx = curPos[0] - splat<0>(base);
+		Simd4f dy = curPos[1] - splat<1>(base);
+		Simd4f dz = curPos[2] - splat<2>(base);
+
+		Simd4f e0x = splat<0>(edge0);
+		Simd4f e0y = splat<1>(edge0);
+		Simd4f e0z = splat<2>(edge0);
+
+		Simd4f e1x = splat<0>(edge1);
+		Simd4f e1y = splat<1>(edge1);
+		Simd4f e1z = splat<2>(edge1);
+
+		Simd4f nx = splat<0>(normal);
+		Simd4f ny = splat<1>(normal);
+		Simd4f nz = splat<2>(normal);
+
+		Simd4f deltaDotEdge0 = dx * e0x + dy * e0y + dz * e0z;
+		Simd4f deltaDotEdge1 = dx * e1x + dy * e1y + dz * e1z;
+		Simd4f deltaDotNormal = dx * nx + dy * ny + dz * nz;
+
+		Simd4f edge0DotEdge1 = splat<3>(base);
+		Simd4f edge0SqrLength = splat<3>(edge0);
+		Simd4f edge1SqrLength = splat<3>(edge1);
+
+		Simd4f s = edge1SqrLength * deltaDotEdge0 - edge0DotEdge1 * deltaDotEdge1;
+		Simd4f t = edge0SqrLength * deltaDotEdge1 - edge0DotEdge1 * deltaDotEdge0;
+
+		Simd4f sPositive = s > gSimd4fZero;
+		Simd4f tPositive = t > gSimd4fZero;
+
+		Simd4f det = splat<0>(aux);
+
+		s = select(tPositive, s * det, deltaDotEdge0 * splat<2>(aux));
+		t = select(sPositive, t * det, deltaDotEdge1 * splat<3>(aux));
+
+		Simd4f clamp = gSimd4fOne < s + t;
+		Simd4f numerator = edge1SqrLength - edge0DotEdge1 + deltaDotEdge0 - deltaDotEdge1;
+
+		s = select(clamp, numerator * splat<1>(aux), s);
+
+		s = max(gSimd4fZero, min(gSimd4fOne, s));
+		t = max(gSimd4fZero, min(gSimd4fOne - s, t));
+
+		dx = dx - e0x * s - e1x * t;
+		dy = dy - e0y * s - e1y * t;
+		dz = dz - e0z * s - e1z * t;
+
+		Simd4f sqrLength = dx * dx + dy * dy + dz * dz;
+
+		// slightly increase distance for colliding triangles
+		Simd4f slack = (gSimd4fZero > deltaDotNormal) & simd4f(1e-4f);
+		sqrLength = sqrLength + sqrLength * slack;
+
+		Simd4f mask = sqrLength < minSqrLength;
+
+		normalX = select(mask, nx, normalX);
+		normalY = select(mask, ny, normalY);
+		normalZ = select(mask, nz, normalZ);
+		normalD = select(mask, deltaDotNormal, normalD);
+
+		minSqrLength = min(sqrLength, minSqrLength);
+	}
+
+	Simd4f mask;
+	if(!anyGreater(gSimd4fZero, normalD, mask))
+		return;
+
+	accum.subtract(normalX, normalY, normalZ, normalD, mask);
+}
+
+// explicit template instantiation
+#if NV_SIMD_SIMD
+template class cloth::SwCollision<Simd4f>;
+#endif
+#if NV_SIMD_SCALAR
+template class cloth::SwCollision<Scalar4f>;
+#endif
+
+/*
+namespace
+{
+    using namespace cloth;
+
+    int test()
+    {
+        Simd4f vertices[] = {
+            simd4f(0.0f, 0.0f, 0.0f, 0.0f),
+            simd4f(0.1f, 0.0f, 0.0f, 0.0f),
+            simd4f(0.0f, 0.1f, 0.0f, 0.0f)
+        };
+        TriangleData triangle;
+        generateTriangles<Simd4f>(&triangle, &*vertices, 1);
+
+        char buffer[1000];
+        SwKernelAllocator alloc(buffer, 1000);
+
+        SwClothData* cloth = static_cast<SwClothData*>(malloc(sizeof(SwClothData)));
+        memset(cloth, 0, sizeof(SwClothData));
+        cloth->mNumTriangles = 1;
+
+        SwCollision<Simd4f> collision(*cloth, alloc);
+        SwCollision<Simd4f>::ImpulseAccumulator accum;
+
+        Simd4f particles[4] = {};
+        for(float y=-0.1f; y < 0.0f; y += 0.2f)
+        {
+            for(float x=-0.1f; x < 0.0f; x += 0.2f)
+            {
+                particles[0] = simd4f(x);
+                particles[1] = simd4f(y);
+                particles[2] = simd4f(-1.0f);
+
+                collision.collideTriangles(&triangle, particles, accum);
+            }
+        }
+
+        return 0;
+    }
+
+    static int blah = test();
+}
+*/
-- 
cgit v1.2.3