Initial commit:

PhysX 3.4.0 Update @ 21294896 APEX 1.4.0 Update @ 21275617 [CL 21300167]
author: git perforce import user <a@b> 2016-10-25 12:29:14 -0600
committer: Sheikh Dawood Abdul Ajees <Sheikh Dawood Abdul Ajees> 2016-10-25 18:56:37 -0500
commit: 3dfe2108cfab31ba3ee5527e217d0d8e99a51162 (patch)
tree: fa6485c169e50d7415a651bf838f5bcd0fd3bfbd /PhysX_3.4/Source/LowLevelDynamics/src
download: physx-3.4-3dfe2108cfab31ba3ee5527e217d0d8e99a51162.tar.xz
physx-3.4-3dfe2108cfab31ba3ee5527e217d0d8e99a51162.zip
61 files changed, 24858 insertions, 0 deletions
diff --git a/PhysX_3.4/Source/LowLevelDynamics/src/DyArticulation.cpp b/PhysX_3.4/Source/LowLevelDynamics/src/DyArticulation.cpp
new file mode 100644
index 00000000..347aecb8
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelDynamics/src/DyArticulation.cpp
@@ -0,0 +1,241 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+#include "PsMathUtils.h"
+#include "CmConeLimitHelper.h"
+#include "DySolverConstraint1D.h"
+#include "DyArticulation.h"
+#include "DyArticulationHelper.h"
+#include "PxsRigidBody.h"
+#include "PxcConstraintBlockStream.h"
+#include "DyArticulationContactPrep.h"
+#include "DyDynamics.h"
+#include "DyArticulationReference.h"
+#include "DyArticulationPImpl.h"
+#include <stdio.h>
+
+using namespace physx;
+
+// we encode articulation link handles in the lower bits of the pointer, so the
+// articulation has to be aligned, which in an aligned pool means we need to size it
+// appropriately
+
+namespace physx
+{
+	namespace Dy
+	{
+		// Registration hooks that are only declared here (their definitions are not in
+		// this file); PxvRegisterArticulations() calls both of them.
+		void SolverCoreRegisterArticulationFns();
+
+		void SolverCoreRegisterArticulationFnsCoulomb();
+
+
+// Build-time check that none of the bits selected by (DY_ARTICULATION_MAX_SIZE-1) are set in
+// sizeof(Articulation), i.e. the object size is a whole multiple of DY_ARTICULATION_MAX_SIZE
+// (this reading assumes DY_ARTICULATION_MAX_SIZE is a power of two -- TODO confirm).
+PX_COMPILE_TIME_ASSERT((sizeof(Articulation)&(DY_ARTICULATION_MAX_SIZE-1))==0);
+
+// Records the given Sc::ArticulationSim pointer and starts without a solver descriptor.
+// Asserts that the address of the new object has none of the bits in
+// (DY_ARTICULATION_MAX_SIZE-1) set.
+Articulation::Articulation(Sc::ArticulationSim* sim)
+	: mSolverDesc(NULL)
+	, mArticulationSim(sim)
+{
+	PX_ASSERT(0 == (reinterpret_cast<size_t>(this) & (DY_ARTICULATION_MAX_SIZE-1)));
+}
+
+// Empty destructor body: no cleanup is performed here.
+Articulation::~Articulation()
+{
+}
+
+
+/* computes the implicit impulse and the drive scale at the joint, in joint coords */
+
+// Returns the bits of 'handle' selected by DY_ARTICULATION_IDMASK, converted to PxU32.
+PxU32 Articulation::getLinkIndex(ArticulationLinkHandle handle) const
+{
+	return static_cast<PxU32>(handle & DY_ARTICULATION_IDMASK);
+}
+
+#if DY_DEBUG_ARTICULATION
+
+// Debug-only helper (compiled under DY_DEBUG_ARTICULATION).
+// For every link except link 0, takes the link velocity and its parent's velocity, passes each
+// through Fns::translateMotion with the offset between the body pose and the joint frame
+// position (cB2w.p), and subtracts the two. The squared linear parts are summed and printed
+// as "Error", the squared angular parts as "Energy".
+//
+// v               - one spatial velocity per link
+// jointTransforms - one joint transform record per link
+// (third argument) - ignored; the result is always printed
+void Articulation::computeResiduals(const Cm::SpatialVector* v,
+									const ArticulationJointTransforms* jointTransforms,
+									bool /*dump*/) const
+{
+	typedef ArticulationFnsScalar Fns;
+
+	PxReal linearSqSum = 0;
+	PxReal angularSqSum = 0;
+
+	const PxU32 numLinks = mSolverDesc->linkCount;
+	for(PxU32 link = 1; link < numLinks; ++link)
+	{
+		const PxU32 parentLink = mSolverDesc->links[link].parent;
+		const ArticulationJointCore& joint = *mSolverDesc->links[link].inboundJoint;
+		PX_UNUSED(joint);
+
+		const ArticulationJointTransforms& xf = jointTransforms[link];
+
+		// child motion minus parent motion, both expressed at the joint frame position
+		const Cm::SpatialVector residual =
+			Fns::translateMotion(mSolverDesc->poses[link].p - xf.cB2w.p, v[link]) -
+			Fns::translateMotion(mSolverDesc->poses[parentLink].p - xf.cB2w.p, v[parentLink]);
+
+		linearSqSum += residual.linear.magnitudeSquared();
+		angularSqSum += residual.angular.magnitudeSquared();
+	}
+
+	// printed unconditionally (the 'dump' switch is disabled)
+	printf("Energy %f, Error %f\n", angularSqSum, linearSqSum);
+}
+
+
+// Debug-only helper (compiled under DY_DEBUG_ARTICULATION).
+// Sums over all links: Fns::multiply(inertia[i], velocity[i]) passed through
+// Fns::translateForce with the offset of link i's pose position from link 0's pose position.
+// Velocities are read from the solver descriptor's fsData via getVelocity().
+Cm::SpatialVector Articulation::computeMomentum(const FsInertia* inertia) const
+{
+	typedef ArticulationFnsScalar Fns;
+
+	Cm::SpatialVector* const linkVelocity = reinterpret_cast<Cm::SpatialVector*>(getVelocity(*mSolverDesc->fsData));
+
+	Cm::SpatialVector total = Cm::SpatialVector::zero();
+	for(PxU32 link = 0; link < mSolverDesc->linkCount; ++link)
+	{
+		total += Fns::translateForce(mSolverDesc->poses[link].p - mSolverDesc->poses[0].p,
+									 Fns::multiply(inertia[link], linkVelocity[link]));
+	}
+	return total;
+}
+
+
+
+// Debug-only helper (compiled under DY_DEBUG_ARTICULATION).
+// For each link except link 0, computes the child joint frame relative to the parent joint
+// frame and, if the inbound joint is swing limited, asks a Cm::ConeLimitHelper whether the
+// swing part reports a limit; every reported limit is printed. Twist limits are not
+// checked (that code is commented out). A blank line is printed at the end.
+void Articulation::checkLimits() const
+{
+	const PxU32 numLinks = mSolverDesc->linkCount;
+	for(PxU32 i = 1; i < numLinks; ++i)
+	{
+		const ArticulationJointCore& j = *mSolverDesc->links[i].inboundJoint;
+		const PxU32 parent = mSolverDesc->links[i].parent;
+
+		// joint frames of parent (A) and child (B) in world space
+		const PxTransform cA2w = mSolverDesc->poses[parent].transform(j.parentPose);
+		const PxTransform cB2w = mSolverDesc->poses[i].transform(j.childPose);
+
+		PxTransform cB2cA = cA2w.transformInv(cB2w);
+
+		// the relative quat must be the short way round for limits to work...
+		if(cB2cA.q.w < 0)
+			cB2cA.q = -cB2cA.q;
+
+		// swing/twist are only filled in when at least one of the two limits is enabled
+		PxQuat swing, twist;
+		const bool anyLimit = j.twistLimited || j.swingLimited;
+		if(anyLimit)
+			Ps::separateSwingTwist(cB2cA.q, swing, twist);
+
+		if(j.swingLimited)
+		{
+			const PxReal swingLimitContactDistance = PxMin(j.swingYLimit, j.swingZLimit)/4;
+
+			Cm::ConeLimitHelper eh(PxTan(j.swingYLimit/4),
+								   PxTan(j.swingZLimit/4),
+								   PxTan(swingLimitContactDistance/4));
+
+			PxVec3 axis;
+			PxReal error = 0.0f;
+			const bool limitReported = eh.getLimit(swing, axis, error);
+			if(limitReported)
+				printf("%u, (%f, %f), %f, (%f, %f, %f), %f\n", i, j.swingYLimit, j.swingZLimit, swingLimitContactDistance, axis.x, axis.y, axis.z, error);
+		}
+
+//		if(j.twistLimited)
+//		{
+//			PxReal tqTwistHigh = PxTan(j.twistLimitHigh/4),
+//				   tqTwistLow  = PxTan(j.twistLimitLow/4),
+//				   twistPad = (tqTwistHigh - tqTwistLow)*0.25f;
+//				   //twistPad = j.twistLimitContactDistance;
+//
+//			PxVec3 axis = jointTransforms[i].cB2w.rotate(PxVec3(1,0,0));
+//			PxReal tqPhi = Ps::tanHalf(twist.x, twist.w);
+//
+//			if(tqPhi < tqTwistLow + twistPad)
+//				constraintData.pushBack(ConstraintData(-axis, -(tqTwistLow - tqPhi)*4));
+//
+//			if(tqPhi > tqTwistHigh - twistPad)
+//				constraintData.pushBack(ConstraintData(axis, (tqTwistHigh - tqPhi)*4));
+//		}
+	}
+	puts("");
+}
+
+#endif
+
+// Installs the articulation implementation: points the three ArticulationPImpl static
+// function pointers at the matching ArticulationHelper functions, then calls the two
+// solver-core registration hooks declared at the top of this file.
+void PxvRegisterArticulations()
+{
+	ArticulationPImpl::sComputeUnconstrainedVelocities = &ArticulationHelper::computeUnconstrainedVelocities;
+	ArticulationPImpl::sUpdateBodies = &ArticulationHelper::updateBodies;
+	ArticulationPImpl::sSaveVelocity = &ArticulationHelper::saveVelocity;
+
+	SolverCoreRegisterArticulationFns();
+	SolverCoreRegisterArticulationFnsCoulomb();
+}
+
+// Computes, in bytes, the buffer sizes needed for an articulation with 'linkCount' links.
+//
+// solverDataSize [out] - FsData header, the per-link solver arrays, the maxPenBias array
+//                        (entry count rounded up to a multiple of 16) and one FsInertia
+// totalSize      [out] - solverDataSize plus per-link LtbRow, reference velocity and FsRowAux
+// scratchSize    [out] - per link: three FsInertia, one Mat33V and two
+//                        ArticulationJointTransforms (each rounded up to a multiple of 16 bytes)
+void Articulation::getDataSizes(PxU32 linkCount, PxU32 &solverDataSize, PxU32& totalSize, PxU32& scratchSize)
+{
+	// number of maxPenBias entries, padded to a multiple of 16
+	const PxU32 paddedLinkCount = (linkCount + 15) & 0xFFFFFFF0;
+
+	const size_t solverBytesPerLink = sizeof(Cm::SpatialVectorV)	// velocity
+									+ sizeof(Cm::SpatialVectorV)	// deferredVelocity
+									+ sizeof(Vec3V)					// deferredSZ
+									+ sizeof(FsJointVectors)		// joint offsets
+									+ sizeof(FsRow);				// featherstone matrix rows
+
+	solverDataSize = PxU32(sizeof(FsData)							// header
+						 + solverBytesPerLink * linkCount
+						 + sizeof(PxReal) * paddedLinkCount			// The maxPenBias values
+						 + sizeof(FsInertia));						// featherstone root inverse inertia
+
+	const size_t extraBytesPerLink = sizeof(LtbRow)					// lagrange matrix rows
+								   + sizeof(Cm::SpatialVectorV)		// ref velocity
+								   + sizeof(FsRowAux);
+
+	totalSize = PxU32(solverDataSize + extraBytesPerLink * linkCount);
+
+	const size_t alignedJointTransformsSize = (sizeof(ArticulationJointTransforms) + 15) & ~size_t(15);
+
+	const size_t scratchBytesPerLink = sizeof(FsInertia) * 3
+									 + sizeof(Mat33V)
+									 + alignedJointTransformsSize * 2;
+
+	scratchSize = PxU32(scratchBytesPerLink * linkCount);
+}
+
+
+// Thin forwarder: passes every argument unchanged to ArticulationHelper::initializeDriveCache.
+void PxvArticulationDriveCache::initialize(FsData &cache,
+										   PxU16 linkCount,
+										   const ArticulationLink* links,
+										   PxReal compliance,
+										   PxU32 iterations,
+										   char* scratchMemory,
+										   PxU32 scratchMemorySize)
+{
+	ArticulationHelper::initializeDriveCache(cache, linkCount, links, compliance, iterations, scratchMemory, scratchMemorySize);
+}
+
+// Returns the linkCount field stored in the given drive cache.
+PxU32	PxvArticulationDriveCache::getLinkCount(const FsData& cache)
+{
+	return cache.linkCount;
+}
+
+// Thin forwarder: passes cache, Z and V unchanged to ArticulationHelper::applyImpulses.
+void PxvArticulationDriveCache::applyImpulses(const FsData& cache,
+										 	  Cm::SpatialVectorV* Z,
+											  Cm::SpatialVectorV* V)
+{
+	ArticulationHelper::applyImpulses(cache, Z, V);
+}
+
+// Thin forwarder: passes cache, linkID, impulse and deltaV unchanged to
+// ArticulationHelper::getImpulseResponse, which writes its result to deltaV.
+void	PxvArticulationDriveCache::getImpulseResponse(const FsData& cache, 
+													  PxU32 linkID, 
+													  const Cm::SpatialVectorV& impulse,
+													  Cm::SpatialVectorV& deltaV)
+{
+	ArticulationHelper::getImpulseResponse(cache, linkID, impulse, deltaV);
+}
+
+}
+}
diff --git a/PhysX_3.4/Source/LowLevelDynamics/src/DyArticulationContactPrep.cpp b/PhysX_3.4/Source/LowLevelDynamics/src/DyArticulationContactPrep.cpp
new file mode 100644
index 00000000..2adc84ea
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelDynamics/src/DyArticulationContactPrep.cpp
@@ -0,0 +1,408 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+#include "foundation/PxPreprocessor.h"
+#include "PsVecMath.h"
+#include "DyArticulationContactPrep.h"
+#include "DySolverConstraintDesc.h"
+#include "DySolverConstraint1D.h"
+#include "DyArticulationHelper.h"
+#include "PxcNpWorkUnit.h"
+#include "PxsMaterialManager.h"
+#include "PxsMaterialCombiner.h"
+#include "DyCorrelationBuffer.h"
+#include "DySolverConstraintExtShared.h"
+
+using namespace physx::Gu;
+
+namespace physx
+{
+
+namespace Dy
+{
+
+// constraint-gen only, since these use getVelocity methods
+// which aren't valid during the solver phase
+
+// Produces two friction directions for a contact with unit normal 'unitNormal'.
+//
+// t0 is first set to the part of 'vrel' perpendicular to the normal. If its squared length
+// exceeds 0.1 it is normalised and t1 = unitNormal x t0. Otherwise both tangents are
+// produced by Ps::normalToTangents from the normal alone.
+PX_INLINE void computeFrictionTangents(const PxVec3& vrel,const PxVec3& unitNormal, PxVec3& t0, PxVec3& t1)
+{
+	PX_ASSERT(PxAbs(unitNormal.magnitude()-1)<1e-3f);
+
+	t0 = vrel - unitNormal * unitNormal.dot(vrel);
+	const PxReal tangentialSq = t0.magnitudeSquared();
+
+	if(!(tangentialSq > 0.1f))							//can set as low as 0.
+	{
+		Ps::normalToTangents(unitNormal, t0, t1);		//fallback
+		return;
+	}
+
+	t0 *= PxRecipSqrt(tangentialSq);
+	t1 = unitNormal.cross(t0);
+}
+
+// Projects this body's velocity onto the spatial direction (linear, angular).
+// When mLinkIndex is NO_LINK the call is delegated to mBodyData; otherwise the velocity
+// of link mLinkIndex is read from mFsData and dotted with the direction.
+PxReal SolverExtBody::projectVelocity(const PxVec3& linear, const PxVec3& angular) const
+{
+	if(mLinkIndex != PxSolverConstraintDesc::NO_LINK)
+	{
+		PxF32 projected;
+		FStore(getVelocity(*mFsData)[mLinkIndex].dot(Cm::SpatialVector(linear, angular)), &projected);
+		return projected;
+	}
+
+	return mBodyData->projectVelocity(linear, angular);
+}
+
+// Returns the linear velocity: from mBodyData when mLinkIndex is NO_LINK, otherwise the
+// linear part of link mLinkIndex's velocity stored in mFsData.
+PxVec3 SolverExtBody::getLinVel() const
+{
+	if(mLinkIndex != PxSolverConstraintDesc::NO_LINK)
+	{
+		PxVec3 linkLinear;
+		V3StoreU(getVelocity(*mFsData)[mLinkIndex].linear, linkLinear);
+		return linkLinear;
+	}
+
+	return mBodyData->linearVelocity;
+}
+
+
+// Returns the angular velocity: from mBodyData when mLinkIndex is NO_LINK, otherwise the
+// angular part of link mLinkIndex's velocity stored in mFsData.
+PxVec3 SolverExtBody::getAngVel() const
+{
+	if(mLinkIndex != PxSolverConstraintDesc::NO_LINK)
+	{
+		PxVec3 linkAngular;
+		V3StoreU(getVelocity(*mFsData)[mLinkIndex].angular, linkAngular);
+		return linkAngular;
+	}
+
+	return mBodyData->angularVelocity;
+}
+
+// Packs (linear, angular) into a spatial vector for use with getImpulseResponse.
+// For an articulation link the inputs are used as given; for a body with NO_LINK the
+// angular part is first multiplied by the body's sqrtInvInertia.
+Cm::SpatialVector createImpulseResponseVector(const PxVec3& linear, const PxVec3& angular, const SolverExtBody& body)
+{
+	const bool isArticulationLink = body.mLinkIndex != PxSolverConstraintDesc::NO_LINK;
+	if(isArticulationLink)
+		return Cm::SpatialVector(linear, angular);
+
+	return Cm::SpatialVector(linear, body.mBodyData->sqrtInvInertia * angular);
+}
+
+// Computes the velocity change each body gets from its impulse and returns
+// impulse0.dot(deltaV0) + impulse1.dot(deltaV1).
+//
+// For a body with NO_LINK: deltaV.linear = impulse.linear * invMass * dom and
+// deltaV.angular = impulse.angular * angDom. For an articulation link the impulse is
+// scaled by (dom, angDom) and handed to ArticulationHelper::getImpulseResponse.
+// The last (allowSelfCollision) parameter is ignored.
+PxReal getImpulseResponse(const SolverExtBody& b0, const Cm::SpatialVector& impulse0, Cm::SpatialVector& deltaV0, PxReal dom0, PxReal angDom0,
+								 const SolverExtBody& b1, const Cm::SpatialVector& impulse1, Cm::SpatialVector& deltaV1, PxReal dom1, PxReal angDom1,
+								 bool /*allowSelfCollision*/)
+{
+	//	allowSelfCollision = true;
+	// right now self-collision with contacts crashes the solver
+
+	//KS - knocked this out to save some space on SPU
+	//if(allowSelfCollision && b0.mLinkIndex!=PxSolverConstraintDesc::NO_LINK && b0.mFsData == b1.mFsData)
+	//{
+	//	ArticulationHelper::getImpulseSelfResponse(*b0.mFsData,b0.mLinkIndex, impulse0, deltaV0, 
+	//												  b1.mLinkIndex, impulse1, deltaV1);
+	//	//PxReal response = impulse0.dot(deltaV0*dom0) + impulse1.dot(deltaV1*dom1);
+	//	PX_ASSERT(PxAbs(impulse0.dot(deltaV0*dom0) + impulse1.dot(deltaV1*dom1))>0);
+	//}
+	//else 
+
+	// body 0
+	if(b0.mLinkIndex != PxSolverConstraintDesc::NO_LINK)
+	{
+		ArticulationHelper::getImpulseResponse(*b0.mFsData, b0.mLinkIndex, impulse0.scale(dom0, angDom0), deltaV0);
+	}
+	else
+	{
+		deltaV0.linear = impulse0.linear * b0.mBodyData->invMass * dom0;
+		deltaV0.angular = impulse0.angular * angDom0;
+	}
+	PxReal response = impulse0.dot(deltaV0);
+
+	// body 1
+	if(b1.mLinkIndex != PxSolverConstraintDesc::NO_LINK)
+	{
+		ArticulationHelper::getImpulseResponse(*b1.mFsData, b1.mLinkIndex, impulse1.scale(dom1, angDom1), deltaV1);
+	}
+	else
+	{
+		deltaV1.linear = impulse1.linear * b1.mBodyData->invMass * dom1;
+		deltaV1.angular = impulse1.angular * angDom1;
+	}
+	response += impulse1.dot(deltaV1);
+
+	return response;
+}
+
+
+	// Writes the solver contact and friction constraints for the body pair (b0, b1), either of
+	// which may be an articulation link, into 'workspace'.
+	//
+	// For every friction patch in 'c' that has at least one contact, the output receives in order:
+	//   1. a SolverContactHeader,
+	//   2. one SolverContactPointExt per contact (filled in by setupExtSolverContact),
+	//   3. a zeroed PxF32 force buffer (its length is the contact count rounded up to a multiple of 4),
+	//   4. unless the material disables friction, two SolverContactFrictionExt rows per friction
+	//      anchor: one along tangent t0 and one along tangent t1.
+	//
+	// buffer              contact points indexed by the patches in 'c'
+	// c                   correlation buffer holding the contact patches and friction patches
+	// bodyFrame0/1        body transforms used to bring the friction anchors into a common frame
+	// workspace           destination memory; it is not bounds-checked here, so the caller must
+	//                     have sized it for everything written
+	// b0, b1              the two bodies (rigid body data or articulation link)
+	// invDtF32            scales the positional anchor error into the friction bias
+	//                     (presumably 1/dt -- confirm against caller)
+	// bounceThresholdF32  forwarded to setupExtSolverContact
+	// invMassScale0/1     stored in the header as dominance values and used for the impulse response
+	// invInertiaScale0/1  stored in the header as angDom0/angDom1 and used for the impulse response
+	// restDist            forwarded to setupExtSolverContact
+	// frictionDataPtr     base address for the per-patch "friction broken" write-back byte; advanced
+	//                     by sizeof(FrictionPatch) for every patch that is processed
+	// ccdMaxContactDist   forwarded to setupExtSolverContact
+	void setupFinalizeExtSolverContacts(
+						    const ContactPoint* buffer,
+							const CorrelationBuffer& c,
+							const PxTransform& bodyFrame0,
+							const PxTransform& bodyFrame1,
+							PxU8* workspace,
+							const SolverExtBody& b0,
+							const SolverExtBody& b1,
+							const PxReal invDtF32,
+							PxReal bounceThresholdF32,
+							PxReal invMassScale0, PxReal invInertiaScale0, 
+							PxReal invMassScale1, PxReal invInertiaScale1,
+							const PxReal restDist,
+							PxU8* frictionDataPtr,
+							PxReal ccdMaxContactDist)	
+{
+	// NOTE II: the friction patches are sparse (some of them have no contact patches, and
+	// therefore did not get written back to the cache) but the patch addresses are dense,
+	// corresponding to valid patches
+
+	/*const bool haveFriction = PX_IR(n.staticFriction) > 0 || PX_IR(n.dynamicFriction) > 0;*/
+
+	const FloatV ccdMaxSeparation = FLoad(ccdMaxContactDist);
+
+	PxU8* PX_RESTRICT ptr = workspace;
+
+	const FloatV zero=FZero();
+
+	//KS - TODO - this should all be done in SIMD to avoid LHS
+	const PxF32 maxPenBias0 = b0.mLinkIndex == PxSolverConstraintDesc::NO_LINK ? b0.mBodyData->penBiasClamp : getMaxPenBias(*b0.mFsData)[b0.mLinkIndex];
+	const PxF32 maxPenBias1 = b1.mLinkIndex == PxSolverConstraintDesc::NO_LINK ? b1.mBodyData->penBiasClamp : getMaxPenBias(*b1.mFsData)[b1.mLinkIndex];
+
+	const FloatV maxPenBias = FLoad(PxMax(maxPenBias0, maxPenBias1));
+
+
+	const PxReal d0 = invMassScale0;
+	const PxReal d1 = invMassScale1;
+
+	const PxReal angD0 = invInertiaScale0;
+	const PxReal angD1 = invInertiaScale1;
+
+	// Z/W (dominance) are fixed for the whole call; X/Y (friction coefficients) are set per patch below.
+	Vec4V staticFrictionX_dynamicFrictionY_dominance0Z_dominance1W = V4Zero();
+	staticFrictionX_dynamicFrictionY_dominance0Z_dominance1W=V4SetZ(staticFrictionX_dynamicFrictionY_dominance0Z_dominance1W, FLoad(d0));
+	staticFrictionX_dynamicFrictionY_dominance0Z_dominance1W=V4SetW(staticFrictionX_dynamicFrictionY_dominance0Z_dominance1W, FLoad(d1));
+
+	const FloatV restDistance = FLoad(restDist); 
+
+	PxU32 frictionPatchWritebackAddrIndex = 0;
+	PxU32 contactWritebackCount = 0;
+
+	Ps::prefetchLine(c.contactID);
+	Ps::prefetchLine(c.contactID, 128);
+
+	const FloatV invDt = FLoad(invDtF32);
+	const FloatV p8 = FLoad(0.8f);
+	const FloatV bounceThreshold = FLoad(bounceThresholdF32);
+
+	const FloatV invDtp8 = FMul(invDt, p8);
+
+	PxU8 flags = 0;
+
+	for(PxU32 i=0;i<c.frictionPatchCount;i++)
+	{
+		PxU32 contactCount = c.frictionPatchContactCounts[i];
+		if(contactCount == 0)
+			continue;
+
+		const FrictionPatch& frictionPatch = c.frictionPatches[i];
+		PX_ASSERT(frictionPatch.anchorCount <= 2);  //0==anchorCount is allowed if all the contacts in the manifold have a large offset. 
+
+		// material properties are taken from the first contact of the patch's first contact patch
+		const Gu::ContactPoint* contactBase0 = buffer + c.contactPatches[c.correlationListHeads[i]].start;
+		const PxReal combinedRestitution = contactBase0->restitution;
+
+		const PxReal staticFriction = contactBase0->staticFriction;
+		const PxReal dynamicFriction = contactBase0->dynamicFriction;
+		const bool disableStrongFriction = !!(contactBase0->materialFlags & PxMaterialFlag::eDISABLE_FRICTION);
+		staticFrictionX_dynamicFrictionY_dominance0Z_dominance1W=V4SetX(staticFrictionX_dynamicFrictionY_dominance0Z_dominance1W, FLoad(staticFriction));
+		staticFrictionX_dynamicFrictionY_dominance0Z_dominance1W=V4SetY(staticFrictionX_dynamicFrictionY_dominance0Z_dominance1W, FLoad(dynamicFriction));
+	
+		SolverContactHeader* PX_RESTRICT header = reinterpret_cast<SolverContactHeader*>(ptr);
+		ptr += sizeof(SolverContactHeader);		
+
+
+		Ps::prefetchLine(ptr + 128);
+		Ps::prefetchLine(ptr + 256);
+		Ps::prefetchLine(ptr + 384);
+		
+		const bool haveFriction = (disableStrongFriction == 0) ;//PX_IR(n.staticFriction) > 0 || PX_IR(n.dynamicFriction) > 0;
+		header->numNormalConstr		= Ps::to8(contactCount);
+		header->numFrictionConstr	= Ps::to8(haveFriction ? frictionPatch.anchorCount*2 : 0);
+	
+		header->type				= Ps::to8(DY_SC_TYPE_EXT_CONTACT);
+
+		header->flags = flags;
+
+		const FloatV restitution = FLoad(combinedRestitution);
+	
+		header->staticFrictionX_dynamicFrictionY_dominance0Z_dominance1W = staticFrictionX_dynamicFrictionY_dominance0Z_dominance1W;
+
+		header->angDom0 = angD0;
+		header->angDom1 = angD1;
+	
+		const PxU32 pointStride = sizeof(SolverContactPointExt);
+		const PxU32 frictionStride = sizeof(SolverContactFrictionExt);
+
+		const Vec3V normal = V3LoadU(buffer[c.contactPatches[c.correlationListHeads[i]].start].normal);
+
+		header->normal = normal;
+		
+		// walk the linked list of contact patches that were correlated into this friction patch
+		for(PxU32 patch=c.correlationListHeads[i]; 
+			patch!=CorrelationBuffer::LIST_END; 
+			patch = c.contactPatches[patch].next)
+		{
+			const PxU32 count = c.contactPatches[patch].count;
+			const Gu::ContactPoint* contactBase = buffer + c.contactPatches[patch].start;
+				
+			PxU8* p = ptr;
+			for(PxU32 j=0;j<count;j++)
+			{
+				const Gu::ContactPoint& contact = contactBase[j];
+
+				SolverContactPointExt* PX_RESTRICT solverContact = reinterpret_cast<SolverContactPointExt*>(p);
+				p += pointStride;
+
+				setupExtSolverContact(b0, b1, d0, d1, angD0, angD1, bodyFrame0, bodyFrame1, normal, invDt, invDtp8, restDistance, maxPenBias, restitution,
+					bounceThreshold, contact, *solverContact, ccdMaxSeparation);
+			
+			}
+
+			ptr = p;
+		}
+		contactWritebackCount += contactCount;
+
+		// per-contact applied-force buffer: zeroed, stride padded to a multiple of four floats
+		PxF32* forceBuffer = reinterpret_cast<PxF32*>(ptr);
+		PxMemZero(forceBuffer, sizeof(PxF32) * contactCount);
+		ptr += sizeof(PxF32) * ((contactCount + 3) & (~3));
+
+		header->broken = 0;
+
+		if(haveFriction)
+		{
+			//const Vec3V normal = Vec3V_From_PxVec3(buffer.contacts[c.contactPatches[c.correlationListHeads[i]].start].normal);
+			PxVec3 normalS = buffer[c.contactPatches[c.correlationListHeads[i]].start].normal;
+
+			PxVec3 t0, t1;
+			computeFrictionTangents(b0.getLinVel() - b1.getLinVel(), normalS, t0, t1);
+
+			Vec3V vT0 = V3LoadU(t0);
+			Vec3V vT1 = V3LoadU(t1);
+			
+			//We want to set the writeBack ptr to point to the broken flag of the friction patch.
+			//On spu we have a slight problem here because the friction patch array is 
+			//in local store rather than in main memory. The good news is that the address of the friction 
+			//patch array in main memory is stored in the work unit. These two addresses will be equal 
+			//except on spu where one is local store memory and the other is the effective address in main memory.
+			//Using the value stored in the work unit guarantees that the main memory address is used on all platforms.
+			PxU8* PX_RESTRICT writeback = frictionDataPtr + frictionPatchWritebackAddrIndex*sizeof(FrictionPatch);
+
+			header->frictionBrokenWritebackByte = writeback;			
+
+			for(PxU32 j = 0; j < frictionPatch.anchorCount; j++)
+			{
+				SolverContactFrictionExt* PX_RESTRICT f0 = reinterpret_cast<SolverContactFrictionExt*>(ptr);
+				ptr += frictionStride;
+				SolverContactFrictionExt* PX_RESTRICT f1 = reinterpret_cast<SolverContactFrictionExt*>(ptr);
+				ptr += frictionStride;
+
+				PxVec3 ra = bodyFrame0.q.rotate(frictionPatch.body0Anchors[j]);
+				PxVec3 rb = bodyFrame1.q.rotate(frictionPatch.body1Anchors[j]);
+				PxVec3 error = (ra + bodyFrame0.p) - (rb + bodyFrame1.p);
+
+				// friction row along t0
+				{
+					const PxVec3 raXn = ra.cross(t0);
+					const PxVec3 rbXn = rb.cross(t0);
+
+					Cm::SpatialVector deltaV0, deltaV1;
+
+					const Cm::SpatialVector resp0 = createImpulseResponseVector(t0, raXn, b0);
+					// Body 1 receives the opposite impulse along the SAME tangent (t0), matching rbXn above.
+					// This used -t1 here, which paired a t1 linear part with a t0 angular part.
+					const Cm::SpatialVector resp1 = createImpulseResponseVector(-t0, -rbXn, b1);
+					FloatV resp = FLoad(getImpulseResponse(b0, resp0, deltaV0, d0, angD0,
+															 b1, resp1, deltaV1, d1, angD1));
+
+					const FloatV velMultiplier = FSel(FIsGrtr(resp, zero), FMul(p8, FRecip(resp)), zero);
+
+					PxU32 index = c.contactPatches[c.correlationListHeads[i]].start;
+					PxF32 targetVel = buffer[index].targetVel.dot(t0);
+
+					if(b0.mLinkIndex == PxSolverConstraintDesc::NO_LINK)
+						targetVel -= b0.projectVelocity(t0, raXn);
+					else if(b1.mLinkIndex == PxSolverConstraintDesc::NO_LINK)
+						targetVel += b1.projectVelocity(t0, rbXn);
+
+					f0->normalXYZ_appliedForceW = V4SetW(vT0, zero);
+					f0->raXnXYZ_velMultiplierW = V4SetW(V4LoadA(&resp0.angular.x), velMultiplier);
+					f0->rbXnXYZ_biasW = V4SetW(V4Neg(V4LoadA(&resp1.angular.x)), FLoad(t0.dot(error) * invDtF32));
+					f0->linDeltaVA = V3LoadA(deltaV0.linear);
+					f0->angDeltaVA = V3LoadA(deltaV0.angular);
+					f0->linDeltaVB = V3LoadA(deltaV1.linear);
+					f0->angDeltaVB = V3LoadA(deltaV1.angular);
+					f0->targetVel = targetVel;
+				}
+
+				// friction row along t1
+				{
+
+					const PxVec3 raXn = ra.cross(t1);
+					const PxVec3 rbXn = rb.cross(t1);
+
+					Cm::SpatialVector deltaV0, deltaV1;
+
+
+					const Cm::SpatialVector resp0 = createImpulseResponseVector(t1, raXn, b0);
+					const Cm::SpatialVector resp1 = createImpulseResponseVector(-t1, -rbXn, b1);
+
+					FloatV resp = FLoad(getImpulseResponse(b0, resp0, deltaV0, d0, angD0,
+														   b1, resp1, deltaV1, d1, angD1));
+
+					const FloatV velMultiplier = FSel(FIsGrtr(resp, zero), FMul(p8, FRecip(resp)), zero);
+
+					PxU32 index = c.contactPatches[c.correlationListHeads[i]].start;
+					// Project the contact's target velocity onto THIS row's tangent (t1).
+					// This used t0 here, which duplicated the first row's target velocity.
+					PxF32 targetVel = buffer[index].targetVel.dot(t1);
+
+					if(b0.mLinkIndex == PxSolverConstraintDesc::NO_LINK)
+						targetVel -= b0.projectVelocity(t1, raXn);
+					else if(b1.mLinkIndex == PxSolverConstraintDesc::NO_LINK)
+						targetVel += b1.projectVelocity(t1, rbXn);
+
+					f1->normalXYZ_appliedForceW = V4SetW(vT1, zero);
+					f1->raXnXYZ_velMultiplierW = V4SetW(V4LoadA(&resp0.angular.x), velMultiplier);
+					f1->rbXnXYZ_biasW = V4SetW(V4Neg(V4LoadA(&resp1.angular.x)), FLoad(t1.dot(error) * invDtF32));
+					f1->linDeltaVA = V3LoadA(deltaV0.linear);
+					f1->angDeltaVA = V3LoadA(deltaV0.angular);
+					f1->linDeltaVB = V3LoadA(deltaV1.linear);
+					f1->angDeltaVB = V3LoadA(deltaV1.angular);
+					f1->targetVel = targetVel;
+				}
+			}
+		}
+
+		// counts only patches that had contacts, so write-back addresses stay dense
+		frictionPatchWritebackAddrIndex++;
+	}
+}
+
+}
+
+
+}
diff --git a/PhysX_3.4/Source/LowLevelDynamics/src/DyArticulationContactPrep.h b/PhysX_3.4/Source/LowLevelDynamics/src/DyArticulationContactPrep.h
new file mode 100644
index 00000000..4e927b10
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelDynamics/src/DyArticulationContactPrep.h
@@ -0,0 +1,95 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+   
+
+#ifndef DY_SOLVERCONSTRAINTEXT_H
+#define DY_SOLVERCONSTRAINTEXT_H
+
+#include "DySolverExt.h"
+
+namespace physx
+{
+
+struct PxcNpWorkUnit;
+
+
+namespace Gu
+{
+	class ContactBuffer;
+	struct ContactPoint;
+}
+
+namespace Dy
+{
+	
+struct CorrelationBuffer;
+
+// Computes the velocity changes deltaV0/deltaV1 produced by impulse0 on b0 and impulse1 on b1
+// (scaled by the dom/angDom factors) and returns impulse0.dot(deltaV0) + impulse1.dot(deltaV1).
+// The implementation in DyArticulationContactPrep.cpp currently ignores allowSelfCollision.
+PxReal getImpulseResponse(const SolverExtBody& b0, const Cm::SpatialVector& impulse0, Cm::SpatialVector& deltaV0, PxReal dom0, PxReal angDom0,
+						  const SolverExtBody& b1, const Cm::SpatialVector& impulse1, Cm::SpatialVector& deltaV1, PxReal dom1, PxReal angDom1,
+						  bool allowSelfCollision = false);
+
+// Packs (linear, angular) into a spatial vector for getImpulseResponse. For a body that is not
+// an articulation link the angular part is multiplied by the body's sqrtInvInertia first.
+Cm::SpatialVector createImpulseResponseVector(const PxVec3& linear, const PxVec3& angular, const SolverExtBody& body);
+
+// Writes, for each friction patch in 'c' that has contacts, a SolverContactHeader followed by
+// the contact rows, a zeroed force buffer and (unless friction is disabled by the material)
+// the friction rows into 'workspace'. Either body may be an articulation link.
+// 'frictionDataPtr' is the base address used for the per-patch friction-broken write-back byte.
+void setupFinalizeExtSolverContacts(
+							const Gu::ContactPoint* buffer,
+							const CorrelationBuffer& c,
+							const PxTransform& bodyFrame0,
+							const PxTransform& bodyFrame1,
+							PxU8* workspace,
+							const SolverExtBody& b0,
+							const SolverExtBody& b1,
+							const PxReal invDtF32,
+							PxReal bounceThresholdF32,
+							PxReal invMassScale0, PxReal invInertiaScale0, 
+							PxReal invMassScale1, PxReal invInertiaScale1,
+							PxReal restDistance, PxU8* frictionDataPtr,
+							PxReal ccdMaxContactDist);
+
+
+// Coulomb-friction counterpart of setupFinalizeExtSolverContacts. Only the declaration is
+// visible here; judging by the name it is presumably defined in
+// DyArticulationContactPrepPF.cpp -- TODO confirm, along with the meaning of the bool result.
+// Unlike the function above it takes the whole Gu::ContactBuffer and a per-point friction
+// row count, and has no friction write-back pointer.
+bool setupFinalizeExtSolverContactsCoulomb(
+							const Gu::ContactBuffer& buffer,
+							const CorrelationBuffer& c,
+							const PxTransform& bodyFrame0,
+							const PxTransform& bodyFrame1,
+							PxU8* workspace,
+							PxReal invDt,
+							PxReal bounceThreshold,
+							const SolverExtBody& b0,
+							const SolverExtBody& b1,
+							PxU32 frictionCountPerPoint,
+							PxReal invMassScale0, PxReal invInertiaScale0, 
+							PxReal invMassScale1, PxReal invInertiaScale1,
+							PxReal restDist,
+							PxReal ccdMaxContactDist);
+
+}
+
+}
+
+#endif //DY_SOLVERCONSTRAINTEXT_H
diff --git a/PhysX_3.4/Source/LowLevelDynamics/src/DyArticulationContactPrepPF.cpp b/PhysX_3.4/Source/LowLevelDynamics/src/DyArticulationContactPrepPF.cpp
new file mode 100644
index 00000000..8c954b71
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelDynamics/src/DyArticulationContactPrepPF.cpp
@@ -0,0 +1,305 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+#include "foundation/PxPreprocessor.h"
+#include "PsVecMath.h"
+#include "DyArticulationContactPrep.h"
+#include "DySolverConstraintDesc.h"
+#include "DySolverConstraint1D.h"
+#include "DySolverContact.h"
+#include "DySolverContactPF.h"
+#include "DyArticulationHelper.h"
+#include "PxcNpWorkUnit.h"
+#include "PxsMaterialManager.h"
+#include "PxsMaterialCombiner.h"
+#include "DyCorrelationBuffer.h"
+#include "DySolverConstraintExtShared.h"
+
+using namespace physx;
+using namespace Gu;
+
+// constraint-gen only, since these use getVelocityFast methods
+// which aren't valid during the solver phase
+
+namespace physx
+{
+
+namespace Dy
+{
+
+/**
+ * Writes the Coulomb-friction solver constraints for one pair of "extended" solver bodies into the
+ * caller-supplied workspace. Each body is treated as a plain rigid body when its mLinkIndex equals
+ * PxSolverConstraintDesc::NO_LINK and as an articulation link otherwise.
+ *
+ * Data is emitted in two passes over the friction patches that have at least one contact:
+ *   pass 1: per patch, a SolverContactCoulombHeader followed by one SolverContactPointExt per contact;
+ *   pass 2: appended after all pass-1 data, per patch, a SolverFrictionHeader, a zeroed applied-force
+ *           buffer and, when friction is enabled for the patch, frictionCountPerPoint
+ *           SolverContactFrictionExt rows per contact.
+ *
+ * @param buffer                 source of the contact points (indexed through c)
+ * @param c                      friction patch / contact patch correlation data
+ * @param bodyFrame0             pose of body 0; only .p is read directly here (contact arm)
+ * @param bodyFrame1             pose of body 1; only .p is read directly here (contact arm)
+ * @param workspace              destination memory; nothing here checks its size, so the caller must
+ *                               have reserved enough room for everything written
+ * @param invDt                  inverse time step (as the name suggests); also used to scale the
+ *                               penetration bias clamp
+ * @param bounceThresholdF32     forwarded to setupExtSolverContact
+ * @param b0                     first body
+ * @param b1                     second body
+ * @param frictionCountPerPoint  number of friction rows generated per contact point
+ * @param invMassScale0          stored in the headers and passed on as dominance for body 0
+ * @param invInertiaScale0       stored in the headers and passed on as angular dominance for body 0
+ * @param invMassScale1          as invMassScale0, for body 1
+ * @param invInertiaScale1       as invInertiaScale0, for body 1
+ * @param restDist               forwarded to setupExtSolverContact
+ * @param ccdMaxDistance         forwarded to setupExtSolverContact
+ * @return true if at least one processed friction patch has friction enabled
+ */
+bool setupFinalizeExtSolverContactsCoulomb(
+						    const ContactBuffer& buffer,
+							const CorrelationBuffer& c,
+							const PxTransform& bodyFrame0,
+							const PxTransform& bodyFrame1,
+							PxU8* workspace,
+							PxReal invDt,
+							PxReal bounceThresholdF32,
+							const SolverExtBody& b0,
+							const SolverExtBody& b1,
+							PxU32 frictionCountPerPoint,
+							PxReal invMassScale0, PxReal invInertiaScale0, 
+							PxReal invMassScale1, PxReal invInertiaScale1,
+							PxReal restDist,
+							PxReal ccdMaxDistance)	
+{
+	// NOTE II: the friction patches are sparse (some of them have no contact patches, and
+	// therefore did not get written back to the cache) but the patch addresses are dense,
+	// corresponding to valid patches
+
+	const FloatV ccdMaxSeparation = FLoad(ccdMaxDistance);
+
+	PxU8* PX_RESTRICT ptr = workspace;	// write cursor; only ever advances through the workspace
+	// Penetration bias clamp per body: from the rigid body data when the body is not a link, otherwise from the articulation data for that link.
+	//KS - TODO - this should all be done in SIMD to avoid LHS
+	const PxF32 maxPenBias0 = b0.mLinkIndex == PxSolverConstraintDesc::NO_LINK ? b0.mBodyData->penBiasClamp : getMaxPenBias(*b0.mFsData)[b0.mLinkIndex];
+	const PxF32 maxPenBias1 = b1.mLinkIndex == PxSolverConstraintDesc::NO_LINK ? b1.mBodyData->penBiasClamp : getMaxPenBias(*b1.mFsData)[b1.mLinkIndex];
+
+	const FloatV maxPenBias = FLoad(PxMax(maxPenBias0, maxPenBias1)/invDt);	// larger of the two clamps, divided by invDt
+
+	const FloatV restDistance = FLoad(restDist); 
+	const FloatV bounceThreshold = FLoad(bounceThresholdF32);
+
+	const FloatV invDtV = FLoad(invDt);
+	const FloatV pt8 = FLoad(0.8f);
+
+	const FloatV invDtp8 = FMul(invDtV, pt8);	// 0.8 * invDt, passed to setupExtSolverContact
+
+	Ps::prefetchLine(c.contactID);
+	Ps::prefetchLine(c.contactID, 128);
+
+	const PxU32 frictionPatchCount = c.frictionPatchCount;
+
+	const PxU32 pointStride = sizeof(SolverContactPointExt);	// bytes per contact row
+	const PxU32 frictionStride = sizeof(SolverContactFrictionExt);	// bytes per friction row
+	const PxU8 pointHeaderType = DY_SC_TYPE_EXT_CONTACT;
+	const PxU8 frictionHeaderType = DY_SC_TYPE_EXT_FRICTION;
+
+	PxReal d0 = invMassScale0;
+	PxReal d1 = invMassScale1;
+	PxReal angD0 = invInertiaScale0;
+	PxReal angD1 = invInertiaScale1;
+
+	PxU8 flags = 0;	// stays 0 and is copied into every header written below
+	// Pass 1: contact headers and contact rows.
+	for(PxU32 i=0;i< frictionPatchCount;i++)
+	{
+		const PxU32 contactCount = c.frictionPatchContactCounts[i];
+		if(contactCount == 0)
+			continue;
+		// The first contact of the patch's first contact patch supplies the normal and restitution used for the whole friction patch.
+		const Gu::ContactPoint* contactBase0 = buffer.contacts + c.contactPatches[c.correlationListHeads[i]].start;
+
+		const Vec3V normalV = Ps::aos::V3LoadA(contactBase0->normal);
+		const Vec3V normal = V3LoadA(contactBase0->normal);	// loaded from the same source vector as normalV
+
+		const PxReal combinedRestitution = contactBase0->restitution;
+	
+		
+		SolverContactCoulombHeader* PX_RESTRICT header = reinterpret_cast<SolverContactCoulombHeader*>(ptr);
+		ptr += sizeof(SolverContactCoulombHeader);
+
+		Ps::prefetchLine(ptr, 128);
+		Ps::prefetchLine(ptr, 256);
+		Ps::prefetchLine(ptr, 384);
+
+		const FloatV restitution = FLoad(combinedRestitution);
+
+
+		header->numNormalConstr		= PxU8(contactCount);	// narrowed to 8 bits
+		header->type				= pointHeaderType;
+		//header->setRestitution(combinedRestitution);
+
+		header->setDominance0(d0);
+		header->setDominance1(d1);
+		header->angDom0 = angD0;
+		header->angDom1 = angD1;
+		header->flags = flags;
+		
+		header->setNormal(normalV);
+		// Walk the linked list of contact patches belonging to this friction patch; one SolverContactPointExt per contact.
+		for(PxU32 patch=c.correlationListHeads[i]; 
+			patch!=CorrelationBuffer::LIST_END; 
+			patch = c.contactPatches[patch].next)
+		{
+			const PxU32 count = c.contactPatches[patch].count;
+			const Gu::ContactPoint* contactBase = buffer.contacts + c.contactPatches[patch].start;
+				
+			PxU8* p = ptr;
+			for(PxU32 j=0;j<count;j++)
+			{
+				const Gu::ContactPoint& contact = contactBase[j];
+
+				SolverContactPointExt* PX_RESTRICT solverContact = reinterpret_cast<SolverContactPointExt*>(p);
+				p += pointStride;
+
+				setupExtSolverContact(b0, b1, d0, d1, angD0, angD1, bodyFrame0, bodyFrame1, normal, invDtV, invDtp8, restDistance, maxPenBias, restitution,
+					bounceThreshold, contact, *solverContact, ccdMaxSeparation);
+			}			
+			ptr = p;
+		}
+	}
+
+	//construct all the frictions
+	// Pass 2: ptr2 re-walks the pass-1 headers from the start while ptr keeps appending friction data after the contact data.
+	PxU8* PX_RESTRICT ptr2 = workspace;
+
+	const PxF32 orthoThreshold = 0.70710678f;	// ~1/sqrt(2); selects which fallback tangent is used
+	const PxF32 eps = 0.00001f;	// minimum squared tangential speed for using the velocity direction
+	bool hasFriction = false;
+
+	for(PxU32 i=0;i< frictionPatchCount;i++)
+	{
+		const PxU32 contactCount = c.frictionPatchContactCounts[i];
+		if(contactCount == 0)
+			continue;
+
+		SolverContactCoulombHeader* header = reinterpret_cast<SolverContactCoulombHeader*>(ptr2); 
+		header->frictionOffset = PxU16(ptr - ptr2);	// byte distance from this header to its friction block, narrowed to 16 bits
+		ptr2 += sizeof(SolverContactCoulombHeader) + header->numNormalConstr * pointStride;	// skip to the next pass-1 header
+
+		const Gu::ContactPoint* contactBase0 = buffer.contacts + c.contactPatches[c.correlationListHeads[i]].start;
+
+		PxVec3 normal = contactBase0->normal;
+
+		const PxReal staticFriction = contactBase0->staticFriction;
+		const bool disableStrongFriction = !!(contactBase0->materialFlags & PxMaterialFlag::eDISABLE_FRICTION);
+		const bool haveFriction = (disableStrongFriction == 0);	// friction rows are generated unless eDISABLE_FRICTION is set
+	
+		SolverFrictionHeader* frictionHeader = reinterpret_cast<SolverFrictionHeader*>(ptr);
+		frictionHeader->numNormalConstr = Ps::to8(c.frictionPatchContactCounts[i]);
+		frictionHeader->numFrictionConstr = Ps::to8(haveFriction ? c.frictionPatchContactCounts[i] * frictionCountPerPoint : 0);
+		frictionHeader->flags = flags;
+		ptr += sizeof(SolverFrictionHeader);
+		PxF32* forceBuffer = reinterpret_cast<PxF32*>(ptr);	// one PxF32 per contact, zeroed below
+		ptr += frictionHeader->getAppliedForcePaddingSize(c.frictionPatchContactCounts[i]);
+		PxMemZero(forceBuffer, sizeof(PxF32) * c.frictionPatchContactCounts[i]);
+		Ps::prefetchLine(ptr, 128);
+		Ps::prefetchLine(ptr, 256);
+		Ps::prefetchLine(ptr, 384);
+		// Tangent basis: tDir0 is the relative linear velocity projected into the contact plane when its squared length exceeds eps,
+		// otherwise a vector perpendicular to the normal (chosen by comparing |normal.x| with orthoThreshold); tDir1 = tDir0 x normal.
+		const PxVec3 t0Fallback1(0.f, -normal.z, normal.y);
+		const PxVec3 t0Fallback2(-normal.y, normal.x, 0.f) ;
+		const PxVec3 tFallback1 = orthoThreshold > PxAbs(normal.x) ? t0Fallback1 : t0Fallback2;
+		const PxVec3 vrel = b0.getLinVel() - b1.getLinVel();
+		const PxVec3 t0_ = vrel - normal * (normal.dot(vrel));
+		const PxReal sqDist = t0_.dot(t0_);
+		const PxVec3 tDir0 = (sqDist > eps ? t0_: tFallback1).getNormalized();
+		const PxVec3 tDir1 = tDir0.cross(normal);
+		PxVec3 tFallback[2] = {tDir0, tDir1};
+
+		PxU32 ind = 0;	// toggles 0/1 after every friction row, so successive rows alternate tDir0 / tDir1
+		// Static friction, scales and type are written to the friction header only when friction is enabled.
+		if(haveFriction)
+		{
+			hasFriction = true;
+			frictionHeader->setStaticFriction(staticFriction);
+			frictionHeader->invMass0D0 = d0;
+			frictionHeader->invMass1D1 = d1;
+			frictionHeader->angDom0 = angD0;
+			frictionHeader->angDom1 = angD1;
+			frictionHeader->type			= frictionHeaderType;
+			
+			PxU32 totalPatchContactCount = 0;	// accumulated below but never read
+		
+			for(PxU32 patch=c.correlationListHeads[i]; 
+				patch!=CorrelationBuffer::LIST_END; 
+				patch = c.contactPatches[patch].next)
+			{
+				const PxU32 count = c.contactPatches[patch].count;
+				const PxU32 start = c.contactPatches[patch].start;
+				const Gu::ContactPoint* contactBase = buffer.contacts + start;
+					
+				PxU8* p = ptr;
+
+				for(PxU32 j =0; j < count; j++)
+				{
+					const Gu::ContactPoint& contact = contactBase[j];
+					const PxVec3 ra = contact.point - bodyFrame0.p;	// arm from body 0's frame position to the contact
+					const PxVec3 rb = contact.point - bodyFrame1.p;	// arm from body 1's frame position to the contact
+						
+					const PxVec3 targetVel = contact.targetVel;
+					const PxVec3 pVRa = b0.getLinVel() + b0.getAngVel().cross(ra);	// velocity of body 0 at the contact point
+					const PxVec3 pVRb = b1.getLinVel() + b1.getAngVel().cross(rb);	// velocity of body 1 at the contact point
+					//const PxVec3 vrel = pVRa - pVRb;
+
+					for(PxU32 k = 0; k < frictionCountPerPoint; ++k)
+					{
+						SolverContactFrictionExt* PX_RESTRICT f0 = reinterpret_cast<SolverContactFrictionExt*>(p);
+						p += frictionStride;
+
+						PxVec3 t0 = tFallback[ind];
+						ind = 1 - ind;
+						PxVec3 raXn = ra.cross(t0); 
+						PxVec3 rbXn = rb.cross(t0); 
+						Cm::SpatialVector deltaV0, deltaV1;
+						// Equal and opposite unit impulses along t0 on the two bodies.
+						const Cm::SpatialVector resp0 = createImpulseResponseVector(t0, raXn, b0);
+						const Cm::SpatialVector resp1 = createImpulseResponseVector(-t0, -rbXn, b1);
+
+						PxReal unitResponse = getImpulseResponse(b0, resp0, deltaV0, d0, angD0,
+																 b1, resp1, deltaV1, d1, angD1);
+						// Target velocity along t0: the contact's own target, plus body 0's point velocity when b0 is not a link,
+						PxReal tv = targetVel.dot(t0);
+						if(b0.mLinkIndex == PxSolverConstraintDesc::NO_LINK)
+							tv += pVRa.dot(t0);
+						else if(b1.mLinkIndex == PxSolverConstraintDesc::NO_LINK)	// ...else minus body 1's point velocity when b1 is not a link
+							tv -= pVRb.dot(t0);
+
+
+						f0->setVelMultiplier(FLoad(unitResponse>0.0f ? 1.f/unitResponse : 0.0f));	// 0 when the response is not positive
+						f0->setRaXn(resp0.angular);
+						f0->setRbXn(-resp1.angular);
+						f0->targetVel = tv;
+						f0->setNormal(t0);
+						f0->setAppliedForce(0.0f);
+						f0->linDeltaVA = V3LoadA(deltaV0.linear);
+						f0->angDeltaVA = V3LoadA(deltaV0.angular);
+						f0->linDeltaVB = V3LoadA(deltaV1.linear);
+						f0->angDeltaVB = V3LoadA(deltaV1.angular);
+					}					
+				}
+
+				totalPatchContactCount += c.contactPatches[patch].count;
+				
+				ptr = p;	
+			}
+		}
+	}
+	//PX_ASSERT(ptr - workspace == n.solverConstraintSize);
+	return hasFriction;
+}
+
+
+}
+
+}
diff --git a/PhysX_3.4/Source/LowLevelDynamics/src/DyArticulationFnsDebug.h b/PhysX_3.4/Source/LowLevelDynamics/src/DyArticulationFnsDebug.h
new file mode 100644
index 00000000..901eef93
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelDynamics/src/DyArticulationFnsDebug.h
@@ -0,0 +1,262 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+
+#ifndef DY_ARTICULATION_DEBUG_FNS_H
+#define DY_ARTICULATION_DEBUG_FNS_H
+
+#include "DyArticulationFnsScalar.h"
+#include "DyArticulationFnsSimd.h"
+
+namespace physx
+{
+namespace Dy
+{
+#if 0	// disabled debug helper: excluded from compilation
+	void printMomentum(const char* id, PxTransform* pose, Cm::SpatialVector* velocity, FsInertia* inertia, PxU32 linkCount)	// prints the summed momentum of linkCount links, labelled with id
+	{
+		typedef ArticulationFnsScalar Fns;
+
+		Cm::SpatialVector m = Cm::SpatialVector::zero();	// running total
+		for(PxU32 i=0;i<linkCount;i++)	// adds inertia[i]*velocity[i], translated as a force by the link's offset from link 0
+			m += Fns::translateForce(pose[i].p - pose[0].p, Fns::multiply(inertia[i], velocity[i]));
+		printf("momentum (%20s): (%f, %f, %f), (%f, %f, %f)\n", id, m.linear.x, m.linear.y, m.linear.z, m.angular.x, m.angular.y, m.angular.z);
+	}
+#endif
+
+// Cross-checking articulation function set: each operation with a scalar counterpart is run through
+// both the scalar and the SIMD implementation, the two results are compared (assert-only), and the
+// SIMD result is returned.
+class ArticulationFnsDebug
+{
+	typedef ArticulationFnsSimdBase SimdBase;
+	typedef ArticulationFnsSimd<ArticulationFnsDebug> Simd;
+	typedef ArticulationFnsScalar Scalar;
+
+public:
+
+	// ---- SIMD -> scalar conversions -------------------------------------------------------
+
+	// Copies a SIMD 3x3 matrix into a 16-byte aligned scalar matrix and returns it by value.
+	static const PxMat33 unsimdify(const Mat33V &m)
+	{
+		PX_ALIGN(16, PxMat33) scalarMatrix;
+		PxMat33_From_Mat33V(m, scalarMatrix);
+		return scalarMatrix;
+	}
+
+	// Extracts the float held in a SIMD scalar.
+	static PxReal unsimdify(const FloatV &m)
+	{
+		PxF32 value;
+		FStore(m, &value);
+		return value;
+	}
+
+	// Converts the three blocks of a SIMD spatial inertia into a scalar SpInertia.
+	static SpInertia unsimdify(const FsInertia &I)
+	{
+		return SpInertia(unsimdify(I.ll), unsimdify(I.la), unsimdify(I.aa));
+	}
+
+	// Reinterprets SIMD spatial vectors as scalar spatial vectors; no data is copied.
+	static const Cm::SpatialVector* unsimdify(const Cm::SpatialVectorV *S)
+	{
+		return reinterpret_cast<const Cm::SpatialVector*>(S);
+	}
+
+	// ---- operations without a scalar cross-check -------------------------------------------
+
+	// Block-wise sum of two spatial inertias.
+	static PX_FORCE_INLINE	FsInertia addInertia(const FsInertia& in1, const FsInertia& in2)
+	{
+		const Mat33V ll = M33Add(in1.ll, in2.ll);
+		const Mat33V la = M33Add(in1.la, in2.la);
+		const Mat33V aa = M33Add(in1.aa, in2.aa);
+		return FsInertia(ll, la, aa);
+	}
+
+	// Block-wise difference of two spatial inertias.
+	static PX_FORCE_INLINE	FsInertia subtractInertia(const FsInertia& in1, const FsInertia& in2)
+	{
+		const Mat33V ll = M33Sub(in1.ll, in2.ll);
+		const Mat33V la = M33Sub(in1.la, in2.la);
+		const Mat33V aa = M33Sub(in1.aa, in2.aa);
+		return FsInertia(ll, la, aa);
+	}
+
+	// ---- cross-checked operations -----------------------------------------------------------
+
+	// Symmetric 3x3 inverse.
+	static Mat33V invertSym33(const Mat33V &m)
+	{
+		const PxMat33 expected = Scalar::invertSym33(unsimdify(m));
+		Mat33V actual = SimdBase::invertSym33(m);
+		compare33(expected, unsimdify(actual));
+		return actual;
+	}
+
+	// Inverse square-root factor of a 3x3 matrix.
+	static Mat33V invSqrt(const Mat33V &m)
+	{
+		const PxMat33 expected = Scalar::invSqrt(unsimdify(m));
+		Mat33V actual = SimdBase::invSqrt(m);
+		compare33(expected, unsimdify(actual));
+		return actual;
+	}
+
+	// Spatial inertia inverse.
+	static FsInertia invertInertia(const FsInertia &I)
+	{
+		const SpInertia expected = Scalar::invertInertia(unsimdify(I));
+		FsInertia actual = SimdBase::invertInertia(I);
+		compareInertias(expected, unsimdify(actual));
+		return actual;
+	}
+
+	// S^T I S; the SIMD path also writes I*S to IS. The scalar reference is evaluated first.
+	static Mat33V computeSIS(const FsInertia &I, const Cm::SpatialVectorV S[3], Cm::SpatialVectorV*PX_RESTRICT IS)
+	{
+		const Cm::SpatialVector* scalarS = unsimdify(&S[0]);
+		Cm::SpatialVector scalarIS[3];
+		Scalar::multiply(scalarIS, unsimdify(I), scalarS);
+		const PxMat33 expected = Scalar::multiplySym(scalarIS, scalarS);
+
+		Mat33V actual = SimdBase::computeSIS(I, S, IS);
+		compare33(unsimdify(actual), expected);
+		return actual;
+	}
+
+	// I minus the dyads of (IS * D) with IS; the SIMD path also writes IS * D to DSI. The scalar
+	// reference is evaluated first.
+	static FsInertia multiplySubtract(const FsInertia &I, const Mat33V &D, const Cm::SpatialVectorV IS[3], Cm::SpatialVectorV*PX_RESTRICT DSI)
+	{
+		const Cm::SpatialVector* scalarIS = unsimdify(IS);
+		Cm::SpatialVector scalarDSI[3];
+		Scalar::multiply(scalarDSI, scalarIS, unsimdify(D));
+		const SpInertia expected = Scalar::multiplySubtract(unsimdify(I), scalarDSI, scalarIS);
+
+		FsInertia actual = SimdBase::multiplySubtract(I, D, IS, DSI);
+		compareInertias(unsimdify(actual), expected);
+		return actual;
+	}
+
+	// I minus the dyads of S with itself.
+	static FsInertia multiplySubtract(const FsInertia &I, const Cm::SpatialVectorV S[3])
+	{
+		const Cm::SpatialVector* scalarS = unsimdify(S);
+		const SpInertia expected = Scalar::multiplySubtract(unsimdify(I), scalarS, scalarS);
+		FsInertia actual = SimdBase::multiplySubtract(I, S);
+		compareInertias(unsimdify(actual), expected);
+		return actual;
+	}
+
+	// Spatial inertia translated by offset.
+	static FsInertia translateInertia(Vec3V offset, const FsInertia &I)
+	{
+		PxVec3 scalarOffset;
+		V3StoreU(offset, scalarOffset);
+		const SpInertia expected = Scalar::translate(scalarOffset, unsimdify(I));
+
+		FsInertia actual = SimdBase::translateInertia(offset, I);
+		compareInertias(expected, unsimdify(actual));
+		return actual;
+	}
+
+	// Inertia propagation; the SIMD side goes through the Simd template instantiated on this class.
+	static PX_FORCE_INLINE FsInertia propagate(const FsInertia &I,
+												  const Cm::SpatialVectorV S[3],
+												  const Mat33V &load,
+												  const FloatV isf)
+	{
+		const SpInertia expected = Scalar::propagate(unsimdify(I), unsimdify(&S[0]), unsimdify(load), unsimdify(isf));
+		FsInertia actual = Simd::propagate(I, S, load, isf);
+		compareInertias(expected, unsimdify(actual));
+		return actual;
+	}
+
+	// Drive inertia; the SIMD side goes through the Simd template instantiated on this class.
+	static PX_FORCE_INLINE Mat33V computeDriveInertia(const FsInertia &I0, 
+													  const	FsInertia &I1, 
+													  const Cm::SpatialVectorV S[3])
+	{
+		const PxMat33 expected = Scalar::computeDriveInertia(unsimdify(I0), unsimdify(I1), unsimdify(&S[0]));
+		Mat33V actual = Simd::computeDriveInertia(I0, I1, S);
+		compare33(expected, unsimdify(actual));
+		return actual;
+	}
+
+private:
+
+	// Largest absolute component of a vector.
+	static PxReal absmax(const PxVec3& n)
+	{
+		const PxReal ax = PxAbs(n.x);
+		const PxReal ay = PxAbs(n.y);
+		const PxReal az = PxAbs(n.z);
+		return PxMax(ax, PxMax(ay, az));
+	}
+
+	// Largest absolute entry of a matrix.
+	static PxReal norm(const PxMat33& n)
+	{
+		const PxReal c0 = absmax(n.column0);
+		const PxReal c1 = absmax(n.column1);
+		const PxReal c2 = absmax(n.column2);
+		return PxMax(c0, PxMax(c1, c2));
+	}
+
+	// Asserts that n matches ref to within max(1e-3 * norm(ref), 1e-4), measured by largest
+	// absolute entry of the difference. Has no effect when PX_ASSERT is compiled out.
+	static void compare33(const PxMat33& ref, const PxMat33& n)
+	{
+		const PxReal difference = norm(ref-n);
+		PX_UNUSED(difference);
+		PX_ASSERT(difference <= PxMax(norm(ref)*1e-3f, 1e-4f));
+	}
+
+	// Applies compare33 to each of the three blocks of two spatial inertias.
+	static void compareInertias(const SpInertia& a, const SpInertia& b)
+	{
+		compare33(a.mLL, b.mLL);
+		compare33(a.mLA, b.mLA);
+		compare33(a.mAA, b.mAA);
+	}
+
+};
+
+#if DY_ARTICULATION_DEBUG_VERIFY
+// Verification-only helpers; compiled only when DY_ARTICULATION_DEBUG_VERIFY is non-zero.
+
+// Positive-definiteness test for a SIMD 3x3 matrix: the matrix is copied into a 16-byte aligned
+// PxMat33 and handed to the scalar test.
+static bool isPositiveDefinite(const Mat33V& m)
+{
+	PX_ALIGN_PREFIX(16) PxMat33 m1 PX_ALIGN_SUFFIX(16);
+	PxMat33_From_Mat33V(m, m1);
+	// The scalar tests are static members of ArticulationDiagnostics, so the call has to be qualified.
+	return ArticulationDiagnostics::isPositiveDefinite(m1);
+}
+
+
+// Positive-definiteness test for a SIMD spatial inertia, evaluated on its scalar SpInertia form.
+static bool isPositiveDefinite(const FsInertia& s)
+{
+	return ArticulationDiagnostics::isPositiveDefinite(ArticulationFnsDebug::unsimdify(s));
+}
+
+// Euclidean length of a spatial vector, taken over its linear and angular parts together.
+static PxReal magnitude(const Cm::SpatialVectorV &v)
+{
+	// FStore writes through a float pointer, so the dot products are converted with the existing
+	// FloatV -> PxReal helper instead of being passed to FStore directly.
+	const PxReal linearSq = ArticulationFnsDebug::unsimdify(V3Dot(v.linear, v.linear));
+	const PxReal angularSq = ArticulationFnsDebug::unsimdify(V3Dot(v.angular, v.angular));
+	return PxSqrt(linearSq + angularSq);
+}
+
+// True when |ref - test| <= tolerance * |ref|, i.e. test matches ref to a relative tolerance.
+static bool almostEqual(const Cm::SpatialVectorV &ref, const Cm::SpatialVectorV& test, PxReal tolerance)
+{
+	return magnitude(ref-test)<=tolerance*magnitude(ref);
+}
+#endif
+}
+}
+
+#endif //DY_ARTICULATION_DEBUG_FNS_H
diff --git a/PhysX_3.4/Source/LowLevelDynamics/src/DyArticulationFnsScalar.h b/PhysX_3.4/Source/LowLevelDynamics/src/DyArticulationFnsScalar.h
new file mode 100644
index 00000000..1efb2708
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelDynamics/src/DyArticulationFnsScalar.h
@@ -0,0 +1,397 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+
+#ifndef DY_ARTICULATION_SCALAR_FNS_H
+#define DY_ARTICULATION_SCALAR_FNS_H
+
+// Scalar helpers for articulations
+
+#include "DyArticulationUtils.h"
+#include "DyArticulationScalar.h"
+#include "DySpatial.h"
+
+namespace physx
+{
+
+namespace Dy
+{
+
+/*
+namespace
+{
+	static void print(const PxMat33 &m)
+	{
+		printf("(%f, %f, %f)\n(%f, %f, %f)\n(%f, %f, %f)\n\n",
+			m[0][0], m[0][1], m[0][2], m[1][0], m[1][1], m[1][2], m[2][0], m[2][1], m[2][2]);
+	}
+
+	static void print(const Cm::SpatialVector *v, PxU32 count)
+	{
+		for(PxU32 i=0;i<count;i++)
+		{
+			printf("(%f, %f, %f), (%f, %f, %f)\n", 
+				v[i].linear.x, v[i].linear.y, v[i].linear.z,
+				v[i].angular.x, v[i].angular.y, v[i].angular.z);
+		}
+	}
+}
+*/
+
+// Diagnostic predicates (symmetry, positive-definiteness) on 3x3 matrices and spatial inertias.
+// All members are static.
+class ArticulationDiagnostics
+{
+public:
+// Attempts an in-place Cholesky-style factorisation of 'in', working on a copy held in 'out'.
+// Returns false as soon as a diagonal pivot is <= 0; 'out' is then left partially processed.
+// On success the entries out[1][0], out[2][0] and out[2][1] are zeroed, leaving a triangular factor.
+static bool cholesky(const PxMat33& in, PxMat33& out)
+{
+	out = in;
+
+	if(out[0][0]<=0)
+		return false;
+
+	// normalise the first vector, then eliminate its contribution from the other two
+	out[0] /= PxSqrt(out[0][0]);
+	out[1] -= out[0][1]*out[0];
+	out[2] -= out[0][2]*out[0];
+
+	if(out[1][1]<=0)
+		return false;
+
+	out[1] /= PxSqrt(out[1][1]);
+
+	out[2] -= out[1][2]*out[1];
+	if(out[2][2]<=0)
+		return false;
+	out[2] /= PxSqrt(out[2][2]);
+
+	out[1][0] = out[2][0] = out[2][1] = 0;
+	return true;
+}
+
+// Exact (bitwise float equality) symmetry test of the three off-diagonal pairs.
+static bool isSymmetric(const PxMat33&a)
+{
+	return a[0][1] == a[1][0] && a[0][2] == a[2][0] && a[1][2] == a[2][1];
+}
+
+// Symmetry test for a SIMD matrix, performed on a scalar copy.
+static bool isSymmetric(const Mat33V&a)
+{
+	// 16-byte aligned destination, matching the storage the other PxMat33_From_Mat33V callers
+	// provide (the conversion presumably expects it -- TODO confirm against its definition).
+	PX_ALIGN(16, PxMat33) m;
+	PxMat33_From_Mat33V(a,m);
+	return isSymmetric(m);
+}
+
+// A spatial inertia counts as symmetric here when its mLL and mAA blocks are; mLA is not examined.
+static bool isSymmetric(const SpInertia&a)
+{
+	return isSymmetric(a.mLL) && isSymmetric(a.mAA);
+}
+
+
+// A 3x3 matrix is reported positive definite when cholesky() succeeds on it; the factor is discarded.
+static bool isPositiveDefinite(const PxMat33& m)
+{
+	PxMat33 factor;
+	return cholesky(m, factor);
+}
+
+
+// Block test for a spatial inertia: factor mLL, then test mAA - bt^T * bt with bt = a^-1 * mLA.
+static bool isPositiveDefinite(const SpInertia &s)
+{
+	// compute
+	// (a 0)
+	// (b c)
+
+	PxMat33 a;
+	if(!cholesky(s.mLL, a))
+		return false;
+
+	PxMat33 bt = a.getInverse() * s.mLA;
+	PxMat33 x = s.mAA - bt.getTranspose()*bt;
+	PxMat33 c;
+	return cholesky(x, c);
+}
+
+};
+
+class ArticulationFnsScalar	// scalar (non-SIMD) spatial-algebra helpers for articulations; every member is static
+{
+public:
+	// Returns (v.linear + p x v.angular, v.angular): the angular part is unchanged.
+	static PX_FORCE_INLINE Cm::SpatialVector translateMotion(const PxVec3& p, const Cm::SpatialVector& v)
+	{
+		return Cm::SpatialVector(v.linear + p.cross(v.angular), v.angular);
+	}
+
+	// translate a force resolved at position p to the origin: linear unchanged, angular gains p x linear
+
+	static PX_FORCE_INLINE Cm::SpatialVector translateForce(const PxVec3& p, const Cm::SpatialVector& v)
+	{
+		return Cm::SpatialVector(v.linear, v.angular + p.cross(v.linear));
+	}
+	/*
+	 * Inverse of a 3x3 matrix that is assumed to be symmetric. v0, v1, v2 are cross products of pairs
+	 * of the input vectors and det = v0 . in[0]. The result is assembled from v0 plus selected
+	 * components of v1 and v2 only, so it is symmetric by construction. A zero determinant is caught
+	 * only by PX_ASSERT.
+	 */
+	static PX_FORCE_INLINE PxMat33 invertSym33(const PxMat33& in)
+	{
+		PxVec3 v0 = in[1].cross(in[2]),
+			   v1 = in[2].cross(in[0]),
+			   v2 = in[0].cross(in[1]);
+
+		PxReal det = v0.dot(in[0]);
+
+
+		PX_ASSERT(det!=0);
+		PxReal recipDet = 1.0f/det;
+
+		return PxMat33(v0 * recipDet,
+					   PxVec3(v0.y, v1.y, v1.z) * recipDet,
+					   PxVec3(v0.z, v1.z, v2.z) * recipDet);
+	}
+	// Returns I minus the three dyads dyad(in0[k], in1[k]), k = 0..2.
+	static PX_FORCE_INLINE SpInertia multiplySubtract(const SpInertia& I, const Cm::SpatialVector in0[3], const Cm::SpatialVector in1[3])
+	{
+		return I - SpInertia::dyad(in0[0], in1[0])
+				 - SpInertia::dyad(in0[1], in1[1])
+				 - SpInertia::dyad(in0[2], in1[2]);
+	}
+	// 3x3 matrix of dot products IS[i] . S[j]; only the six entries with i <= j are computed and then mirrored.
+	static PX_FORCE_INLINE PxMat33 multiplySym(const Cm::SpatialVector* IS, const Cm::SpatialVector* S)
+	{
+	//		return PxMat33(axisDot(IS, S[0]), axisDot(IS, S[1]), axisDot(IS, S[2]));
+
+		PxReal a00 = IS[0].dot(S[0]), a01 = IS[0].dot(S[1]), a02 = IS[0].dot(S[2]),
+									  a11 = IS[1].dot(S[1]), a12 = IS[1].dot(S[2]),
+															 a22 = IS[2].dot(S[2]);
+
+		return PxMat33(PxVec3(a00, a01, a02),
+					   PxVec3(a01, a11, a12),
+					   PxVec3(a02, a12, a22));
+	}
+	// out[k] = I * in[k] for k = 0..2.
+	static PX_FORCE_INLINE void multiply(Cm::SpatialVector out[3], const SpInertia& I, const Cm::SpatialVector in[3])
+	{
+		out[0] = I * in[0];
+		out[1] = I * in[1];
+		out[2] = I * in[2];
+	}
+	// out[k] = in[0]*D[k][0] + in[1]*D[k][1] + in[2]*D[k][2] for k = 0..2 (see axisMultiply).
+	static PX_FORCE_INLINE void multiply(Cm::SpatialVector out[3], const Cm::SpatialVector in[3], const PxMat33& D)
+	{
+		out[0] = axisMultiply(in, D[0]);
+		out[1] = axisMultiply(in, D[1]);
+		out[2] = axisMultiply(in, D[2]);
+	}
+	/*
+	 * Builds a triangular factor of m from the entries column0.{x,y,z}, column1.{y,z} and column2.z
+	 * only, then returns the inverse of that factor (steps are described inline). The reciprocal
+	 * square roots are taken without checking that their arguments are positive.
+	 */
+	static PxMat33 invSqrt(const PxMat33 &m)
+	{
+		// cholesky factor to 
+		// (a 0 0)
+		// (b c 0)
+		// (d e f)
+		// except that a,c,f are the reciprocal sqrts rather than sqrts
+
+		PxVec3 v0 = m.column0, v1 = m.column1, v2 = m.column2;
+
+		PxReal a = PxRecipSqrt(v0.x);
+		PxReal b = v0.y*a;
+		PxReal c = PxRecipSqrt(v1.y - b*b);
+		PxReal d = v0.z*a;
+		PxReal e = (v1.z-d*b) * c;
+		PxReal f = PxRecipSqrt(v2.z - d*d - e*e);
+
+		// invert 
+		PxReal x = -b*a*c, y = (-e*x-d*a)*f, z = -e*c*f;
+
+		PxMat33 r(PxVec3(a, 0,  0 ),
+				  PxVec3(x,  c, 0 ),
+				  PxVec3(y,  z,  f));
+
+		return r;
+	}
+
+	// Returns the symmetric 3x3 matrix of dot products (I*S[i]) . S[j].
+	static PX_FORCE_INLINE PxMat33 computeSIS(const Cm::SpatialVector S[3], const SpInertia& I)
+	{
+		Cm::SpatialVector IS[3];
+		multiply(IS, I, S);
+		return multiplySym(IS, S);
+	}
+
+	// translate from COM-centered world-aligned inertia matrix to a displaced frame
+	static PX_INLINE SpInertia translate(const PxVec3& p, const SpInertia& i)
+	{
+		PxMat33 S = Ps::star(p), ST = S.getTranspose();
+		PxMat33 sla = S * i.mLA, llst = i.mLL * ST;
+//		return SpInertia(i.mLL, i.mLA + llst, i.mAA + sla + sla.getTranspose() + S * llst);
+
+		// this yields a symmetric result
+		PxMat33 t = sla+S*llst*0.5f;
+		return SpInertia(i.mLL, i.mLA + llst, i.mAA + (t+t.getTranspose()));	}
+	// Linear combination a[0]*v[0] + a[1]*v[1] + a[2]*v[2] of three spatial vectors.
+	static PX_FORCE_INLINE Cm::SpatialVector axisMultiply(const Cm::SpatialVector* a, const PxVec3& v)
+	{
+		return a[0]*v[0]+a[1]*v[1]+a[2]*v[2];
+	}
+	// Vector of the three dot products a[k] . v.
+	static PX_FORCE_INLINE PxVec3 axisDot(const Cm::SpatialVector* a, const Cm::SpatialVector& v)
+	{
+		return PxVec3(a[0].dot(v), a[1].dot(v), a[2].dot(v));
+	}
+	// Block-wise inverse of a spatial inertia: mAA and mLL are symmetrised first, mAA is inverted, and the LL block comes from inverting the Schur complement of mAA.
+	static PX_FORCE_INLINE SpInertia invertInertia(const SpInertia& I)
+	{
+		PxMat33 aa = I.mAA, ll = I.mLL, la = I.mLA;
+
+		aa = (aa + aa.getTranspose())*0.5f;
+		ll = (ll + ll.getTranspose())*0.5f;
+
+		PxMat33 AAInv = invertSym33(aa);
+
+		PxMat33 z = -la * AAInv;
+		PxMat33 S = ll + z * la.getTranspose();	// Schur complement of mAA
+		
+		PxMat33 LL = invertSym33(S);
+
+		PxMat33 LA = LL * z;
+		PxMat33 AA = AAInv + z.getTranspose() * LA;
+
+		SpInertia result(LL, LA, AA);
+
+		return result;
+	}
+	// Returns I minus the dyads of ISD with itself, where IS = I*S and ISD = IS * invSqrt(SIS + load*isf).
+	static SpInertia propagate(const SpInertia& I,
+							   const Cm::SpatialVector S[3],
+							   const PxMat33& load,
+							   PxReal isf)
+	{
+		Cm::SpatialVector IS[3], ISD[3];
+		multiply(IS, I, S);
+		
+		PxMat33 SIS = multiplySym(S, IS);
+
+		// yields a symmetric result
+		PxMat33 D = invSqrt(SIS+load*isf);
+		multiply(ISD, IS, D);
+		return multiplySubtract(I, ISD, ISD);
+	}
+	// Computes a 3x3 "load" matrix from two spatial inertias and three axes S; the result is asserted finite, symmetric and positive definite.
+	static PxMat33 computeDriveInertia(const SpInertia& I0, 
+									   const SpInertia& I1, 
+									   const Cm::SpatialVector S[3])
+	{
+		// this could be a lot more efficient, especially since it can be combined with
+		// the inertia accumulation. Also it turns out to be symmetric in I0 and I1, which
+		// isn't obvious from the formulation, so it's likely there's a more efficient formulation
+
+		PxMat33 D = invertSym33(computeSIS(S,I0));
+		Cm::SpatialVector IS[3], ISD[3];
+
+		multiply(IS,I0,S);
+		multiply(ISD, IS, D);
+
+		SpInertia tot = multiplySubtract(I0+I1,ISD,IS);
+		SpInertia invTot = invertInertia(tot);
+
+		PxMat33 E = computeSIS(ISD,invTot);
+
+		PxMat33 load = invertSym33(E+D);
+
+		PX_ASSERT(load[0].isFinite() && load[1].isFinite() && load[2].isFinite());
+		PX_ASSERT(ArticulationDiagnostics::isSymmetric(load) && ArticulationDiagnostics::isPositiveDefinite(load));
+		return load;
+	}
+	// Writes SZ = Z.angular + Z.linear x jointOffset and returns (Z - DSI*SZ) translated as a force by the parent offset; aux is used only by the debug check.
+	static PX_INLINE Cm::SpatialVector propagateImpulse(const FsRow& row, 
+														const FsJointVectors& jv,
+													    PxVec3& SZ,
+													    const Cm::SpatialVector& Z,
+													    const FsRowAux& aux)
+	{
+		PX_UNUSED(aux);
+		SZ = Z.angular + Z.linear.cross(getJointOffset(jv));
+		Cm::SpatialVector result = translateForce(getParentOffset(jv), Z - axisMultiply(getDSI(row), SZ));
+
+#if DY_ARTICULATION_DEBUG_VERIFY
+	PxVec3 SZcheck;
+	Cm::SpatialVector check = ArticulationRef::propagateImpulse(row, jv, SZcheck, Z, aux);
+	PX_ASSERT((result-check).magnitude()<1e-5*PxMax(check.magnitude(), 1.0f));
+	PX_ASSERT((SZ-SZcheck).magnitude()<1e-5*PxMax(SZcheck.magnitude(), 1.0f));
+#endif
+		return result;
+	}
+	// Translates v as a motion by the negated parent offset (w), forms n = DSI . w + row.D * SZ, and returns w - (jointOffset x n, n); aux is used only by the debug check.
+	static PX_INLINE Cm::SpatialVector propagateVelocity(const FsRow& row, 
+														 const FsJointVectors& jv,
+														 const PxVec3& SZ, 
+														 const Cm::SpatialVector& v,
+														 const FsRowAux& aux)
+	{
+		PX_UNUSED(aux);
+
+		Cm::SpatialVector w = translateMotion(-getParentOffset(jv), v);
+		PxVec3 DSZ = multiply(row.D, SZ);
+
+		PxVec3 n = axisDot(getDSI(row), w) + DSZ;
+		Cm::SpatialVector result = w - Cm::SpatialVector(getJointOffset(jv).cross(n),n);
+
+#if DY_ARTICULATION_DEBUG_VERIFY
+	Cm::SpatialVector check = ArticulationRef::propagateVelocity(row, jv, SZ, v, aux);
+	PX_ASSERT((result-check).magnitude()<1e-5*PxMax(check.magnitude(), 1.0f));
+#endif
+		return result;
+	}
+
+	// m * v, with each SIMD column of m reinterpreted in place as a PxVec3.
+	static PX_FORCE_INLINE PxVec3 multiply(const Mat33V& m, const PxVec3& v)
+	{
+		return reinterpret_cast<const PxVec3&>(m.col0) * v.x 
+			 + reinterpret_cast<const PxVec3&>(m.col1) * v.y 
+			 + reinterpret_cast<const PxVec3&>(m.col2) * v.z;
+	}
+	// transpose(m) * v: one dot product of v with each column of m.
+	static PX_FORCE_INLINE PxVec3 multiplyTranspose(const Mat33V& m, const PxVec3& v)
+	{
+		return PxVec3(v.dot(reinterpret_cast<const PxVec3&>(m.col0)), 
+					  v.dot(reinterpret_cast<const PxVec3&>(m.col1)), 
+					  v.dot(reinterpret_cast<const PxVec3&>(m.col2)));
+	}
+	// Applies a SIMD spatial inertia to a scalar spatial vector: (ll*lin + la*ang, la^T*lin + aa*ang).
+	static Cm::SpatialVector multiply(const FsInertia& m, const Cm::SpatialVector& v)
+	{
+		return Cm::SpatialVector(multiply(m.ll,v.linear) + multiply(m.la,v.angular),
+								 multiplyTranspose(m.la, v.linear) + multiply(m.aa, v.angular));
+	}
+	// Multiplies Z by the root inverse inertia stored in matrix.
+	static PX_FORCE_INLINE Cm::SpatialVector getRootDeltaV(const FsData& matrix, const Cm::SpatialVector& Z)
+	{
+		return multiply(getRootInverseInertia(matrix), Z);
+	}
+};
+
+}
+
+}
+
+#endif //DY_ARTICULATION_SCALAR_FNS_H
diff --git a/PhysX_3.4/Source/LowLevelDynamics/src/DyArticulationFnsSimd.h b/PhysX_3.4/Source/LowLevelDynamics/src/DyArticulationFnsSimd.h
new file mode 100644
index 00000000..182abc66
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelDynamics/src/DyArticulationFnsSimd.h
@@ -0,0 +1,438 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+
+#ifndef DY_ARTICULATION_SIMD_FNS_H
+#define DY_ARTICULATION_SIMD_FNS_H
+
+#include "DyArticulationUtils.h"
+
+namespace physx
+{
+namespace Dy
+{
+
+// Raw byte buffer sized for `count` objects of type T, implicitly convertible to T*.
+// The class itself never runs a T constructor or destructor.
+template <typename T, PxU32 count>
+class PodULike
+{
+public:
+	PX_FORCE_INLINE operator T*()
+	{
+		return reinterpret_cast<T*>(mBytes);
+	}
+
+private:
+	PxU8 mBytes[sizeof(T)*count];
+};
+// Expands to an aligned PodULike<_T, _count> type specifier; write the variable names after it, e.g. POD_U_LIKE(Cm::SpatialVectorV, 3, 16) IS, ISD;
+#define POD_U_LIKE(_T, _count, _alignment) PX_ALIGN_PREFIX(_alignment) PodULike<_T, _count> PX_ALIGN_SUFFIX(_alignment)
+
+class ArticulationFnsSimdBase
+{
+public:
+
+	// Sum of two spatial inertias; the ll, la and aa blocks are added independently.
+	static PX_FORCE_INLINE	FsInertia addInertia(const FsInertia& in1, const FsInertia& in2)
+	{
+		const Mat33V ll = M33Add(in1.ll, in2.ll);
+		const Mat33V la = M33Add(in1.la, in2.la);
+		const Mat33V aa = M33Add(in1.aa, in2.aa);
+
+		return FsInertia(ll, la, aa);
+	}
+
+	// Difference in1 - in2 of two spatial inertias; the ll, la and aa blocks are subtracted independently.
+	static PX_FORCE_INLINE	FsInertia subtractInertia(const FsInertia& in1, const FsInertia& in2)
+	{
+		const Mat33V ll = M33Sub(in1.ll, in2.ll);
+		const Mat33V la = M33Sub(in1.la, in2.la);
+		const Mat33V aa = M33Sub(in1.aa, in2.aa);
+
+		return FsInertia(ll, la, aa);
+	}
+
+	// Returns (S[0].v, S[1].v, S[2].v), where each spatial dot product is
+	// linear.linear + angular.angular.
+	static PX_FORCE_INLINE Vec3V axisDot(const Cm::SpatialVectorV S[3], const Cm::SpatialVectorV &v)
+	{
+		const FloatV dot0 = FAdd(V3Dot(S[0].linear,v.linear), V3Dot(S[0].angular,v.angular));
+		const FloatV dot1 = FAdd(V3Dot(S[1].linear,v.linear), V3Dot(S[1].angular,v.angular));
+		const FloatV dot2 = FAdd(V3Dot(S[2].linear,v.linear), V3Dot(S[2].angular,v.angular));
+
+		return V3Merge(dot0, dot1, dot2);
+	}
+
+	// Returns S[0]*v.x + S[1]*v.y + S[2]*v.z, evaluated separately for the linear and angular parts.
+	static PX_FORCE_INLINE Cm::SpatialVectorV axisMultiply(const Cm::SpatialVectorV S[3], Vec3V v)
+	{
+		const FloatV vx = V3GetX(v);
+		const FloatV vy = V3GetY(v);
+		const FloatV vz = V3GetZ(v);
+
+		const Vec3V linear  = V3ScaleAdd(S[0].linear,  vx, V3ScaleAdd(S[1].linear,  vy, V3Scale(S[2].linear,  vz)));
+		const Vec3V angular = V3ScaleAdd(S[0].angular, vx, V3ScaleAdd(S[1].angular, vy, V3Scale(S[2].angular, vz)));
+
+		return Cm::SpatialVectorV(linear, angular);
+	}
+
+
+	// a - b, applied to the linear and angular parts independently.
+	static PX_FORCE_INLINE Cm::SpatialVectorV subtract(const Cm::SpatialVectorV &a, const Cm::SpatialVectorV &b)
+	{
+		const Vec3V linear  = V3Sub(a.linear, b.linear);
+		const Vec3V angular = V3Sub(a.angular, b.angular);
+
+		return Cm::SpatialVectorV(linear, angular);
+	}
+
+	// a + b, applied to the linear and angular parts independently.
+	static PX_FORCE_INLINE Cm::SpatialVectorV add(const Cm::SpatialVectorV &a, const Cm::SpatialVectorV &b)
+	{
+		const Vec3V linear  = V3Add(a.linear, b.linear);
+		const Vec3V angular = V3Add(a.angular, b.angular);
+
+		return Cm::SpatialVectorV(linear, angular);
+	}
+
+
+	// Applies the block inertia I to the spatial vector S:
+	// linear = ll*S.linear + la*S.angular, angular = la'*S.linear + aa*S.angular.
+	static PX_FORCE_INLINE Cm::SpatialVectorV multiply(const FsInertia &I, const Cm::SpatialVectorV &S)
+	{
+		const Vec3V linear  = V3Add(M33MulV3(I.ll,S.linear), M33MulV3(I.la,S.angular));
+		const Vec3V angular = V3Add(M33TrnspsMulV3(I.la,S.linear), M33MulV3(I.aa,S.angular));
+
+		return Cm::SpatialVectorV(linear, angular);
+	}
+
+
+	// Shifts a motion vector by p: the linear part gains p x angular, the angular part is unchanged.
+	static PX_FORCE_INLINE Cm::SpatialVectorV translateMotion(const Vec3V& p, const Cm::SpatialVectorV& v)
+	{
+		const Vec3V linearShift = V3Cross(p, v.angular);
+		return Cm::SpatialVectorV(V3Add(v.linear, linearShift), v.angular);
+	}
+
+	// translate a force resolved at position p to the origin:
+	// the linear part is unchanged, the angular part gains p x linear
+
+	static PX_FORCE_INLINE Cm::SpatialVectorV translateForce(const Vec3V& p, const Cm::SpatialVectorV& v)
+	{
+		const Vec3V moment = V3Cross(p, v.linear);
+		return Cm::SpatialVectorV(v.linear, V3Add(v.angular, moment));
+	}
+
+	static PX_FORCE_INLINE Mat33V invertSym33(const Mat33V &m)	// inverse of a 3x3 matrix assumed symmetric, built from column cross products; there is no guard against a zero determinant
+	{
+		Vec3V a0 = V3Cross(m.col1, m.col2);	// cofactor vectors: pairwise cross products of the columns
+		Vec3V a1 = V3Cross(m.col2, m.col0);
+		Vec3V a2 = V3Cross(m.col0, m.col1);
+		FloatV det = V3Dot(a0, m.col0);		// determinant as the triple product col0 . (col1 x col2)
+		FloatV recipDet = FRecip(det);
+		// overwrite a1.x, a2.x and a2.y with a0.y, a0.z and a1.z so mirrored entries are bit-identical
+		a1 = V3SetX(a1, V3GetY(a0));
+		a2 = V3Merge(V3GetZ(a0), V3GetZ(a1), V3GetZ(a2));		// make sure it's symmetric
+		// scale every cofactor vector by 1/det to obtain the columns of the result
+		return Mat33V(V3Scale(a0, recipDet),
+					  V3Scale(a1, recipDet),
+					  V3Scale(a2, recipDet));
+	}
+
+
+	// Returns sqrt(max(0, 1/v)). A negative v therefore yields zero rather than the
+	// square root of a negative number.
+	static PX_FORCE_INLINE FloatV safeInvSqrt(FloatV v)
+	{
+		const FloatV reciprocal = FRecip(v);
+		const FloatV clamped = FMax(FZero(), reciprocal);
+		return FSqrt(clamped);
+	}
+	static PX_FORCE_INLINE Mat33V invSqrt(const Mat33V& m)	// returns the inverted Cholesky-style factor of m as a matrix with zeros in the y/z slots of col0 and the z slot of col1
+	{
+		// Step 1: cholesky factor m to 
+		// (a 0 0)
+		// (b c 0)
+		// (d e f)
+		// except that a,c,f are the reciprocal sqrts rather than sqrts (computed with safeInvSqrt)
+
+		// scalar equivalent: PxVec3 v0 = m.column0, v1 = m.column1, v2 = m.column2;
+		Vec3V v0 = m.col0, v1 = m.col1, v2 = m.col2;
+
+		const FloatV x0 = V3GetX(v0), y1 = V3GetY(v1), z2 = V3GetZ(v2);
+		
+		FloatV a	= safeInvSqrt(x0);								// PxReal a = PxRecipSqrt(v0.x);
+	
+		Vec3V abd	= V3Scale(v0, a);								// PxReal b = v0.y*a;
+		FloatV b	= V3GetY(abd);
+
+		FloatV c2	= FNegScaleSub(b, b, y1);						// PxReal c = PxRecipSqrt(v1.y - b*b);
+		FloatV c	= safeInvSqrt(c2);
+
+		FloatV d	= V3GetZ(abd);									// PxReal d = v0.z*a;
+		
+		FloatV e	= FMul(FNegScaleSub(b, d, V3GetZ(v1)), c);		// PxReal e = (v1.z-d*b) * c;
+
+		FloatV f2	= FNegScaleSub(d, d, FNegScaleSub(e, e, z2));	// PxReal f = PxRecipSqrt(v2.z - d*d - e*e);
+		FloatV f	= safeInvSqrt(f2);
+
+		// Step 2: invert the factor. x and z are stored without their minus sign; FNeg() is applied when the result is assembled
+		FloatV x = FMul(FMul(b,a),c),								// holds +b*a*c (the inverse entry is -b*a*c)
+			   y = FMul((FNegScaleSub(d,a, FMul(e,x))), f),			// (e*x - d*a)*f with the unsigned x above
+			   z = FMul(e, FMul(c,f));								// holds +e*c*f (the inverse entry is -e*c*f)
+
+		return Mat33V(V3Merge(a,		FZero(),		FZero()),
+					  V3Merge(FNeg(x),	c,				FZero()),
+					  V3Merge(y,		FNeg(z),		f));
+	}
+
+
+	static PX_FORCE_INLINE FsInertia invertInertia(const FsInertia &I)	// block-wise inverse of a spatial inertia, built from aa^-1 and the inverse of S = ll - la*aa^-1*la'
+	{
+		Mat33V aa = M33Scale(M33Add(I.aa, M33Trnsps(I.aa)), FHalf());	// symmetrised copy: (aa + aa')/2
+		Mat33V ll = M33Scale(M33Add(I.ll, M33Trnsps(I.ll)), FHalf());	// symmetrised copy: (ll + ll')/2
+
+		Mat33V AAInv = invertSym33(aa);
+		Mat33V z = M33MulM33(M33Neg(I.la), AAInv);						// z = -la * aa^-1
+		Mat33V S = M33Add(ll, M33MulM33(z, M33Trnsps(I.la)));			// S = ll + z*la'
+
+		Mat33V LL = invertSym33(S);										// LL = S^-1
+		Mat33V LA = M33MulM33(LL, z);									// LA = LL * z
+		Mat33V AA = M33Add(AAInv, M33MulM33(M33Trnsps(z), LA));			// AA = aa^-1 + z' * LA
+
+		return FsInertia(LL, LA, AA);
+	}
+
+	static PX_NOINLINE Mat33V computeSIS(const FsInertia &I, const Cm::SpatialVectorV S[3], Cm::SpatialVectorV IS[3])	// returns the symmetric 3x3 matrix of S[j] . (I*S[k]) and outputs I*S[k] in IS[k]
+	{
+		Vec3V S0l = S[0].linear, S0a = S[0].angular;	// S is copied to locals before IS is written, so IS may be the same array as S
+		Vec3V S1l = S[1].linear, S1a = S[1].angular;
+		Vec3V S2l = S[2].linear, S2a = S[2].angular;
+		// IS[k] = I * S[k]: linear = ll*Sl + la*Sa, angular = la'*Sl + aa*Sa
+		Vec3V IS0l = V3Add(M33MulV3(I.ll,S0l), M33MulV3(I.la,S0a));
+		Vec3V IS0a = V3Add(M33TrnspsMulV3(I.la,S0l), M33MulV3(I.aa,S0a));
+		Vec3V IS1l = V3Add(M33MulV3(I.ll,S1l), M33MulV3(I.la,S1a));
+		Vec3V IS1a = V3Add(M33TrnspsMulV3(I.la,S1l), M33MulV3(I.aa,S1a));
+		Vec3V IS2l = V3Add(M33MulV3(I.ll,S2l), M33MulV3(I.la,S2a));
+		Vec3V IS2a = V3Add(M33TrnspsMulV3(I.la,S2l), M33MulV3(I.aa,S2a));
+
+		// compute SIS: aJK = S[J] . IS[K]; only the six upper-triangle entries are evaluated
+		FloatV a00 = FAdd(V3Dot(S0l, IS0l), V3Dot(S0a, IS0a));
+		FloatV a01 = FAdd(V3Dot(S0l, IS1l), V3Dot(S0a, IS1a));
+		FloatV a02 = FAdd(V3Dot(S0l, IS2l), V3Dot(S0a, IS2a));
+		FloatV a11 = FAdd(V3Dot(S1l, IS1l), V3Dot(S1a, IS1a));
+		FloatV a12 = FAdd(V3Dot(S1l, IS2l), V3Dot(S1a, IS2a));
+		FloatV a22 = FAdd(V3Dot(S2l, IS2l), V3Dot(S2a, IS2a));
+
+		// write IS, a useful side-effect
+		IS[0].linear = IS0l; IS[0].angular = IS0a;
+		IS[1].linear = IS1l; IS[1].angular = IS1a;
+		IS[2].linear = IS2l; IS[2].angular = IS2a;
+		// reuse a01, a02 and a12 for the mirrored entries so the result is exactly symmetric
+		return Mat33V(V3Merge(a00, a01, a02),
+					  V3Merge(a01, a11, a12),
+					  V3Merge(a02, a12, a22));
+	}
+
+
+	static PX_FORCE_INLINE FsInertia multiplySubtract(const FsInertia &I, const Mat33V &D, const Cm::SpatialVectorV IS[3], Cm::SpatialVectorV DSI[3])
+	{
+		// Returns I minus the three dyads formed from DSI[k] and IS[k], and outputs DSI[k] = IS[0]*D.colk.x + IS[1]*D.colk.y + IS[2]*D.colk.z.
+		// IS is copied to locals before DSI is written, so DSI may be the same array as IS. The dyad macro below is duplicated in the 2-argument overload (cut'n'paste).
+		Vec3V IS0l = IS[0].linear, IS0a = IS[0].angular;
+		Vec3V IS1l = IS[1].linear, IS1a = IS[1].angular;
+		Vec3V IS2l = IS[2].linear, IS2a = IS[2].angular;
+
+		Vec3V D0 = D.col0, D1 = D.col1, D2 = D.col2;
+
+		// compute DSI: each DSI[k] is the combination of IS[0..2] weighted by column k of D
+		Vec3V DSI0l = V3ScaleAdd(IS0l, V3GetX(D0), V3ScaleAdd(IS1l, V3GetY(D0), V3Scale(IS2l, V3GetZ(D0))));
+		Vec3V DSI1l = V3ScaleAdd(IS0l, V3GetX(D1), V3ScaleAdd(IS1l, V3GetY(D1), V3Scale(IS2l, V3GetZ(D1))));
+		Vec3V DSI2l = V3ScaleAdd(IS0l, V3GetX(D2), V3ScaleAdd(IS1l, V3GetY(D2), V3Scale(IS2l, V3GetZ(D2))));
+
+		Vec3V DSI0a = V3ScaleAdd(IS0a, V3GetX(D0), V3ScaleAdd(IS1a, V3GetY(D0), V3Scale(IS2a, V3GetZ(D0))));
+		Vec3V DSI1a = V3ScaleAdd(IS0a, V3GetX(D1), V3ScaleAdd(IS1a, V3GetY(D1), V3Scale(IS2a, V3GetZ(D1))));
+		Vec3V DSI2a = V3ScaleAdd(IS0a, V3GetX(D2), V3ScaleAdd(IS1a, V3GetY(D2), V3Scale(IS2a, V3GetZ(D2))));
+
+		// compute J = I - DSI' IS. Each row of DSI' IS generates an inertia dyad
+		// (per dyad: ll -= DSI.l * IS.l[j], la -= DSI.l * IS.a[j], aa -= DSI.a * IS.a[j] for column j)
+		Vec3V ll0 = I.ll.col0, ll1 = I.ll.col1, ll2 = I.ll.col2;
+		Vec3V la0 = I.la.col0, la1 = I.la.col1, la2 = I.la.col2;
+		Vec3V aa0 = I.aa.col0, aa1 = I.aa.col1, aa2 = I.aa.col2;
+
+#define SUBTRACT_DYAD(_a, _b) \
+	ll0 = V3NegScaleSub(_b##l, V3GetX(_a##l), ll0);	la0 = V3NegScaleSub(_b##l, V3GetX(_a##a), la0);	aa0 = V3NegScaleSub(_b##a, V3GetX(_a##a), aa0); \
+	ll1 = V3NegScaleSub(_b##l, V3GetY(_a##l), ll1);	la1 = V3NegScaleSub(_b##l, V3GetY(_a##a), la1);	aa1 = V3NegScaleSub(_b##a, V3GetY(_a##a), aa1); \
+	ll2 = V3NegScaleSub(_b##l, V3GetZ(_a##l), ll2);	la2 = V3NegScaleSub(_b##l, V3GetZ(_a##a), la2);	aa2 = V3NegScaleSub(_b##a, V3GetZ(_a##a), aa2); 
+
+		SUBTRACT_DYAD(IS0, DSI0);
+		SUBTRACT_DYAD(IS1, DSI1);
+		SUBTRACT_DYAD(IS2, DSI2);
+#undef SUBTRACT_DYAD
+		// publish DSI only after all reads of IS are done
+		DSI[0].linear = DSI0l;	DSI[0].angular = DSI0a;
+		DSI[1].linear = DSI1l;	DSI[1].angular = DSI1a;
+		DSI[2].linear = DSI2l;	DSI[2].angular = DSI2a;
+
+		return FsInertia(Mat33V(ll0, ll1, ll2), 
+							Mat33V(la0, la1, la2),
+							Mat33V(aa0, aa1, aa2));
+	} 
+
+
+	static PX_FORCE_INLINE FsInertia multiplySubtract(const FsInertia &I, const Cm::SpatialVectorV S[3])
+	{
+		// Returns I minus the three dyads formed from each S[k] with itself. The dyad macro is duplicated from the 4-argument overload (cut'n'paste).
+
+		const Vec3V S0l = S[0].linear, S0a = S[0].angular;
+		const Vec3V S1l = S[1].linear, S1a = S[1].angular;
+		const Vec3V S2l = S[2].linear, S2a = S[2].angular;
+
+		// compute J = I - S' S. Each S[k] generates one inertia dyad with itself
+		// (per dyad: ll -= S.l * S.l[j], la -= S.l * S.a[j], aa -= S.a * S.a[j] for column j)
+		Vec3V ll0 = I.ll.col0, ll1 = I.ll.col1, ll2 = I.ll.col2;
+		Vec3V la0 = I.la.col0, la1 = I.la.col1, la2 = I.la.col2;
+		Vec3V aa0 = I.aa.col0, aa1 = I.aa.col1, aa2 = I.aa.col2;
+
+#define SUBTRACT_DYAD(_a, _b) \
+	ll0 = V3NegScaleSub(_b##l, V3GetX(_a##l), ll0);	la0 = V3NegScaleSub(_b##l, V3GetX(_a##a), la0);	aa0 = V3NegScaleSub(_b##a, V3GetX(_a##a), aa0); \
+	ll1 = V3NegScaleSub(_b##l, V3GetY(_a##l), ll1);	la1 = V3NegScaleSub(_b##l, V3GetY(_a##a), la1);	aa1 = V3NegScaleSub(_b##a, V3GetY(_a##a), aa1); \
+	ll2 = V3NegScaleSub(_b##l, V3GetZ(_a##l), ll2);	la2 = V3NegScaleSub(_b##l, V3GetZ(_a##a), la2);	aa2 = V3NegScaleSub(_b##a, V3GetZ(_a##a), aa2); 
+
+	SUBTRACT_DYAD(S0, S0);
+	SUBTRACT_DYAD(S1, S1);
+	SUBTRACT_DYAD(S2, S2);
+#undef SUBTRACT_DYAD
+
+		return FsInertia(Mat33V(ll0, ll1, ll2), 
+							Mat33V(la0, la1, la2),
+							Mat33V(aa0, aa1, aa2));
+	} 
+
+
+	static PX_FORCE_INLINE FsInertia translateInertia(Vec3V a, const FsInertia &input)	// returns the inertia with ll unchanged, la + ll*s' and aa + (t + t'), where s is the star matrix of a and t = s*la + 0.5*s*ll*s'
+	{
+		Vec3V b = V3Neg(a);
+		// b = -a supplies the negated entries of the star matrix
+		Vec3V la0 = input.la.col0, la1 = input.la.col1, la2 = input.la.col2;
+		Vec3V ll0 = input.ll.col0, ll1 = input.ll.col1, ll2 = input.ll.col2;
+		Vec3V aa0 = input.aa.col0, aa1 = input.aa.col1, aa2 = input.aa.col2;
+
+		FloatV aX = V3GetX(a), aY = V3GetY(a), aZ = V3GetZ(a);
+		FloatV bX = V3GetX(b), bY = V3GetY(b), bZ = V3GetZ(b);
+		FloatV Z = FZero();
+
+		// s - star matrix of a; s0, s1, s2 are its columns
+		Vec3V s0 = V3Merge(Z, aZ, bY),
+			  s1 = V3Merge(bZ, Z, aX),
+			  s2 = V3Merge(aY, bX, Z);
+
+		// s * la, one column at a time
+		Vec3V sla0 = V3ScaleAdd(s0, V3GetX(la0), V3ScaleAdd(s1, V3GetY(la0), V3Scale(s2, V3GetZ(la0))));
+		Vec3V sla1 = V3ScaleAdd(s0, V3GetX(la1), V3ScaleAdd(s1, V3GetY(la1), V3Scale(s2, V3GetZ(la1))));
+		Vec3V sla2 = V3ScaleAdd(s0, V3GetX(la2), V3ScaleAdd(s1, V3GetY(la2), V3Scale(s2, V3GetZ(la2))));
+
+		// ll * s.transpose() (ll is symmetric); the zero entries of s are skipped
+		Vec3V llst0 = V3ScaleAdd(ll2, aY, V3Scale(ll1, bZ)),
+			  llst1 = V3ScaleAdd(ll0, aZ, V3Scale(ll2, bX)),
+			  llst2 = V3ScaleAdd(ll1, aX, V3Scale(ll0, bY));
+
+		// t = sla+S*llst*0.5f;
+
+		Vec3V sllst0 = V3ScaleAdd(s2, V3GetZ(llst0), V3ScaleAdd(s1, V3GetY(llst0), V3Scale(s0, V3GetX(llst0))));
+		Vec3V sllst1 = V3ScaleAdd(s2, V3GetZ(llst1), V3ScaleAdd(s1, V3GetY(llst1), V3Scale(s0, V3GetX(llst1))));
+		Vec3V sllst2 = V3ScaleAdd(s2, V3GetZ(llst2), V3ScaleAdd(s1, V3GetY(llst2), V3Scale(s0, V3GetX(llst2))));
+
+		Vec3V t0 = V3ScaleAdd(sllst0, FHalf(), sla0);
+		Vec3V t1 = V3ScaleAdd(sllst1, FHalf(), sla1);
+		Vec3V t2 = V3ScaleAdd(sllst2, FHalf(), sla2);
+		
+		// t+t.transpose(): each column of t plus the matching row of t
+		Vec3V r0 = V3Add(t0, V3Merge(V3GetX(t0), V3GetX(t1), V3GetX(t2))),
+			  r1 = V3Add(t1, V3Merge(V3GetY(t0), V3GetY(t1), V3GetY(t2))),
+			  r2 = V3Add(t2, V3Merge(V3GetZ(t0), V3GetZ(t1), V3GetZ(t2)));
+
+		return FsInertia(Mat33V(ll0, ll1, ll2),
+
+							Mat33V(V3Add(la0, llst0),
+								   V3Add(la1, llst1),
+								   V3Add(la2, llst2)),
+
+							Mat33V(V3Add(aa0, r0),
+								   V3Add(aa1, r1),
+								   V3Add(aa2, r2)));
+	}
+
+};
+
+template<class Base>
+class ArticulationFnsSimd : public Base
+{
+	// Writes ES[k] as the combination of S[0..k] weighted by the first k+1 entries of
+	// column k of E. The remaining entries of E are never read.
+	static PX_FORCE_INLINE void axisMultiplyLowerTriangular(Cm::SpatialVectorV ES[3], const Mat33V&E, const Cm::SpatialVectorV S[3])
+	{
+		// snapshot every input before any output is written
+		const Vec3V lin0 = S[0].linear,  lin1 = S[1].linear,  lin2 = S[2].linear;
+		const Vec3V ang0 = S[0].angular, ang1 = S[1].angular, ang2 = S[2].angular;
+
+		const FloatV e0x = V3GetX(E.col0);
+		const FloatV e1x = V3GetX(E.col1), e1y = V3GetY(E.col1);
+		const FloatV e2x = V3GetX(E.col2), e2y = V3GetY(E.col2), e2z = V3GetZ(E.col2);
+
+		ES[0] = Cm::SpatialVectorV(V3Scale(lin0, e0x),
+								   V3Scale(ang0, e0x));
+		ES[1] = Cm::SpatialVectorV(V3ScaleAdd(lin0, e1x, V3Scale(lin1, e1y)),
+								   V3ScaleAdd(ang0, e1x, V3Scale(ang1, e1y)));
+		ES[2] = Cm::SpatialVectorV(V3ScaleAdd(lin0, e2x, V3ScaleAdd(lin1, e2y, V3Scale(lin2, e2z))),
+								   V3ScaleAdd(ang0, e2x, V3ScaleAdd(ang1, e2y, V3Scale(ang2, e2z))));
+	}
+
+public:
+	// Returns I with the joint contribution removed:
+	//  1. D = S'*I*S (computeSIS), with load*isf added column by column,
+	//  2. the I*S axes are combined with invSqrt(D),
+	//  3. the dyads of the combined axes with themselves are subtracted from I.
+	static PX_FORCE_INLINE FsInertia propagate(const FsInertia &I,
+												  const Cm::SpatialVectorV S[3],
+												  const Mat33V &load,
+												  const FloatV isf)
+	{
+		Cm::SpatialVectorV inertiaAxes[3];
+		const Mat33V sis = Base::computeSIS(I, S, inertiaAxes);
+
+		const Mat33V loaded(V3ScaleAdd(load.col0, isf, sis.col0),
+							V3ScaleAdd(load.col1, isf, sis.col1),
+							V3ScaleAdd(load.col2, isf, sis.col2));
+
+		Cm::SpatialVectorV scaledAxes[3];
+		axisMultiplyLowerTriangular(scaledAxes, Base::invSqrt(loaded), inertiaAxes);
+
+		return Base::multiplySubtract(I, scaledAxes);
+	}
+
+
+
+	// Propagates the impulse Z across one joint.
+	// SZ (output) receives Z.angular + Z.linear x jointOffset. The returned impulse is
+	// Z minus DSI weighted by SZ, moved by parentOffset with translateForce.
+	// aux is unused.
+	static PX_INLINE Cm::SpatialVectorV propagateImpulse(const FsRow& row, 
+													 const FsJointVectors& jv,
+												     Vec3V& SZ,
+												     const Cm::SpatialVectorV& Z,
+												     const FsRowAux& aux)
+	{
+		PX_UNUSED(aux);
+
+		const Vec3V jointImpulse = V3Add(Z.angular, V3Cross(Z.linear, jv.jointOffset));
+		SZ = jointImpulse;
+
+		const Cm::SpatialVectorV remainder = Z - Base::axisMultiply(row.DSI, jointImpulse);
+		return Base::translateForce(jv.parentOffset, remainder);
+	}
+
+	// Propagates the velocity change v across one joint.
+	// v is translated by the negated parent offset (w). The joint-space term
+	// n = DSI.w + D*SZ is then removed from it: n from the angular part and
+	// jointOffset x n from the linear part. aux is unused.
+	static PX_INLINE Cm::SpatialVectorV propagateVelocity(const FsRow& row, 
+													  const FsJointVectors& jv,
+													  const Vec3V& SZ, 
+													  const Cm::SpatialVectorV& v,
+													  const FsRowAux& aux)
+	{
+		PX_UNUSED(aux);
+
+		const Cm::SpatialVectorV shifted = Base::translateMotion(V3Neg(jv.parentOffset), v);
+
+		const Vec3V scaledSZ = M33MulV3(row.D, SZ);
+		const Vec3V jointTerm = V3Add(Base::axisDot(row.DSI, shifted), scaledSZ);
+
+		return shifted - Cm::SpatialVectorV(V3Cross(jv.jointOffset, jointTerm), jointTerm);
+	}
+
+
+
+
+
+	static PX_FORCE_INLINE Mat33V computeDriveInertia(const FsInertia &I0, 	// returns a 3x3 matrix computed from the two inertias I0, I1 and the joint axes S; see the per-line notes
+													  const FsInertia &I1, 
+													  const Cm::SpatialVectorV S[3])
+	{
+		POD_U_LIKE(Cm::SpatialVectorV, 3, 16) IS, ISD, dummy;		// 16-byte aligned raw storage for three spatial vectors each; dummy only receives an unused output
+		Mat33V D = Base::computeSIS(I0, S, IS);						// D = S'*I0*S, IS = I0*S
+		Mat33V DInv = Base::invertSym33(D);
+
+		FsInertia tmp = Base::addInertia(I0, I1);
+		tmp = Base::multiplySubtract(tmp, DInv, IS, ISD);			// (I0 + I1) minus the dyads of ISD with IS, where ISD = IS weighted by DInv
+		FsInertia J = Base::invertInertia(tmp);
+
+		Mat33V E = Base::computeSIS(J, ISD, dummy);					// E = ISD'*J*ISD
+		return Base::invertSym33(M33Add(DInv,E));					// result = (DInv + E)^-1
+
+	}
+};
+
+}
+}
+
+#endif //DY_ARTICULATION_SIMD_FNS_H
diff --git a/PhysX_3.4/Source/LowLevelDynamics/src/DyArticulationHelper.cpp b/PhysX_3.4/Source/LowLevelDynamics/src/DyArticulationHelper.cpp
new file mode 100644
index 00000000..ea9ccb8d
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelDynamics/src/DyArticulationHelper.cpp
@@ -0,0 +1,1344 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+#include "foundation/PxVec3.h"
+#include "foundation/PxMath.h"
+#include "foundation/PxMemory.h"
+#include "foundation/PxProfiler.h"
+
+#include "PsUtilities.h"
+#include "CmSpatialVector.h"
+#include "DyArticulationHelper.h"
+#include "DyArticulationReference.h"
+#include "DyArticulationFnsSimd.h"
+#include "DyArticulationFnsScalar.h"
+#include "DyArticulationFnsDebug.h"
+#include "DySolverConstraintDesc.h"
+#include "PxvDynamics.h"
+#include "DyArticulation.h"
+#include "PxcRigidBody.h"
+#include "CmConeLimitHelper.h"
+#include "DySolverConstraint1D.h"
+#include "PxcConstraintBlockStream.h"
+#include "DySolverConstraint1D.h"
+#include "DyArticulationPImpl.h"
+#include "PsFoundation.h"
+
+namespace physx
+{
+
+namespace Dy
+{
+
+void PxcFsFlushVelocity(FsData& matrix);
+
+// Bump allocator over a caller-supplied scratch buffer. We pass this around by value so that when we return from a function
+// the size is unaltered. That means we don't preserve state across functions - even though it could be handy to preserve
+// baseInertia and jointTransforms across the solver, so that they don't get recomputed if we need to run position projection.
+
+struct PxcFsScratchAllocator
+{
+	char*   base;	// start of the scratch buffer (not owned)
+	size_t	size;	// capacity of the buffer in bytes
+	size_t	taken;	// bytes handed out so far
+	PxcFsScratchAllocator(char* p, size_t s): base(p), size(s), taken(0) {}
+
+	template<typename T>
+	static size_t sizeof16()	// sizeof(T) rounded up to the next multiple of 16
+	{
+		return (sizeof(T)+15)&~15;
+	}
+
+	template<class T> T* alloc(PxU32 count)	// reserves count slots of sizeof16<T>() bytes each and returns the first; no constructors are run
+	{
+		size_t s = sizeof16<T>();
+		PX_ASSERT(taken+s*count <= size);	// capacity is only checked by this assert
+		T* result = reinterpret_cast<T*>(base+taken);
+		taken+=s*count;
+		return result;
+	}
+};
+
+void PxcLtbFactor(FsData& m)	// factors the Ltb rows of m in place, from the last link down to link 1, then inverts the root inertia
+{
+	typedef ArticulationFnsSimd<ArticulationFnsSimdBase> Fns;
+	LtbRow* rows = getLtbRows(m);
+	// NOTE(review): --i>0 wraps around if linkCount is 0; this loop presumably relies on linkCount >= 1 - confirm
+	for(PxU32 i=m.linkCount; --i>0;)
+	{
+		LtbRow& b = rows[i];
+		PxU32 p = m.parent[i];
+		const FsInertia inertia = Fns::invertInertia(b.inertia);
+		const Mat33V jResponse = Fns::invertSym33(M33Neg(Fns::computeSIS(inertia, b.j1, b.j1)));	// b.j1 is both input and output: it is overwritten with inertia*j1
+		b.inertia = inertia;		// the row now stores the inverted inertia
+		rows[p].inertia = Fns::multiplySubtract(rows[p].inertia, jResponse, b.j0, b.j0);			// b.j0 is both input and output: it is overwritten with j0 weighted by jResponse
+		b.jResponse = jResponse;
+
+	}
+	rows[0].inertia = Fns::invertInertia(rows[0].inertia);
+}
+
+void PxcLtbSolve(const FsData& m, 
+				 Vec3V* b,					// rhs error to solve for; entries 1..linkCount-1 are overwritten during the first pass
+				 Cm::SpatialVectorV* y)		// velocity delta output, one entry per link (zeroed here first)
+{
+	typedef ArticulationFnsSimd<ArticulationFnsSimdBase> Fns;
+
+	const LtbRow* rows = getLtbRows(m);
+	PxMemZero(y, m.linkCount*sizeof(Cm::SpatialVectorV));
+	// pass 1, last link down to link 1: update b[i] and accumulate into the parent's y
+	for(PxU32 i=m.linkCount;i-->1;)
+	{
+		const LtbRow& r = rows[i];
+		const PxU32 p = m.parent[i];
+
+		const Vec3V t = V3Sub(b[i], Fns::axisDot(r.j1, y[i]));
+		b[i] = t;
+		y[p] = Fns::subtract(y[p], Fns::axisMultiply(r.j0, t));
+	}
+	// root: apply the inertia stored in row 0
+	y[0] = Fns::multiply(rows[0].inertia, y[0]);
+	// pass 2, link 1 up to the last link: each link reads its parent's finished y[p]
+	for(PxU32 i=1; i<m.linkCount; i++)
+	{
+		const LtbRow& r = rows[i];
+		const PxU32 p = m.parent[i];
+
+		const Vec3V t = V3Sub(M33MulV3(r.jResponse, b[i]), Fns::axisDot(r.j0, y[p]));
+		y[i] = Fns::subtract(Fns::multiply(r.inertia, y[i]), Fns::axisMultiply(r.j1, t));
+	}
+}
+
+// Solves the Ltb system for b with PxcLtbSolve (which also modifies b) and subtracts
+// the resulting per-link velocity delta from velocity.
+void PxcLtbProject(const FsData& m,
+				   Cm::SpatialVectorV* velocity,
+				   Vec3V* b)
+{
+	PX_ASSERT(m.linkCount<=DY_ARTICULATION_MAX_SIZE);
+
+	Cm::SpatialVectorV correction[DY_ARTICULATION_MAX_SIZE];
+	PxcLtbSolve(m, b, correction);
+
+	const PxU32 linkCount = m.linkCount;
+	for(PxU32 link=0; link<linkCount; ++link)
+	{
+		velocity[link] -= correction[link];
+	}
+}
+
+void PxcFsPropagateDrivenInertiaSimd(FsData& matrix,	// fills D and DSI of every row i >= 1 and the root inverse inertia of matrix
+									 const FsInertia* baseInertia,	// per-link input inertias; copied, never modified
+									 const PxReal* isf,				// per-link scale applied to load[i]
+									 const Mat33V* load,			// per-link 3x3 term added (scaled by isf[i]) to S'*I*S before inversion
+									 PxcFsScratchAllocator allocator)	// by value: scratch taken here is not visible to the caller's allocator
+{
+	typedef ArticulationFnsSimd<ArticulationFnsSimdBase> Fns;
+
+	Cm::SpatialVectorV IS[3];
+
+	FsRow* rows = getFsRows(matrix);
+	const FsRowAux* aux = getAux(matrix);
+	const FsJointVectors* jointVectors = getJointVectors(matrix);
+	// working copy of the inertias; the loop below accumulates into the parents' entries
+	FsInertia* inertia = allocator.alloc<FsInertia>(matrix.linkCount);
+	PxMemCopy(inertia, baseInertia, matrix.linkCount*sizeof(FsInertia));
+	// last link down to link 1. NOTE(review): --i>0 wraps around if linkCount is 0 - presumably linkCount >= 1, confirm
+	for(PxU32 i=matrix.linkCount; --i>0;)
+	{
+		FsRow& r = rows[i];
+		const FsRowAux& a = aux[i];
+		const FsJointVectors& jv = jointVectors[i];
+
+		const Mat33V m = Fns::computeSIS(inertia[i], a.S, IS);	// m = S'*I*S, IS = I*S
+		const FloatV f = FLoad(isf[i]);
+		// D = (m + load[i]*isf[i])^-1
+		const Mat33V D = Fns::invertSym33(Mat33V(V3ScaleAdd(load[i].col0, f, m.col0),
+										   V3ScaleAdd(load[i].col1, f, m.col1),
+										   V3ScaleAdd(load[i].col2, f, m.col2)));
+		r.D = D;
+		// multiplySubtract also writes r.DSI; the reduced inertia is translated by parentOffset and added to the parent
+		inertia[matrix.parent[i]] = Fns::addInertia(inertia[matrix.parent[i]], 
+													Fns::translateInertia(jv.parentOffset, Fns::multiplySubtract(inertia[i], D,  IS,  r.DSI)));
+	}
+
+	getRootInverseInertia(matrix) = Fns::invertInertia(inertia[0]);
+}
+
+// Joint-drive variant of Fns::propagateImpulse: Q is subtracted from the joint-space
+// impulse before it is used.
+// SZMinusQ (output) receives Z.angular + Z.linear x jointOffset - Q. The returned
+// impulse is Z minus DSI weighted by SZMinusQ, moved by parentOffset with translateForce.
+PX_FORCE_INLINE Cm::SpatialVectorV propagateDrivenImpulse(const FsRow& row, 
+														  const FsJointVectors& jv,
+														  Vec3V& SZMinusQ, 
+														  const Cm::SpatialVectorV& Z,
+														  const Vec3V& Q)
+{
+	typedef ArticulationFnsSimd<ArticulationFnsSimdBase> Fns;
+
+	const Vec3V jointImpulse = V3Add(Z.angular, V3Cross(Z.linear,jv.jointOffset));
+	SZMinusQ = V3Sub(jointImpulse, Q);
+
+	const Cm::SpatialVectorV remainder = Z - Fns::axisMultiply(row.DSI, SZMinusQ);
+	return Fns::translateForce(jv.parentOffset, remainder);
+}
+
+// Applies the per-link joint drive terms Q to the velocities stored in matrix.
+//  1. Starting from zero impulses, driven impulses are propagated from the last link
+//     down to link 1 and accumulated on each parent.
+//  2. The root velocity change is the root inverse inertia times the negated root impulse.
+//  3. Velocity changes are propagated from link 1 up to the last link.
+//  4. Every link's velocity change is added to getVelocity(matrix).
+void PxcFsApplyJointDrives(FsData& matrix,
+						   const Vec3V* Q)
+{				
+	typedef ArticulationFnsSimd<ArticulationFnsSimdBase> Fns;
+
+	PX_ASSERT(matrix.linkCount<=DY_ARTICULATION_MAX_SIZE);
+
+	const FsRow* rows = getFsRows(matrix);
+	const FsRowAux* aux = getAux(matrix);
+	const FsJointVectors* jointVectors = getJointVectors(matrix);
+	const PxU32 linkCount = matrix.linkCount;
+
+	Cm::SpatialVectorV impulse[DY_ARTICULATION_MAX_SIZE];
+	Cm::SpatialVectorV deltaV[DY_ARTICULATION_MAX_SIZE];
+	Vec3V jointTerm[DY_ARTICULATION_MAX_SIZE];
+
+	PxMemZero(impulse, linkCount*sizeof(Cm::SpatialVectorV));
+
+	// visits links linkCount-1 .. 1
+	for(PxU32 next=linkCount; next>1; --next)
+	{
+		const PxU32 link = next-1;
+		impulse[matrix.parent[link]] += propagateDrivenImpulse(rows[link], jointVectors[link], jointTerm[link], impulse[link], Q[link]);
+	}
+
+	deltaV[0] = Fns::multiply(getRootInverseInertia(matrix), -impulse[0]);
+
+	for(PxU32 link=1; link<linkCount; ++link)
+	{
+		deltaV[link] = Fns::propagateVelocity(rows[link], jointVectors[link], jointTerm[link], deltaV[matrix.parent[link]], aux[link]);
+	}
+
+	Cm::SpatialVectorV* velocity = getVelocity(matrix);
+	for(PxU32 link=0; link<linkCount; ++link)
+	{
+		velocity[link] += deltaV[link];
+	}
+}
+
+// Applies the per-link impulses in Z to the per-link velocities in V.
+// Z is modified: each link's propagated impulse is accumulated onto its parent's entry,
+// from the last link down to link 1. The root response is then computed and velocity
+// changes are propagated from link 1 up to the last link and added to V.
+void ArticulationHelper::applyImpulses(	const FsData& matrix,
+										Cm::SpatialVectorV* Z,
+										Cm::SpatialVectorV* V)
+{	
+	// note: Z is the negated impulse
+
+	typedef ArticulationFnsSimd<ArticulationFnsSimdBase> Fns;
+
+	PX_ASSERT(matrix.linkCount<=DY_ARTICULATION_MAX_SIZE);
+	const FsRow* rows = getFsRows(matrix);
+	const FsRowAux* aux = getAux(matrix);
+	const FsJointVectors* jointVectors = getJointVectors(matrix);
+	const PxU32 linkCount = matrix.linkCount;
+
+	Cm::SpatialVectorV deltaV[DY_ARTICULATION_MAX_SIZE];
+	Vec3V jointTerm[DY_ARTICULATION_MAX_SIZE];
+
+	// visits links linkCount-1 .. 1
+	for(PxU32 next=linkCount; next>1; --next)
+	{
+		const PxU32 link = next-1;
+		Z[matrix.parent[link]] += Fns::propagateImpulse(rows[link], jointVectors[link], jointTerm[link], Z[link], aux[link]);
+	}
+
+	deltaV[0] = Fns::multiply(getRootInverseInertia(matrix), -Z[0]);
+
+	for(PxU32 link=1; link<linkCount; ++link)
+	{
+		deltaV[link] = Fns::propagateVelocity(rows[link], jointVectors[link], jointTerm[link], deltaV[matrix.parent[link]], aux[link]);
+	}
+
+	for(PxU32 link=0; link<linkCount; ++link)
+	{
+		V[link] += deltaV[link];
+	}
+}
+
+void getImpulseResponseSlow(const FsData& matrix, 	// general two-link impulse response: walks both links up to a common link, then to the root, and back down
+							PxU32 linkID0, 			// first link; the by-value copy is reused as a loop cursor below
+							const Cm::SpatialVectorV& impulse0,
+							Cm::SpatialVectorV& deltaV0,	// output: velocity change at the original linkID0
+							PxU32 linkID1,			// second link; also reused as a loop cursor
+							const Cm::SpatialVectorV& impulse1,
+							Cm::SpatialVectorV& deltaV1)	// output: velocity change at the original linkID1
+{
+	typedef ArticulationFnsSimd<ArticulationFnsSimdBase> Fns;
+
+	const FsRow* rows = getFsRows(matrix);
+	const FsRowAux* aux = getAux(matrix);
+	const FsJointVectors* jointVectors = getJointVectors(matrix);
+
+	PX_ASSERT(matrix.linkCount<=DY_ARTICULATION_MAX_SIZE);
+	PxU32 stack[DY_ARTICULATION_MAX_SIZE];	// visited links in upward order: [0,i0) from link 0, [i0,i1) from link 1, [i1,ic) above the common link
+	Vec3V SZ[DY_ARTICULATION_MAX_SIZE];		// joint-space impulse terms, indexed by link id
+
+	PxU32 i0, i1, ic;
+	// always step the larger index to its parent; presumably parent[i] < i so the two walks meet - TODO confirm
+	for(i0 = linkID0, i1 = linkID1; i0!=i1;)	// find common path
+	{
+		if(i0<i1)
+			i1 = matrix.parent[i1];
+		else
+			i0 = matrix.parent[i0];
+	}
+
+	PxU32 common = i0;
+	// propagate each negated impulse from its own link up to (not including) the common link
+	Cm::SpatialVectorV Z0 = -impulse0, Z1 = -impulse1;
+	for(i0 = 0; linkID0!=common; linkID0 = matrix.parent[linkID0])
+	{
+		Z0 = Fns::propagateImpulse(rows[linkID0], jointVectors[linkID0], SZ[linkID0], Z0, aux[linkID0]);
+		stack[i0++] = linkID0;
+	}
+
+	for(i1 = i0; linkID1!=common; linkID1 = matrix.parent[linkID1])
+	{
+		Z1 = Fns::propagateImpulse(rows[linkID1], jointVectors[linkID1], SZ[linkID1], Z1, aux[linkID1]);
+		stack[i1++] = linkID1;
+	}
+	// combined impulse from the common link up to link 0 (link 0 itself is not pushed)
+	Cm::SpatialVectorV Z = Z0 + Z1;
+	for(ic = i1; common; common = matrix.parent[common])
+	{
+		Z = Fns::propagateImpulse(rows[common], jointVectors[common], SZ[common], Z, aux[common]);
+		stack[ic++] = common;
+	}
+	// root response, then unwind the stack in reverse to carry the velocity back down
+	Cm::SpatialVectorV v = Fns::multiply(getRootInverseInertia(matrix), -Z);
+
+	for(PxU32 index = ic; index-->i1 ;)
+		v = Fns::propagateVelocity(rows[stack[index]], jointVectors[stack[index]], SZ[stack[index]], v, aux[stack[index]]);
+	// v is now the velocity change at the common link; branch down to each original link
+	deltaV1 = v;
+	for(PxU32 index = i1; index-->i0 ;)
+		deltaV1 = Fns::propagateVelocity(rows[stack[index]], jointVectors[stack[index]], SZ[stack[index]], deltaV1, aux[stack[index]]);
+
+	deltaV0 = v;
+	for(PxU32 index = i0; index-->0;)
+		deltaV0 = Fns::propagateVelocity(rows[stack[index]], jointVectors[stack[index]], SZ[stack[index]], deltaV0, aux[stack[index]]);
+}
+
+void PxcFsGetImpulseResponse(const FsData& matrix,	// velocity change at linkID caused by an impulse applied at linkID
+							 PxU32 linkID,
+							 const Cm::SpatialVectorV& impulse,
+							 Cm::SpatialVectorV& deltaV)	// output
+{
+	typedef ArticulationFnsSimd<ArticulationFnsSimdBase> Fns;
+
+	PX_ASSERT(matrix.linkCount<=DY_ARTICULATION_MAX_SIZE);
+	Vec3V SZ[DY_ARTICULATION_MAX_SIZE];	// joint-space impulse terms, indexed by link id
+
+	const FsRow* rows = getFsRows(matrix);
+	const FsRowAux* aux = getAux(matrix);
+	const FsJointVectors* jointVectors = getJointVectors(matrix);
+
+	Cm::SpatialVectorV Z = -impulse;
+	// propagate the negated impulse from linkID up through its ancestors, stopping at link 0
+	for(PxU32 i = linkID; i; i = matrix.parent[i])
+		Z = Fns::propagateImpulse(rows[i], jointVectors[i], SZ[i], Z, aux[i]);
+
+	deltaV = Fns::multiply(getRootInverseInertia(matrix), -Z);
+
+	PX_ASSERT(rows[linkID].pathToRoot&1);	// bit 0 must be set
+	// pathToRoot-1 clears bit 0; set bits are then visited lowest first (presumably bit k marks link k on the path - confirm)
+	for(ArticulationBitField i=rows[linkID].pathToRoot-1; i; i &= (i-1))
+	{
+		const PxU32 index = ArticulationLowestSetBit(i);
+		deltaV = Fns::propagateVelocity(rows[index], jointVectors[index], SZ[index], deltaV, aux[index]);
+	}
+}
+
+void PxcFsGetImpulseSelfResponse(const FsData& matrix, 	// velocity changes at two distinct links caused by impulses applied at both
+								 PxU32 linkID0, 
+								 const Cm::SpatialVectorV& impulse0,
+								 Cm::SpatialVectorV& deltaV0,	// output for linkID0
+								 PxU32 linkID1,
+								 const Cm::SpatialVectorV& impulse1,
+								 Cm::SpatialVectorV& deltaV1)	// output for linkID1
+{
+	typedef ArticulationFnsSimd<ArticulationFnsSimdBase> Fns;
+
+	PX_ASSERT(linkID0 != linkID1);
+
+	const FsRow* rows = getFsRows(matrix);
+	const FsRowAux* aux = getAux(matrix);
+	const FsJointVectors* jointVectors = getJointVectors(matrix);
+
+	// standard case: parent-child limit (linkID0 is the parent of linkID1)
+	if(matrix.parent[linkID1] == linkID0)
+	{
+		Vec3V SZ;
+		const Cm::SpatialVectorV Z = impulse0 - Fns::propagateImpulse(rows[linkID1], jointVectors[linkID1], SZ, -impulse1, aux[linkID1]);	// fold the child's impulse into the parent's
+		PxcFsGetImpulseResponse(matrix, linkID0, Z, deltaV0);
+		deltaV1 = Fns::propagateVelocity(rows[linkID1], jointVectors[linkID1], SZ, deltaV0, aux[linkID1]);	// one step back down to the child
+	}
+	else
+		getImpulseResponseSlow(matrix, linkID0, impulse0, deltaV0, linkID1, impulse1, deltaV1);
+
+#if DY_ARTICULATION_DEBUG_VERIFY
+	Cm::SpatialVector V[DY_ARTICULATION_MAX_SIZE];
+	for(PxU32 i=0;i<matrix.linkCount;i++) V[i] = Cm::SpatialVector::zero();
+	ArticulationRef::applyImpulse(matrix,V,linkID0, reinterpret_cast<const Cm::SpatialVector&>(impulse0));
+	ArticulationRef::applyImpulse(matrix,V,linkID1, reinterpret_cast<const Cm::SpatialVector&>(impulse1));
+	// NOTE(review): the reference results below are computed but never compared with deltaV0/deltaV1
+	Cm::SpatialVector refV0 = V[linkID0];
+	Cm::SpatialVector refV1 = V[linkID1];
+#endif
+}
+
+namespace
+{
+	// Returns the velocity change at linkID caused by the impulse (lZ, aZ) applied at linkID; the propagation steps are written out inline.
+	PX_FORCE_INLINE Cm::SpatialVectorV getImpulseResponseSimd(const FsData& matrix, PxU32 linkID, Vec3V lZ, Vec3V aZ)
+	{
+		PX_ASSERT(matrix.linkCount<=DY_ARTICULATION_MAX_SIZE);
+		Vec3V SZ[DY_ARTICULATION_MAX_SIZE];							// joint-space impulse terms, indexed by visit order (not by link id)
+		PxU32 indices[DY_ARTICULATION_MAX_SIZE], iCount = 0;		// visited link ids, in upward order
+
+		const FsRow*PX_RESTRICT rows = getFsRows(matrix);
+		const FsRowAux*PX_RESTRICT aux = getAux(matrix);
+		const FsJointVectors* jointVectors = getJointVectors(matrix);
+
+		PX_UNUSED(aux);
+		PX_ASSERT(rows[linkID].pathToRoot&1);
+		// work with the negated impulse
+		lZ = V3Neg(lZ);
+		aZ = V3Neg(aZ);
+		// upward pass: from linkID through its ancestors, stopping at link 0
+		for(PxU32 i = linkID; i; i = matrix.parent[i])
+		{
+			const FsRow& r = rows[i];
+			const FsJointVectors& j = jointVectors[i];
+
+			Vec3V sz = V3Add(aZ, V3Cross(lZ, j.jointOffset));
+			SZ[iCount] = sz;
+			// subtract DSI weighted by sz from the linear and angular impulse
+			lZ = V3NegScaleSub(r.DSI[0].linear, V3GetX(sz), V3NegScaleSub(r.DSI[1].linear, V3GetY(sz), V3NegScaleSub(r.DSI[2].linear, V3GetZ(sz), lZ)));
+			aZ = V3NegScaleSub(r.DSI[0].angular, V3GetX(sz), V3NegScaleSub(r.DSI[1].angular, V3GetY(sz), V3NegScaleSub(r.DSI[2].angular, V3GetZ(sz), aZ)));
+
+			aZ = V3Add(aZ, V3Cross(j.parentOffset, lZ));	// move the impulse by parentOffset (angular gains parentOffset x linear)
+			indices[iCount++] = i;
+		}
+
+		const FsInertia& I = getRootInverseInertia(matrix);
+		// root response: the negated product of the root inverse inertia with (lZ, aZ)
+		Vec3V lV = V3Neg(V3Add(M33MulV3(I.ll, lZ), M33MulV3(I.la, aZ)));
+		Vec3V aV = V3Neg(V3Add(M33TrnspsMulV3(I.la, lZ), M33MulV3(I.aa, aZ)));
+		// downward pass: revisit the links in reverse order
+		while(iCount)
+		{
+			PxU32 i = indices[--iCount];
+			const FsRow& r = rows[i];
+			const FsJointVectors& j = jointVectors[i];
+
+			lV = V3Sub(lV, V3Cross(j.parentOffset, aV));	// move the velocity by -parentOffset
+			// n = DSI . (lV, aV) + D * SZ
+			Vec3V n = V3Add(V3Merge(V3Dot(r.DSI[0].linear, lV),  V3Dot(r.DSI[1].linear, lV),  V3Dot(r.DSI[2].linear, lV)),
+							V3Merge(V3Dot(r.DSI[0].angular, aV), V3Dot(r.DSI[1].angular, aV), V3Dot(r.DSI[2].angular, aV)));
+
+			n = V3Add(n, M33MulV3(r.D, SZ[iCount]));
+			lV = V3Sub(lV, V3Cross(j.jointOffset, n));
+			aV = V3Sub(aV, n);
+		}
+
+		return Cm::SpatialVectorV(lV, aV);
+	}
+}
+					
+// Single-link impulse response: hands the linear and angular parts of `impulse`
+// to getImpulseResponseSimd() and stores the returned spatial vector in deltaV.
+void ArticulationHelper::getImpulseResponse(const FsData& matrix,
+											PxU32 linkID,
+											const Cm::SpatialVectorV& impulse,
+											Cm::SpatialVectorV& deltaV)
+{
+	PX_ASSERT(matrix.linkCount<=DY_ARTICULATION_MAX_SIZE);
+
+	const Vec3V linearImpulse = impulse.linear;
+	const Vec3V angularImpulse = impulse.angular;
+	deltaV = getImpulseResponseSimd(matrix, linkID, linearImpulse, angularImpulse);
+
+#if DY_ARTICULATION_DEBUG_VERIFY
+	// cross-check the result against PxcFsGetImpulseResponse
+	Cm::SpatialVectorV checkDeltaV;
+	PxcFsGetImpulseResponse(matrix, linkID, impulse, checkDeltaV);
+	PX_ASSERT(almostEqual(checkDeltaV, deltaV,1e-3f));
+#endif
+}
+
+// Computes the responses of two different links of the same articulation to a
+// pair of impulses: impulse0 on linkID0 and impulse1 on linkID1. The results
+// are written to deltaV0 and deltaV1.
+//
+// When linkID0 is the parent of linkID1 a specialised path is used that needs
+// only one call to getImpulseResponseSimd(); every other combination is
+// forwarded to getImpulseResponseSlow().
+void ArticulationHelper::getImpulseSelfResponse(const FsData& matrix,
+												PxU32 linkID0,
+												const Cm::SpatialVectorV& impulse0,
+												Cm::SpatialVectorV& deltaV0,
+												PxU32 linkID1,
+												const Cm::SpatialVectorV& impulse1,
+												Cm::SpatialVectorV& deltaV1)
+{
+	PX_ASSERT(linkID0 != linkID1);
+
+	const FsRow* rows = getFsRows(matrix);
+	const FsRowAux* aux = getAux(matrix);
+	const FsJointVectors* jointVectors = getJointVectors(matrix);
+
+	PX_UNUSED(aux);
+
+	// short aliases for the outputs, also used by the debug verification below
+	Cm::SpatialVectorV& dV0 = deltaV0, 
+				  & dV1 = deltaV1;
+
+	// standard case: parent-child limit
+	if(matrix.parent[linkID1] == linkID0)
+	{
+		const FsRow& r = rows[linkID1];
+		const FsJointVectors& j = jointVectors[linkID1];
+
+		// carry the (negated) child impulse across the child's inbound joint;
+		// this is the same per-link step as the root-ward loop of
+		// getImpulseResponseSimd()
+		Vec3V lZ = V3Neg(impulse1.linear),
+			  aZ = V3Neg(impulse1.angular);
+
+		Vec3V sz = V3Add(aZ, V3Cross(lZ, j.jointOffset));
+		
+		lZ = V3Sub(lZ, V3ScaleAdd(r.DSI[0].linear, V3GetX(sz), V3ScaleAdd(r.DSI[1].linear, V3GetY(sz), V3Scale(r.DSI[2].linear, V3GetZ(sz)))));
+		aZ = V3Sub(aZ, V3ScaleAdd(r.DSI[0].angular, V3GetX(sz), V3ScaleAdd(r.DSI[1].angular, V3GetY(sz), V3Scale(r.DSI[2].angular, V3GetZ(sz)))));
+
+		aZ = V3Add(aZ, V3Cross(j.parentOffset, lZ));
+
+		// (lZ, aZ) hold the negated child contribution, so subtracting them from
+		// impulse0 gives the combined impulse acting on the parent
+		lZ = V3Sub(impulse0.linear, lZ);
+		aZ = V3Sub(impulse0.angular, aZ);
+
+		dV0 = getImpulseResponseSimd(matrix, linkID0, lZ, aZ);
+
+		// derive the child response from the parent response with one step
+		// across the joint (same step as the leaf-ward loop of
+		// getImpulseResponseSimd(), using the sz computed above)
+		Vec3V aV = dV0.angular;
+		Vec3V lV = V3Sub(dV0.linear, V3Cross(j.parentOffset, aV));
+
+		Vec3V n = V3Add(V3Merge(V3Dot(r.DSI[0].linear, lV),  V3Dot(r.DSI[1].linear, lV),  V3Dot(r.DSI[2].linear, lV)),
+						V3Merge(V3Dot(r.DSI[0].angular, aV), V3Dot(r.DSI[1].angular, aV), V3Dot(r.DSI[2].angular, aV)));
+
+		n = V3Add(n, M33MulV3(r.D, sz));
+		lV = V3Sub(lV, V3Cross(j.jointOffset, n));
+		aV = V3Sub(aV, n);
+
+		dV1 = Cm::SpatialVectorV(lV, aV);
+	}
+	else
+		getImpulseResponseSlow(matrix, linkID0, impulse0, deltaV0, linkID1, impulse1, deltaV1);
+
+#if DY_ARTICULATION_DEBUG_VERIFY
+	// cross-check both outputs against PxcFsGetImpulseSelfResponse
+	Cm::SpatialVectorV dV0_, dV1_;
+	PxcFsGetImpulseSelfResponse(matrix, linkID0, impulse0, dV0_, linkID1, impulse1, dV1_);
+
+	PX_ASSERT(almostEqual(dV0_, dV0, 1e-3f));
+	PX_ASSERT(almostEqual(dV1_, dV1, 1e-3f));
+#endif
+}
+
+// For every non-root link i, evaluates the parent's velocity and the link's own
+// velocity at the link's joint and stores their difference (parent minus child)
+// in jv[i].
+//
+// jv       - output array indexed by link; entry 0 is not written.
+// m        - articulation data block supplying linkCount, parent[] and the
+//            joint vectors.
+// velocity - per-link spatial velocities, indexed by link.
+void PxcLtbComputeJv(Vec3V* jv, const FsData& m, const Cm::SpatialVectorV* velocity)
+{
+	const FsJointVectors* jointVectors = getJointVectors(m);
+
+	for(PxU32 i=1;i<m.linkCount;i++)
+	{
+		const FsJointVectors& jointVec = jointVectors[i];
+		const Cm::SpatialVectorV pv = velocity[m.parent[i]], v = velocity[i];
+
+		// offset of the joint from the parent: (joint - child) + (child - parent)
+		const Vec3V parentOffset = V3Add(jointVec.jointOffset, jointVec.parentOffset);
+
+		const Vec3V k0v = V3Add(pv.linear, V3Cross(pv.angular, parentOffset)),
+					k1v = V3Add(v.linear,  V3Cross(v.angular, jointVec.jointOffset));
+		jv[i] = V3Sub(k0v, k1v);
+	}
+}
+
+// Flushes the articulation's velocity, copies the per-link velocities into
+// d.motionVelocity and then runs the LTB projection on the velocities.
+void ArticulationHelper::saveVelocity(const ArticulationSolverDesc& d)
+{
+	FsData& fsData = *d.fsData;
+	Cm::SpatialVectorV* linkVelocity = getVelocity(fsData);
+
+	PxcFsFlushVelocity(fsData);
+
+	// save off the motion velocity
+	const PxU32 linkCount = fsData.linkCount;
+	for(PxU32 link=0; link<linkCount; ++link)
+	{
+		const Cm::SpatialVectorV& v = linkVelocity[link];
+		d.motionVelocity[link] = v;
+		PX_ASSERT(isFiniteVec3V(v.linear));
+		PX_ASSERT(isFiniteVec3V(v.angular));
+	}
+
+	// and now re-solve to use the unbiased velocities
+	Vec3V jointVelocity[DY_ARTICULATION_MAX_SIZE];
+	PxcLtbComputeJv(jointVelocity, fsData, linkVelocity);
+	PxcLtbProject(fsData, linkVelocity, jointVelocity);
+
+#if DY_ARTICULATION_DEBUG_VERIFY
+	// keep the reference velocities in step with the projected ones
+	for(PxU32 link=0; link<fsData.linkCount; ++link)
+		getRefVelocity(fsData)[link] = linkVelocity[link];
+#endif
+}
+
+// Iteratively refines the per-joint load matrices `load[1..linkCount-1]`.
+//
+// Each iteration restarts from baseInertia and performs two sweeps:
+//   - leaf -> root: propagates each link's inertia across its joint (using the
+//     current load and spring factor) and accumulates it into the parent;
+//   - root -> leaf: forms the inertia seen looking root-ward from each joint,
+//     recomputes load[i] from it, and adds the propagated root-ward inertia to
+//     the link.
+//
+// matrix        - articulation data block (parent indices, aux rows, joint vectors).
+// baseInertia   - per-link inertia, linkCount entries.
+// load          - in/out per-link load matrices.
+// isf_          - per-link spring factors; entry 0 is not read.
+// linkCount     - number of links; must not exceed DY_ARTICULATION_MAX_SIZE.
+// maxIterations - number of refinement iterations; 0 leaves `load` untouched.
+// allocator     - scratch allocator, taken by value, from which two temporary
+//                 FsInertia arrays are allocated.
+void PxcFsComputeJointLoadsSimd(const FsData& matrix,
+								const FsInertia*PX_RESTRICT baseInertia,
+								Mat33V*PX_RESTRICT load,
+								const PxReal*PX_RESTRICT isf_,
+								PxU32 linkCount,
+								PxU32 maxIterations,
+								PxcFsScratchAllocator allocator)
+{
+	// dsequeira: this is really difficult to optimize on XBox: not inlining generates lots of LHSs, 
+	// inlining generates lots of cache misses because the fn is so huge (almost 2000 instrs.) 
+	// Timing says even for 1 iteration the cache misses are slightly preferable for a
+	// 20-bone articulation, for more iters it's *much* better to take the cache misses.
+	//
+	// about 400 instructions come from unnecessary and inexplicable branch checks
+
+	if(!maxIterations)
+		return;
+
+	// the stack arrays below are sized for DY_ARTICULATION_MAX_SIZE links
+	PX_ASSERT(linkCount<=DY_ARTICULATION_MAX_SIZE);
+
+	typedef ArticulationFnsSimd<ArticulationFnsSimdBase> Fns;
+
+	// spring factors widened to SIMD scalars; index 0 stays unset and is never read
+	FloatV isf[DY_ARTICULATION_MAX_SIZE];
+
+	for(PxU32 i=1;i<linkCount;i++)
+		isf[i] = FLoad(isf_[i]);
+
+	FsInertia*PX_RESTRICT inertia = allocator.alloc<FsInertia>(linkCount);
+	FsInertia*PX_RESTRICT contribToParent = allocator.alloc<FsInertia>(linkCount);
+
+	const FsRowAux*PX_RESTRICT aux = getAux(matrix);
+	const FsJointVectors* jointVectors = getJointVectors(matrix);
+
+	// gets rid of about 200 LHSs, need to change the matrix format to make this part of it
+	PxU64 parent[DY_ARTICULATION_MAX_SIZE];
+	for(PxU32 i=0;i<linkCount;i++)
+		parent[i] = matrix.parent[i];
+
+	while(maxIterations--)
+	{
+		PxMemCopy(inertia, baseInertia, sizeof(FsInertia)*linkCount);
+
+		// leaf -> root sweep (i runs from linkCount-1 down to 1)
+		for(PxU32 i=linkCount;i-->1;)
+		{
+			const Cm::SpatialVectorV*PX_RESTRICT S = aux[i].S;
+
+			// warm the cache for the next iteration of this sweep
+			Ps::prefetch(&load[i-1]);
+			Ps::prefetch(&jointVectors[i-1]);
+			const FsInertia tmp = Fns::propagate(inertia[i], S, load[i], isf[i]);
+			inertia[parent[i]] = Fns::addInertia(inertia[parent[i]], Fns::translateInertia(jointVectors[i].parentOffset, tmp));
+			// remembered so the root -> leaf sweep can take this link's share out again
+			contribToParent[i] = tmp;
+		}
+
+		// root -> leaf sweep
+		for(PxU32 i=1;i<linkCount;i++)
+		{
+			const Cm::SpatialVectorV*PX_RESTRICT S = aux[i].S;
+
+			// parent inertia moved to this link, minus what this link contributed to it
+			const FsInertia rootwardInertia = Fns::subtractInertia(Fns::translateInertia(V3Neg(jointVectors[i].parentOffset), inertia[parent[i]]), contribToParent[i]);
+			// propagate with the load from the previous iteration before overwriting it
+			const FsInertia tmp = Fns::propagate(rootwardInertia, S, load[i], isf[i]);
+			load[i] = Fns::computeDriveInertia(inertia[i], rootwardInertia, S);
+			inertia[i] = Fns::addInertia(inertia[i], tmp);
+		}
+	}
+}
+
+// Number of bytes needed for one FsInertia plus one FsRow per link.
+PxU32 ArticulationHelper::getFsDataSize(PxU32 linkCount)
+{
+	const size_t rowBytes = sizeof(FsRow) * linkCount;
+	return sizeof(FsInertia) + rowBytes;
+}
+
+// Number of bytes needed for one LtbRow per link.
+PxU32 ArticulationHelper::getLtbDataSize(PxU32 linkCount)
+{
+	const size_t rowBytes = sizeof(LtbRow) * linkCount;
+	return rowBytes;
+}
+
+// Initialises the header and state part of an FsData block from the links'
+// current body state, and fills the per-link work arrays used by the caller.
+//
+// fsData          - block to initialise; its state section is zeroed first.
+// links           - the articulation's links, linkCount entries.
+// linkCount       - number of links.
+// poses           - out: body2World of every link.
+// baseInertia     - out: per-link inertia (zeroed, then set via setInertia).
+// jointTransforms - out: joint frames for links 1..linkCount-1.
+// expectedSize    - if non-zero, asserted to equal the total size computed here.
+void ArticulationHelper::prepareDataBlock(	FsData& fsData,
+										    const ArticulationLink* links, 
+											PxU16 linkCount,
+											PxTransform* poses,
+										 	FsInertia* baseInertia,
+											ArticulationJointTransforms* jointTransforms,
+											PxU32 expectedSize)
+{
+	// state section: the FsData header, two SpatialVectorV arrays, one Vec3V
+	// array and a PxReal array whose length is rounded up to a multiple of 16
+	PxU32 stateSize = sizeof(FsData)
+					+ sizeof(Cm::SpatialVectorV) * linkCount
+					+ sizeof(Cm::SpatialVectorV) * linkCount
+					+ sizeof(Vec3V)			 * linkCount
+					+ sizeof(PxReal)		 * ((linkCount + 15) & 0xfffffff0);
+
+	PxU32 jointVectorSize = sizeof(FsJointVectors) * linkCount;
+
+	PxU32 fsDataSize  = getFsDataSize(linkCount);
+	PxU32 ltbDataSize = getLtbDataSize(linkCount);
+
+	// totalSize is only used to validate expectedSize in checked builds
+	PxU32 totalSize	= stateSize 
+					+ jointVectorSize
+					+ fsDataSize 
+					+ ltbDataSize
+					+ sizeof(Cm::SpatialVectorV) * linkCount
+					+ sizeof(FsRowAux)    * linkCount;
+
+	PX_UNUSED(totalSize);
+	PX_UNUSED(expectedSize);
+	PX_ASSERT(expectedSize == 0 || totalSize == expectedSize);
+
+	// zeroing the state section also leaves fsData.parent[0] at 0
+	PxMemZero(&fsData, stateSize);
+	// NOTE(review): the section offsets are narrowed to PxU16, so they must stay
+	// below 64K for the largest supported link count - confirm
+	fsData.jointVectorOffset	= PxU16(stateSize);
+	fsData.fsDataOffset			= PxU16(stateSize+jointVectorSize);
+	fsData.ltbDataOffset		= PxU16(stateSize+jointVectorSize+fsDataSize);
+	fsData.linkCount			= linkCount;
+
+	for(PxU32 i=1;i<linkCount;i++)
+		fsData.parent[i] = PxU8(links[i].parent);
+	fsData.deferredZ = Cm::SpatialVectorV(PxZero);
+
+	// the velocity array is written through the scalar spatial-vector type
+	// (assumes Cm::SpatialVector and Cm::SpatialVectorV share a layout - TODO confirm)
+	Cm::SpatialVector* velocity = reinterpret_cast<Cm::SpatialVector*>(getVelocity(fsData));
+
+	PxMemZero(baseInertia, sizeof(FsInertia)*linkCount);
+
+	PxReal* maxPenBias = getMaxPenBias(fsData);
+
+	for(PxU32 i=0;i<linkCount;i++)
+	{
+		// prefetch the data of the link two iterations ahead
+		if((i+2)<linkCount)
+		{
+			Ps::prefetch(links[i+2].bodyCore);
+			Ps::prefetch(links[i+2].inboundJoint);
+		}
+		PxsBodyCore& core = *links[i].bodyCore;
+		poses[i] = core.body2World;
+		velocity[i] = Cm::SpatialVector(core.linearVelocity, core.angularVelocity);
+		setInertia(baseInertia[i], core, core.body2World);
+		maxPenBias[i] = core.maxPenBias;
+
+		// link 0 gets no joint transforms. poses[links[i].parent] is read
+		// here, so the parent's pose must already have been written
+		// (assumes parents precede their children in `links` - TODO confirm)
+		if(i)
+			setJointTransforms(jointTransforms[i], poses[links[i].parent], core.body2World, *links[i].inboundJoint);
+	}
+
+	// per-link offsets: parent body origin -> link body origin, and
+	// link body origin -> joint frame origin (cB2w)
+	FsJointVectors* jointVectors = getJointVectors(fsData);
+	for(PxU32 i=1;i<linkCount;i++)
+	{
+		// 16-byte aligned temporaries so the aligned load V3LoadA can be used
+		PX_ALIGN(16, PxVec3) parentOffset = poses[i].p - poses[fsData.parent[i]].p;
+		PX_ALIGN(16, PxVec3) jointOffset = jointTransforms[i].cB2w.p - poses[i].p;
+		jointVectors[i].parentOffset = V3LoadA(parentOffset);
+		jointVectors[i].jointOffset = V3LoadA(jointOffset);
+	}
+}
+
+// Per-step setup for one articulation. In order, this
+//  - rebuilds the FsData block from the links (prepareDataBlock),
+//  - builds and factors the LTB matrix and projects the link velocities,
+//  - rebuilds the FS rows, computes the internal (and, when they differ, the
+//    external) joint loads and applies the joint drives,
+//  - applies the per-link accelerations (plus gravity unless disabled on the
+//    link) as impulses over dt,
+//  - copies the resulting velocities to desc.motionVelocity and resets the
+//    solver progress counters,
+//  - creates the joint-limit solver constraints.
+//
+// Returns the value of setupSolverConstraints(), i.e. the number of constraint
+// descriptors written to constraintDesc; acCount receives the number of
+// individual constraint rows.
+PxU32 ArticulationHelper::computeUnconstrainedVelocities(	const ArticulationSolverDesc& desc,
+															PxReal dt,
+															PxcConstraintBlockStream& stream,
+															PxSolverConstraintDesc* constraintDesc,
+															PxU32& acCount,
+															PxsConstraintBlockManager& constraintBlockManager,
+															const PxVec3& gravity, PxU64 contextID)
+{
+	PX_UNUSED(contextID);
+	const ArticulationLink* links = desc.links;
+	PxU16 linkCount = desc.linkCount;
+	FsData& fsData = *desc.fsData;
+	PxTransform* poses = desc.poses;
+
+	// baseInertia and jointTransforms live in the caller-provided scratch memory
+	PxcFsScratchAllocator allocator(desc.scratchMemory, desc.scratchMemorySize);
+	FsInertia*						PX_RESTRICT baseInertia = allocator.alloc<FsInertia>(desc.linkCount);
+	ArticulationJointTransforms*	PX_RESTRICT jointTransforms = allocator.alloc<ArticulationJointTransforms>(desc.linkCount);
+
+	{
+		PX_PROFILE_ZONE("Articulations.prepareDataBlock", contextID);
+		prepareDataBlock(fsData, links, linkCount, poses, baseInertia, jointTransforms, desc.totalDataSize);
+	}
+
+	const PxReal recipDt = 1.0f/dt;
+
+	Cm::SpatialVectorV* velocity = getVelocity(fsData);
+
+	{
+
+		PX_PROFILE_ZONE("Articulations.setupProject", contextID);
+
+		PxMemZero(getLtbRows(fsData), getLtbDataSize(linkCount));
+		prepareLtbMatrix(fsData, baseInertia, poses, jointTransforms, recipDt);
+
+		PxcLtbFactor(fsData);
+	
+		// right-hand side of the projection: joint velocity error plus the
+		// positional error term jC of each row (entries 1..linkCount-1 only)
+		Vec3V b[DY_ARTICULATION_MAX_SIZE];
+		PxcLtbComputeJv(b, fsData, velocity);
+
+		LtbRow* rows = getLtbRows(fsData);
+		for(PxU32 i=1;i<linkCount;i++)
+			b[i] = V3Add(b[i], rows[i].jC);
+
+		PxcLtbProject(fsData, velocity, b);
+	}
+
+	{
+		PX_PROFILE_ZONE("Articulations.prepareFsData", contextID);
+		PxMemZero(addAddr<void*>(&fsData,fsData.fsDataOffset), getFsDataSize(linkCount));
+		prepareFsData(fsData, links);
+	}
+
+	{
+		PX_PROFILE_ZONE("Articulations.setupDrives", contextID);
+	
+		// the cached loads are cleared unless the top bit of the matching
+		// iteration count is set; the low 16 bits are the iteration count used below
+		if(!(desc.core->externalDriveIterations & 0x80000000))
+			PxMemZero(desc.externalLoads, sizeof(Mat33V) * linkCount);
+
+		if(!(desc.core->internalDriveIterations & 0x80000000))
+			PxMemZero(desc.internalLoads, sizeof(Mat33V) * linkCount);
+
+		PxReal				isf[DY_ARTICULATION_MAX_SIZE], esf[DY_ARTICULATION_MAX_SIZE];			// spring factors
+		Vec3V				drive[DY_ARTICULATION_MAX_SIZE];
+
+		// the external pass is skipped when it would repeat the internal one:
+		// same iteration count and the same compliance on every joint
+		bool externalEqualsInternalCompliance = (desc.core->internalDriveIterations&0xffff) == (desc.core->externalDriveIterations&0xffff);
+		for(PxU32 i=1;i<linkCount;i++)
+		{
+			const ArticulationJointCore& j = *links[i].inboundJoint;
+			isf[i] = (1 + j.damping * dt + j.spring * dt * dt) * getResistance(j.internalCompliance);
+			esf[i] = (1 + j.damping * dt + j.spring * dt * dt) * getResistance(j.externalCompliance);
+
+			externalEqualsInternalCompliance = externalEqualsInternalCompliance && j.internalCompliance == j.externalCompliance;
+		}
+
+		{
+			PX_PROFILE_ZONE("Articulations.jointInternalLoads", contextID);
+			PxcFsComputeJointLoadsSimd(fsData, baseInertia, desc.internalLoads, isf, linkCount, desc.core->internalDriveIterations&0xffff, allocator);
+			
+		}
+
+		{
+			PX_PROFILE_ZONE("Articulations.propagateDrivenInertia", contextID);
+			PxcFsPropagateDrivenInertiaSimd(fsData, baseInertia, isf, desc.internalLoads, allocator);
+		}
+
+		{
+			PX_PROFILE_ZONE("Articulations.computeJointDrives", contextID);
+			computeJointDrives(fsData, drive, links, poses, jointTransforms, desc.internalLoads, dt);
+		}
+
+		{
+			PX_PROFILE_ZONE("Articulations.applyJointDrives", contextID);
+			PxcFsApplyJointDrives(fsData, drive);
+		}
+
+		if(!externalEqualsInternalCompliance)
+		{
+			{
+				PX_PROFILE_ZONE("Articulations.jointExternalLoads", contextID);
+				PxcFsComputeJointLoadsSimd(fsData, baseInertia, desc.externalLoads, esf, linkCount, desc.core->externalDriveIterations&0xffff, allocator);
+			}
+
+			{
+				PX_PROFILE_ZONE("Articulations.propagateDrivenInertia", contextID);
+				PxcFsPropagateDrivenInertiaSimd(fsData, baseInertia, esf, desc.externalLoads, allocator);
+			}
+		}
+	}
+
+	{
+		PX_PROFILE_ZONE("Articulations.applyExternalImpulses", contextID);
+		Cm::SpatialVectorV	Z[DY_ARTICULATION_MAX_SIZE];
+
+		FloatV h = FLoad(dt);
+
+		const Cm::SpatialVector* acceleration = desc.acceleration;
+
+		const Vec3V vGravity = V3LoadU(gravity);
+
+		for(PxU32 i=0;i<linkCount;i++)
+		{
+			Vec3V linearAccel = V3LoadA(acceleration[i].linear);
+
+			// gravity is added per link unless that body has it disabled
+			if (!(desc.links[i].body->mInternalFlags & PxcRigidBody::eDISABLE_GRAVITY))
+				linearAccel = V3Add(linearAccel, vGravity);
+			Cm::SpatialVectorV a(linearAccel, V3LoadA(acceleration[i].angular));
+			// Z[i] = -(inertia * acceleration) * dt
+			Z[i] = -ArticulationFnsSimd<ArticulationFnsSimdBase>::multiply(baseInertia[i], a) * h;
+		}
+
+		applyImpulses(fsData, Z, getVelocity(fsData));
+	}
+
+	// save off the motion velocity in case there are no constraints with the articulation
+
+	PxMemCopy(desc.motionVelocity, velocity, linkCount*sizeof(Cm::SpatialVectorV));
+
+	// set up for deferred-update solve
+	
+	fsData.dirty = 0;
+
+	// solver progress counters
+	fsData.maxSolverNormalProgress		= 0;
+	fsData.maxSolverFrictionProgress	= 0;
+	fsData.solverProgress				= 0;
+
+
+#if DY_ARTICULATION_DEBUG_VERIFY
+	for(PxU32 i=0;i<linkCount;i++)
+		getRefVelocity(fsData)[i] = getVelocity(fsData)[i];
+#endif
+
+	{
+		PX_PROFILE_ZONE("Articulations.setupConstraints", contextID);
+		return setupSolverConstraints(fsData, desc.solverDataSize, stream, constraintDesc, links, jointTransforms, dt, acCount, constraintBlockManager);
+	}
+}
+
+// Builds a drive cache in fsData: prepares the data block and FS rows from the
+// links, then computes and propagates joint loads using one compliance value
+// for every joint. All temporary arrays come from the supplied scratch memory.
+void ArticulationHelper::initializeDriveCache(	FsData& fsData,
+												PxU16 linkCount,
+												const ArticulationLink* links,
+												PxReal compliance,
+												PxU32 iterations,
+												char* scratchMemory,
+												PxU32 scratchMemorySize)
+{
+	PxcFsScratchAllocator scratch(scratchMemory, scratchMemorySize);
+	FsInertia*						PX_RESTRICT linkInertia = scratch.alloc<FsInertia>(linkCount);
+	ArticulationJointTransforms*	PX_RESTRICT jointFrames = scratch.alloc<ArticulationJointTransforms>(linkCount);
+	PxTransform*					PX_RESTRICT linkPoses = scratch.alloc<PxTransform>(linkCount);
+	Mat33V*							PX_RESTRICT loads = scratch.alloc<Mat33V>(linkCount);
+
+	// expectedSize of 0: no size check inside prepareDataBlock
+	prepareDataBlock(fsData, links, linkCount, linkPoses, linkInertia, jointFrames, 0);
+
+	void* fsSection = addAddr<void*>(&fsData,fsData.fsDataOffset);
+	PxMemZero(fsSection, getFsDataSize(linkCount));
+	prepareFsData(fsData, links);
+
+	// spring factors: zero for the root, the same resistance for every joint
+	PxReal springFactor[DY_ARTICULATION_MAX_SIZE];
+	springFactor[0] = 0.0f;
+	for(PxU32 link=1; link<linkCount; ++link)
+		springFactor[link] = getResistance(compliance);
+
+	// start from zero loads; only the low 16 bits of `iterations` are the count
+	PxMemZero(loads, sizeof(Mat33V)*linkCount);
+	const PxU32 iterationCount = iterations&0xffff;
+	PxcFsComputeJointLoadsSimd(fsData, linkInertia, loads, springFactor, linkCount, iterationCount, scratch);
+	PxcFsPropagateDrivenInertiaSimd(fsData, linkInertia, springFactor, loads, scratch);
+}
+
+// Integrates the link poses over dt using desc.motionVelocity, optionally runs
+// projection iterations that pull separated joints back together, and writes
+// the final poses and velocities back to the links' body cores.
+void ArticulationHelper::updateBodies(const ArticulationSolverDesc& desc, PxReal dt)
+{
+	FsData& fsData = *desc.fsData;
+	const ArticulationCore& core = *desc.core;
+	const ArticulationLink* links = desc.links;
+	PxTransform* poses = desc.poses;
+	Cm::SpatialVectorV* motionVelocity = desc.motionVelocity;
+
+	Vec3V b[DY_ARTICULATION_MAX_SIZE];
+	
+	PxU32 linkCount = fsData.linkCount;
+
+	// flush the velocity and project it, as saveVelocity() does
+	PxcFsFlushVelocity(fsData);
+	PxcLtbComputeJv(b, fsData, getVelocity(fsData));
+	PxcLtbProject(fsData, getVelocity(fsData), b);
+
+	// update positions
+	PxcFsScratchAllocator allocator(desc.scratchMemory, desc.scratchMemorySize);
+	PxTransform*		PX_RESTRICT oldPose = allocator.alloc<PxTransform>(desc.linkCount);
+
+	for(PxU32 i=0;i<linkCount;i++)
+	{
+		const PxVec3& lv = reinterpret_cast<PxVec3&>(motionVelocity[i].linear);
+		const PxVec3& av = reinterpret_cast<PxVec3&>(motionVelocity[i].angular);
+		// the pre-integration pose is kept so that the motion velocity can be
+		// recomputed if projection moves the links
+		oldPose[i] = poses[i];
+		poses[i] = PxTransform(poses[i].p + lv * dt, Ps::exp(av*dt) * poses[i].q);
+	}
+
+	bool projected = false;
+	const PxReal recipDt = 1.0f/dt;
+
+	FsInertia*						PX_RESTRICT baseInertia = allocator.alloc<FsInertia>(desc.linkCount);
+	ArticulationJointTransforms*	PX_RESTRICT jointTransforms = allocator.alloc<ArticulationJointTransforms>(desc.linkCount);
+
+	for(PxU32 iterations = 0; iterations < core.maxProjectionIterations; iterations++)
+	{
+		// largest distance between the parent-side and child-side joint anchors
+		PxReal maxSeparation = -PX_MAX_F32;
+		for(PxU32 i=1;i<linkCount;i++)
+		{
+			const ArticulationJointCore& j = *links[i].inboundJoint;
+			maxSeparation = PxMax(maxSeparation,
+								  (poses[links[i].parent].transform(j.parentPose).p -
+								   poses[i].transform(j.childPose).p).magnitude());
+		}
+
+		// all joints are within tolerance: no (further) projection needed
+		if(maxSeparation<=core.separationTolerance)
+			break;
+
+		projected = true;
+
+		// we go around again, finding velocities which pull us back together - this
+		// form of projection is momentum-preserving but slow compared to hierarchical
+		// projection
+
+		PxMemZero(baseInertia, sizeof(FsInertia)*linkCount);
+
+		ArticulationHelper::setInertia(baseInertia[0], *links[0].bodyCore, poses[0]);
+		for(PxU32 i=1;i<linkCount;i++)
+		{
+			ArticulationHelper::setInertia(baseInertia[i], *links[i].bodyCore, poses[i]);
+			ArticulationHelper::setJointTransforms(jointTransforms[i], poses[links[i].parent], poses[i], *links[i].inboundJoint);
+		}
+
+		ArticulationHelper::prepareLtbMatrix(fsData, baseInertia, poses, jointTransforms, recipDt);
+		PxcLtbFactor(fsData);
+
+		LtbRow* rows = getLtbRows(fsData);
+
+		// right-hand side is the positional error term only
+		for(PxU32 i=1;i<linkCount;i++)
+			b[i] = rows[i].jC;
+
+		// motionVelocity is reused as the buffer for the corrective velocities
+		PxMemZero(motionVelocity, linkCount*sizeof(Cm::SpatialVectorV));
+
+		PxcLtbProject(fsData, motionVelocity, b);
+
+		for(PxU32 i=0;i<linkCount;i++)
+		{
+			const PxVec3& lv = reinterpret_cast<PxVec3&>(motionVelocity[i].linear);
+			const PxVec3& av = reinterpret_cast<PxVec3&>(motionVelocity[i].angular);
+			poses[i] = PxTransform(poses[i].p + lv * dt,  Ps::exp(av*dt) * poses[i].q);
+		}
+	}
+
+	if(projected)
+	{
+		// recompute motion velocities.
+		// (the buffer was overwritten above, so rebuild it from the total pose change)
+		for(PxU32 i=0;i<linkCount;i++)
+		{
+			motionVelocity[i].linear = V3LoadU((poses[i].p - oldPose[i].p) * recipDt);
+			motionVelocity[i].angular = V3LoadU(Ps::log(poses[i].q * oldPose[i].q.getConjugate()) * recipDt);
+		}
+	}
+
+	// write the results back to the body cores: the pose comes from `poses`,
+	// the velocities from the articulation's velocity array
+	Cm::SpatialVectorV* velocity = getVelocity(fsData);
+	for(PxU32 i=0;i<linkCount;i++)
+	{
+		links[i].bodyCore->body2World = poses[i];
+
+		V3StoreA(velocity[i].linear,  links[i].bodyCore->linearVelocity);
+		V3StoreA(velocity[i].angular, links[i].bodyCore->angularVelocity);
+	}
+}
+
+// Writes the mass onto the diagonal of inertia.ll and the world-space inertia
+// tensor of `body` (rotated by pose.q) into inertia.aa.
+void ArticulationHelper::setInertia(FsInertia& inertia,
+									const PxsBodyCore& body,
+									const PxTransform& pose)
+{
+	// assumes that elements that are supposed to be zero (i.e. la matrix and off diagonal elements of ll) are zero
+
+	// linear block: mass on the diagonal
+	const PxReal mass = 1.0f/body.inverseMass;
+	V3WriteX(inertia.ll.col0, mass);
+	V3WriteY(inertia.ll.col1, mass);
+	V3WriteZ(inertia.ll.col2, mass);
+
+	// angular block: R * diag(1/inverseInertia) * R^T
+	const PxVec3& invInertia = body.inverseInertia;
+	const PxMat33 rotation(pose.q);
+	const PxMat33 bodySpaceInertia = PxMat33::createDiagonal(PxVec3(1.0f/invInertia.x, 1.0f/invInertia.y, 1.0f/invInertia.z));
+
+	PX_ALIGN_PREFIX(16) PxMat33 PX_ALIGN_SUFFIX(16) alignedInertia = rotation * bodySpaceInertia * rotation.getTranspose();
+	// average with the transpose so the stored tensor is exactly symmetric
+	alignedInertia = (alignedInertia + alignedInertia.getTranspose())*0.5f;
+	inertia.aa = Mat33V_From_PxMat33(alignedInertia);
+}
+
+// Fills `transforms` with the world-space joint frames on the parent side
+// (cA2w) and on the child side (cB2w), and with the child frame expressed
+// relative to the parent frame (cB2cA).
+void ArticulationHelper::setJointTransforms(ArticulationJointTransforms& transforms,
+											const PxTransform& parentPose,
+											const PxTransform& childPose,
+											const ArticulationJointCore& joint)
+{
+	transforms.cA2w = parentPose.transform(joint.parentPose);
+	transforms.cB2w = childPose.transform(joint.childPose);
+	transforms.cB2cA = transforms.cA2w.transformInv(transforms.cB2w);
+
+	// the relative quat must be the short way round for limits to work...
+	if(!(transforms.cB2cA.q.w<0))
+		return;
+
+	// ...so flip the sign of both quaternions when it is not
+	transforms.cB2cA.q	= -transforms.cB2cA.q;
+	transforms.cB2w.q	= -transforms.cB2w.q;
+}
+
+// Fills the LTB rows of fsData: every row gets its link's base inertia, and
+// every non-root row additionally gets the two joint Jacobians (j0 for the
+// parent, j1 for the link) and the positional error term jC.
+void ArticulationHelper::prepareLtbMatrix(	FsData& fsData,
+											const FsInertia* baseInertia,
+											const PxTransform* poses,
+											const ArticulationJointTransforms* jointTransforms,
+											PxReal recipDt)
+{
+	const PxVec3 unitAxes[3] = { PxVec3(1.0f,0.0f,0.0f), PxVec3(0.0f,1.0f,0.0f), PxVec3(0.0f,0.0f,1.0f) };
+
+	LtbRow* ltbRows = getLtbRows(fsData);
+	const PxU32 numLinks = fsData.linkCount;
+
+	// the root row only carries inertia
+	ltbRows[0].inertia = baseInertia[0];
+
+	for(PxU32 link=1; link<numLinks; ++link)
+	{
+		LtbRow& row = ltbRows[link];
+		const ArticulationJointTransforms& frames = jointTransforms[link];
+		const PxU32 parentLink = fsData.parent[link];
+
+		row.inertia = baseInertia[link];
+
+		// we put the action point of the constraint at the root of the child
+		const PxVec3 parentArm = frames.cB2w.p - poses[parentLink].p;
+		const PxVec3 childArm = frames.cB2w.p - poses[link].p;
+
+		// A bit different from the 1D solver, 
+		// there we use a formulation	j0.v0 - j1.v1 + c = 0
+		// here we use the homogeneous	j0.v0 + j1.v1 + c = 0
+		for(PxU32 k=0; k<3; ++k)
+		{
+			const PxVec3 n = unitAxes[k];
+			row.j0[k] = Cm::SpatialVector(n, parentArm.cross(n));
+			row.j1[k] = Cm::SpatialVector(-n, -childArm.cross(n));
+		}
+
+		// 99% of the anchor separation, turned into a velocity
+		const PxVec3 positionError = (frames.cA2w.p - frames.cB2w.p) * 0.99f;
+		row.jC = V3LoadU(positionError*recipDt);
+	}
+}
+
+// Initialises the FS rows of fsData from the link topology: copies `children`
+// and `pathToRoot` for every link and, for every non-root link, sets the three
+// joint motion axes aux[i].S.
+//
+// fsData - block whose FS rows and aux rows are written; its joint vectors
+//          must already be filled in.
+// links  - the articulation's links, fsData.linkCount entries.
+void ArticulationHelper::prepareFsData(FsData& fsData, const ArticulationLink* links)
+{
+	typedef ArticulationFnsSimd<ArticulationFnsSimdBase> Fns;
+
+	PxU32 linkCount = fsData.linkCount;
+	FsRow* rows = getFsRows(fsData);
+	FsRowAux* aux = getAux(fsData);
+	const FsJointVectors* jointVectors = getJointVectors(fsData);
+
+	// the root's path to the root is just bit 0
+	rows[0].children = links[0].children;
+	rows[0].pathToRoot = 1;
+
+	// unit axes stored as 16-byte aligned PxVec4s so they can be read as Vec3V
+	PX_ALIGN_PREFIX(16) PxVec4 v[] PX_ALIGN_SUFFIX(16) = { PxVec4(1.f,0,0,0), PxVec4(0,1.f,0,0), PxVec4(0,0,1.f,0) } ;
+	const Vec3V* axes = reinterpret_cast<const Vec3V*>(v);
+
+	for(PxU32 i=1;i<linkCount;i++)
+	{
+		FsRow& r = rows[i];
+		FsRowAux& a = aux[i];
+
+		r.children = links[i].children;
+		r.pathToRoot = links[i].pathToRoot;
+
+		const Vec3V jointOffset = jointVectors[i].jointOffset;
+
+		// the joint coords are world oriented, located at the joint.
+		for(PxU32 k=0;k<3;k++)
+			a.S[k] = Fns::translateMotion(jointOffset, Cm::SpatialVectorV(V3Zero(), axes[k]));
+	}
+}
+
+// Returns the reciprocal of `compliance`; compliance is asserted to be positive.
+PX_FORCE_INLINE PxReal ArticulationHelper::getResistance(PxReal compliance)
+{
+	PX_ASSERT(compliance>0);
+	const PxReal resistance = 1.0f/compliance;
+	return resistance;
+}
+
+// Fills `s` with an angular limit row about `axis` between link `linkIndex`
+// and its parent. `err` is the limit error and recipDt the inverse time step.
+// The row is scaled by the articulation's response to a unit impulse pair
+// about the axis; a non-positive response leaves the multipliers at zero.
+void ArticulationHelper::createHardLimit(	const FsData& fsData,
+											const ArticulationLink* links,
+											PxU32 linkIndex,
+											SolverConstraint1DExt& s, 
+											const PxVec3& axis, 
+											PxReal err,
+											PxReal recipDt)
+{
+	init(s, PxVec3(0), PxVec3(0), axis, axis, 0, PX_MAX_F32);
+
+	// equal and opposite angular unit impulses on the parent and on the link
+	const PxU32 parentIndex = links[linkIndex].parent;
+	ArticulationHelper::getImpulseSelfResponse(fsData,
+												  parentIndex, Cm::SpatialVector(PxVec3(0), axis), s.deltaVA,
+												  linkIndex, Cm::SpatialVector(PxVec3(0), -axis), s.deltaVB);
+
+	const PxVec3& angularResponseA = reinterpret_cast<PxVec3&>(s.deltaVA.angular);
+	const PxVec3& angularResponseB = reinterpret_cast<PxVec3&>(s.deltaVB.angular);
+	const PxReal unitResponse = axis.dot(angularResponseA) - axis.dot(angularResponseB);
+	if(unitResponse<0.0f)
+		Ps::getFoundation().error(PxErrorCode::eDEBUG_WARNING, __FILE__, __LINE__, "Warning: articulation ill-conditioned or under severe stress, joint limit ignored");
+
+	PxReal recipResponse = 0.0f;
+	if(unitResponse>0.0f)
+		recipResponse = 1.0f/unitResponse;
+
+	s.constant = recipResponse * -err * recipDt;
+	// the unbiased constant keeps the bias only for positive error
+	if(err>0.0f)
+		s.unbiasedConstant = s.constant;
+	else
+		s.unbiasedConstant = 0.0f;
+	s.velMultiplier = -recipResponse;
+	s.impulseMultiplier = 1.0f;
+}
+
+// Fills `s` with an angular spring row about `axis` between link `linkIndex`
+// and its parent, using the given stiffness and damping over time step dt.
+// Both the geometric error and the velocity target of the spring are zero.
+void ArticulationHelper::createTangentialSpring(const FsData& fsData,
+												const ArticulationLink* links,
+												PxU32 linkIndex,
+												SolverConstraint1DExt& s, 
+												const PxVec3& axis, 
+												PxReal stiffness,
+												PxReal damping,
+												PxReal dt)
+{
+	init(s, PxVec3(0), PxVec3(0), axis, axis, -PX_MAX_F32, PX_MAX_F32);
+
+	// equal and opposite angular unit impulses on the parent and on the link
+	Cm::SpatialVector unitImpulse(PxVec3(0), axis);
+	const PxU32 parentIndex = links[linkIndex].parent;
+	getImpulseSelfResponse(fsData, parentIndex, unitImpulse, s.deltaVA, linkIndex, -unitImpulse, s.deltaVB);
+
+	const PxVec3& angularResponseA = reinterpret_cast<PxVec3&>(s.deltaVA.angular);
+	const PxVec3& angularResponseB = reinterpret_cast<PxVec3&>(s.deltaVB.angular);
+	const PxReal unitResponse = axis.dot(angularResponseA) - axis.dot(angularResponseB);
+	if(unitResponse<0.0f)
+		Ps::getFoundation().error(PxErrorCode::eDEBUG_WARNING, __FILE__, __LINE__, "Warning: articulation ill-conditioned or under severe stress, tangential spring ignored");
+
+	PxReal recipResponse = 0.0f;
+	if(unitResponse>0.0f)
+		recipResponse = 1.0f/unitResponse;
+
+	// this is a specialization of the spring code in setSolverConstants() for acceleration springs.
+	// general case is  b = dt * (c.mods.spring.damping * c.velocityTarget - c.mods.spring.stiffness * geomError);
+	// but geomError and velocityTarget are both zero
+
+	const PxReal springTerm = dt * dt * stiffness + dt * damping;
+	const PxReal scale = 1.0f/(1.0f+springTerm);
+	s.unbiasedConstant = 0.0f;
+	s.constant = 0.0f;
+	s.velMultiplier = -scale * recipResponse * springTerm;
+	s.impulseMultiplier = 1.0f - scale;
+}
+
+// Creates the 1D solver constraints for the swing and twist limits of the
+// articulation's joints.
+//
+// For every non-root link whose inbound joint has a limit that is both enabled
+// and currently active, one PxSolverConstraintDesc is filled in (both sides
+// reference fsData, with the parent link as A and the link itself as B) and a
+// constraint block consisting of one SolverConstraint1DHeader followed by up
+// to four SolverConstraint1DExt rows is reserved from `stream`.
+//
+// acCount receives the total number of rows created; the return value is the
+// number of descriptors written to constraintDesc.
+PxU32 ArticulationHelper::setupSolverConstraints(	FsData& fsData, PxU32 solverDataSize,
+													PxcConstraintBlockStream& stream,
+													PxSolverConstraintDesc* constraintDesc,
+													const ArticulationLink* links,
+													const ArticulationJointTransforms* jointTransforms,
+													PxReal dt,
+													PxU32& acCount,
+													PxsConstraintBlockManager& constraintBlockManager)
+{
+	acCount = 0;
+
+	const PxU16 linkCount = fsData.linkCount;
+	PxU32 descCount = 0;
+	const PxReal recipDt = 1.0f/dt;
+
+	const PxConstraintInvMassScale ims(1.0f, 1.0f, 1.0f, 1.0f);
+
+	for(PxU16 i=1;i<linkCount;i++)
+	{
+		const ArticulationJointCore& j = *links[i].inboundJoint;
+
+		// prefetch the joint data of the next link
+		if(i+1<linkCount)
+		{
+			Ps::prefetch(links[i+1].inboundJoint, sizeof (ArticulationJointCore));
+			Ps::prefetch(&jointTransforms[i+1], sizeof(ArticulationJointTransforms));
+		}
+		
+		// nothing to do for joints without any limit enabled
+		if(!(j.twistLimited || j.swingLimited))
+			continue;
+
+		PxQuat swing, twist;
+		Ps::separateSwingTwist(jointTransforms[i].cB2cA.q, swing, twist);
+	
+		Cm::ConeLimitHelper eh(j.tanQSwingY, j.tanQSwingZ, j.tanQSwingPad);
+		PxVec3 swingLimitAxis;
+		PxReal swingLimitError = 0.0f;
+
+		// getLimit() is only called when the swing limit is enabled
+		const bool swingLimited = j.swingLimited && eh.getLimit(swing, swingLimitAxis, swingLimitError);
+		const bool tangentialStiffness = swingLimited && (j.tangentialStiffness>0 || j.tangentialDamping>0);
+
+		// the twist axis is the x axis of the child-side joint frame
+		const PxVec3 twistAxis = jointTransforms[i].cB2w.rotate(PxVec3(1.0f,0,0));
+		const PxReal tqTwistAngle = Ps::tanHalf(twist.x, twist.w);
+
+		// the limits become active once the twist is within the pad of the bound
+		const bool twistLowerLimited = j.twistLimited && tqTwistAngle < Cm::tanAdd(j.tanQTwistLow, j.tanQTwistPad);
+		const bool twistUpperLimited = j.twistLimited && tqTwistAngle > Cm::tanAdd(j.tanQTwistHigh, -j.tanQTwistPad);
+	
+		// one row per active limit, plus one for the tangential spring
+		const PxU8 constraintCount = PxU8(swingLimited + tangentialStiffness + twistUpperLimited + twistLowerLimited);
+		if(!constraintCount)
+			continue;
+
+		PxSolverConstraintDesc& desc = constraintDesc[descCount++];
+
+		desc.articulationA = &fsData;
+		desc.linkIndexA = Ps::to16(links[i].parent);
+		desc.articulationALength = Ps::to16(solverDataSize);
+
+		desc.articulationB = &fsData;
+		desc.linkIndexB = i;
+		desc.articulationBLength = Ps::to16(solverDataSize);
+
+		const PxU32 constraintLength = sizeof(SolverConstraint1DHeader) + 
+								 sizeof(SolverConstraint1DExt) * constraintCount;
+
+		PX_ASSERT(0==(constraintLength & 0x0f));
+		desc.constraintLengthOver16 = Ps::to16(constraintLength/16);
+		
+		// 16 extra bytes are requested beyond the constraint data itself
+		// NOTE(review): the pointer returned by reserve() is used without a
+		// null check - confirm reserve() cannot fail here
+		desc.constraint = stream.reserve(constraintLength + 16u, constraintBlockManager);
+
+		desc.writeBack = NULL;
+		
+		SolverConstraint1DHeader* header = reinterpret_cast<SolverConstraint1DHeader*>(desc.constraint);
+		SolverConstraint1DExt* constraints = reinterpret_cast<SolverConstraint1DExt*>(desc.constraint + sizeof(SolverConstraint1DHeader));
+
+		init(*header, constraintCount, true, ims);
+
+		PxU32 cIndex = 0;
+
+		if(swingLimited)
+		{
+			// the swing limit axis is rotated by the parent-side joint frame
+			const PxVec3 normal = jointTransforms[i].cA2w.rotate(swingLimitAxis);
+			createHardLimit(fsData, links, i, constraints[cIndex++], normal, swingLimitError, recipDt);
+			if(tangentialStiffness)
+			{
+				const PxVec3 tangent = twistAxis.cross(normal).getNormalized();
+				createTangentialSpring(fsData, links, i, constraints[cIndex++], tangent, j.tangentialStiffness, j.tangentialDamping, dt);
+			}
+		}
+
+		if(twistUpperLimited)
+			createHardLimit(fsData, links, i, constraints[cIndex++], twistAxis, (j.tanQTwistHigh - tqTwistAngle)*4, recipDt);
+
+		if(twistLowerLimited)
+			createHardLimit(fsData, links, i, constraints[cIndex++], -twistAxis, -(j.tanQTwistLow - tqTwistAngle)*4, recipDt);
+
+		// writes a zero at offset getConstraintLength(desc) from the start of
+		// the block (presumably just past the rows, inside the extra 16 bytes
+		// reserved above - TODO confirm)
+		*(desc.constraint + getConstraintLength(desc)) = 0;
+
+		PX_ASSERT(cIndex == constraintCount);
+		acCount += constraintCount;
+	}
+
+	return descCount;
+}
+
+// Computes the drive vector of every non-root link and stores it in drives[i].
+// The drive is the joint's load matrix applied to the spring/damper term
+// (spring * position error + damping * velocity error), scaled by dt and by
+// the resistance of the joint's internal compliance. drives[0] is not written.
+void ArticulationHelper::computeJointDrives(FsData& fsData,
+											Vec3V* drives, 
+											const ArticulationLink* links,
+											const PxTransform* poses, 
+											const ArticulationJointTransforms* transforms, 
+											const Mat33V* loads, 
+											PxReal dt)
+{
+	typedef ArticulationFnsScalar Fns;
+
+	const Cm::SpatialVector* velocity = reinterpret_cast<const Cm::SpatialVector*>(getVelocity(fsData));
+	const PxU32 linkCount = fsData.linkCount;
+
+	for(PxU32 link=1; link<linkCount; ++link)
+	{
+		const ArticulationJointCore& joint = *links[link].inboundJoint;
+		const ArticulationJointTransforms& frames = transforms[link];
+		const PxU32 parentLink = links[link].parent;
+
+		// velocity of the link relative to its parent, both taken at the
+		// parent-side joint frame
+		const Cm::SpatialVector currentVel = Fns::translateMotion(poses[link].p - frames.cA2w.p, velocity[link])
+											 - Fns::translateMotion(poses[parentLink].p - frames.cA2w.p, velocity[parentLink]);
+
+		// we want the quat such that q * cB2cA = targetPosition
+		PxVec3 rotVec;
+		if(joint.driveType != PxU8(PxArticulationJointDriveType::eTARGET))
+			rotVec = joint.targetPosition.getImaginaryPart();
+		else
+			rotVec = Ps::log(joint.targetPosition * frames.cB2cA.q.getConjugate()); // as a rotation vector
+
+		// NM's Tests indicate behavior is better without the term commented out below, even though
+		// an implicit spring derivation suggests it should be there.
+
+		const PxVec3 posError = frames.cA2w.rotate(rotVec); // - currentVel.angular * 0.5f * dt
+		const PxVec3 velError = frames.cA2w.rotate(joint.targetVelocity) - currentVel.angular;
+
+		const PxVec3 springDamper = joint.spring * posError + joint.damping * velError;
+		drives[link] = M33MulV3(loads[link], V3LoadU(springDamper * dt * getResistance(joint.internalCompliance)));
+	}
+}
+
+// Definitions of ArticulationPImpl's static function pointers. All three start
+// out as NULL; nothing in this file assigns them.
+ArticulationPImpl::ComputeUnconstrainedVelocitiesFn ArticulationPImpl::sComputeUnconstrainedVelocities = NULL;
+ArticulationPImpl::UpdateBodiesFn ArticulationPImpl::sUpdateBodies = NULL;
+ArticulationPImpl::SaveVelocityFn ArticulationPImpl::sSaveVelocity = NULL;
+
+}
+}
diff --git a/PhysX_3.4/Source/LowLevelDynamics/src/DyArticulationHelper.h b/PhysX_3.4/Source/LowLevelDynamics/src/DyArticulationHelper.h
new file mode 100644
index 00000000..1c2b28b7
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelDynamics/src/DyArticulationHelper.h
@@ -0,0 +1,192 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+#ifndef DY_ARTICULATION_HELPER_H
+#define DY_ARTICULATION_HELPER_H
+
+
+#include "DyArticulation.h"
+
+namespace physx
+{
+struct PxsBodyCore;
+
+class PxcConstraintBlockStream;
+class PxcRigidBody;
+class PxsConstraintBlockManager;
+struct PxSolverConstraintDesc;
+
+namespace Dy
+{
+	struct FsInertia;
+	struct SolverConstraint1DExt;
+	struct ArticulationJointCore;
+	struct ArticulationSolverDesc;
+
+
+struct ArticulationJointTransforms	// the three frames kept per articulation joint; passed by non-const reference to ArticulationHelper::setJointTransforms
+{
+	PxTransform		cA2w;				// joint parent frame in world space 
+	PxTransform		cB2w;				// joint child frame in world space
+	PxTransform		cB2cA;				// joint child frame relative to the joint parent frame (cB -> cA); NOTE(review): previously described as "in world space", which the name and the drive code (q * cB2cA = targetPosition) contradict - confirm against setJointTransforms
+};
+
+class ArticulationHelper	// all members are static; only the scalar getImpulseResponse overload is defined in this header
+{
+public:	// computeUnconstrainedVelocities, updateBodies and saveVelocity have the same signatures as the ArticulationPImpl function-pointer typedefs
+	static PxU32	computeUnconstrainedVelocities(const ArticulationSolverDesc& desc,
+												   PxReal dt,
+												   PxcConstraintBlockStream& stream,
+												   PxSolverConstraintDesc* constraintDesc,
+												   PxU32& acCount,
+												   PxsConstraintBlockManager& constraintBlockManager,
+												   const PxVec3& gravity, PxU64 contextID);
+
+	static void		updateBodies(const ArticulationSolverDesc& desc,
+							 	 PxReal dt);
+
+	// SIMD-typed overload: deltaV is the only non-const argument, so the result is returned through it.
+	static void		getImpulseResponse(const FsData& matrix, 
+									   PxU32 linkID, 
+									   const Cm::SpatialVectorV& impulse,
+									   Cm::SpatialVectorV& deltaV);
+
+	// Scalar-typed overload: reinterprets both arguments as Cm::SpatialVectorV and forwards to the overload above; assumes the two types share a layout.
+	static PX_FORCE_INLINE 
+			void	getImpulseResponse(const FsData& matrix, 
+									   PxU32 linkID, 
+									   const Cm::SpatialVector& impulse,
+									   Cm::SpatialVector& deltaV)
+	{
+		getImpulseResponse(matrix, linkID, reinterpret_cast<const Cm::SpatialVectorV&>(impulse), reinterpret_cast<Cm::SpatialVectorV&>(deltaV));
+	}
+	// Two-link variant: one impulse input and one deltaV output per link (linkID0 / linkID1).
+	static void		getImpulseSelfResponse(const FsData& matrix, 
+										   PxU32 linkID0, 
+										   const Cm::SpatialVectorV& impulse0,
+										   Cm::SpatialVectorV& deltaV0,
+										   PxU32 linkID1,
+										   const Cm::SpatialVectorV& impulse1,
+										   Cm::SpatialVectorV& deltaV1);
+
+	static void		flushVelocity(FsData& matrix);
+
+	static void		saveVelocity(const ArticulationSolverDesc& m);
+	// The three sizes are returned through the reference parameters solverDataSize, totalSize and scratchSize.
+	static void		getDataSizes(PxU32 linkCount, PxU32 &solverDataSize, PxU32& totalSize, PxU32& scratchSize);
+	// The caller supplies the scratch buffer and its size. NOTE(review): presumably fills 'data' as a drive cache - confirm in the .cpp.
+	static void		initializeDriveCache(FsData &data,
+										 PxU16 linkCount,
+										 const ArticulationLink* links,
+										 PxReal compliance,
+										 PxU32 iterations,
+										 char* scratchMemory,
+										 PxU32 scratchMemorySize);
+
+	static PxU32	getDriveCacheLinkCount(const FsData& cache);
+
+	static void		applyImpulses(const FsData& matrix,
+								  Cm::SpatialVectorV* Z,
+								  Cm::SpatialVectorV* V);
+
+private:
+	static PxU32	getLtbDataSize(PxU32 linkCount);
+	static PxU32	getFsDataSize(PxU32 linkCount);
+
+	static void		prepareDataBlock(FsData& fsData,
+									 const ArticulationLink* links,
+									 PxU16 linkCount,	
+									 PxTransform* poses,
+								 	 FsInertia *baseInertia,
+									 ArticulationJointTransforms* jointTransforms,
+									 PxU32 expectedSize);
+
+	static void		setInertia(FsInertia& inertia,
+							   const PxsBodyCore& body,
+							   const PxTransform& pose);
+	// 'transforms' is the only non-const argument, so the joint frames are returned through it.
+	static void		setJointTransforms(ArticulationJointTransforms& transforms,
+									   const PxTransform& parentPose,
+									   const PxTransform& childPose,
+									   const ArticulationJointCore& joint);
+
+	static void		prepareLtbMatrix(FsData& fsData,
+									 const FsInertia* baseInertia,
+									 const PxTransform* poses,
+									 const ArticulationJointTransforms* jointTransforms,
+									 PxReal recipDt);
+
+	static void		prepareFsData(FsData& fsData,
+								  const ArticulationLink* links);
+	// NOTE(review): declared PX_FORCE_INLINE but not defined in this header; the definition has to be visible wherever it is called (computeJointDrives calls it).
+	static PX_FORCE_INLINE PxReal getResistance(PxReal compliance);
+
+
+	static void		createHardLimit(const FsData& fsData,
+									const ArticulationLink* links,
+									PxU32 linkIndex,
+									SolverConstraint1DExt& s, 
+									const PxVec3& axis, 
+									PxReal err,
+									PxReal recipDt);
+
+	static void		createTangentialSpring(const FsData& fsData,
+										   const ArticulationLink* links,
+										   PxU32 linkIndex,
+										   SolverConstraint1DExt& s, 
+										   const PxVec3& axis, 
+										   PxReal stiffness,
+										   PxReal damping,
+										   PxReal dt);
+
+	static PxU32 setupSolverConstraints(FsData& fsData, PxU32 solverDataSize,
+													PxcConstraintBlockStream& stream,
+													PxSolverConstraintDesc* constraintDesc,
+													const ArticulationLink* links,
+													const ArticulationJointTransforms* jointTransforms,
+													PxReal dt,
+													PxU32& acCount,
+													PxsConstraintBlockManager& constraintBlockManager);
+	// The definition writes drives[i] for every link i >= 1 (spring/damping drive impulse scaled by loads[i]); drives[0] is not written.
+	static void		computeJointDrives(FsData& fsData,
+									   Ps::aos::Vec3V* drives, 
+									   const ArticulationLink* links,
+									   const PxTransform* poses, 
+									   const ArticulationJointTransforms* transforms, 
+									   const Ps::aos::Mat33V* loads, 
+									   PxReal dt);
+
+};
+
+}
+
+}
+
+#endif //DY_ARTICULATION_HELPER_H
diff --git a/PhysX_3.4/Source/LowLevelDynamics/src/DyArticulationPImpl.h b/PhysX_3.4/Source/LowLevelDynamics/src/DyArticulationPImpl.h
new file mode 100644
index 00000000..e73cc373
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelDynamics/src/DyArticulationPImpl.h
@@ -0,0 +1,108 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+
+#ifndef DY_ARTICULATION_INTERFACE_H
+#define DY_ARTICULATION_INTERFACE_H
+
+#include "DyArticulationUtils.h"
+
+namespace physx
+{
+
+class PxcConstraintBlockStream;
+class PxcScratchAllocator;
+class PxsConstraintBlockManager;
+struct PxSolverConstraintDesc;
+
+namespace Dy
+{
+	
+	struct ArticulationSolverDesc;
+
+
+class ArticulationPImpl
+{
+public:
+	// Signatures of the three articulation entry points that can be installed at run time.
+	typedef PxU32 (*ComputeUnconstrainedVelocitiesFn)(const ArticulationSolverDesc& desc, PxReal dt,
+													 PxcConstraintBlockStream& stream,
+													 PxSolverConstraintDesc* constraintDesc,
+													 PxU32& acCount,
+													 PxsConstraintBlockManager& constraintBlockManager,
+													 const PxVec3& gravity, PxU64 contextID);
+	typedef void (*UpdateBodiesFn)(const ArticulationSolverDesc& desc, PxReal dt);
+	typedef void (*SaveVelocityFn)(const ArticulationSolverDesc &m);
+
+	// Installed entry points; each wrapper below asserts that its pointer is set before using it.
+	static ComputeUnconstrainedVelocitiesFn sComputeUnconstrainedVelocities;
+	static UpdateBodiesFn sUpdateBodies;
+	static SaveVelocityFn sSaveVelocity;
+
+	// Forwards to sComputeUnconstrainedVelocities (the scratch allocator is not passed on); yields 0 when no entry point is installed.
+	static PxU32 computeUnconstrainedVelocities(const ArticulationSolverDesc& desc,
+										   PxReal dt,
+										   PxcConstraintBlockStream& stream,
+										   PxSolverConstraintDesc* constraintDesc,
+										   PxU32& acCount,
+										   PxcScratchAllocator&,
+										   PxsConstraintBlockManager& constraintBlockManager,
+										   const PxVec3& gravity, PxU64 contextID)
+	{
+		PX_ASSERT(sComputeUnconstrainedVelocities);
+		if(!sComputeUnconstrainedVelocities)
+			return 0;
+		return (*sComputeUnconstrainedVelocities)(desc, dt, stream, constraintDesc, acCount, constraintBlockManager, gravity, contextID);
+	}
+
+	// Forwards to sUpdateBodies; does nothing when no entry point is installed.
+	static void	updateBodies(const ArticulationSolverDesc& desc, PxReal dt)
+	{
+		PX_ASSERT(sUpdateBodies);
+		if(!sUpdateBodies)
+			return;
+		(*sUpdateBodies)(desc, dt);
+	}
+
+	// Forwards to sSaveVelocity; does nothing when no entry point is installed.
+	static void	saveVelocity(const ArticulationSolverDesc& desc)
+	{
+		PX_ASSERT(sSaveVelocity);
+		if(!sSaveVelocity)
+			return;
+		(*sSaveVelocity)(desc);
+	}
+};
+
+
+}
+}
+#endif //DY_ARTICULATION_INTERFACE_H
+
diff --git a/PhysX_3.4/Source/LowLevelDynamics/src/DyArticulationReference.h b/PhysX_3.4/Source/LowLevelDynamics/src/DyArticulationReference.h
new file mode 100644
index 00000000..ff4d0d6e
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelDynamics/src/DyArticulationReference.h
@@ -0,0 +1,92 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+
+#ifndef DY_ARTICULATION_REFERENCE_H
+#define DY_ARTICULATION_REFERENCE_H
+
+// a per-row struct where we put extra data for debug and setup - ultimately this will move to be just
+// debug only
+
+
+
+#include "DyArticulationUtils.h"
+#include "DyArticulationScalar.h"
+#include "DyArticulationFnsScalar.h"
+#include "DySpatial.h"
+
+#if DY_ARTICULATION_DEBUG_VERIFY
+
+namespace physx
+{
+
+// Scalar velocity propagation across one joint row; the result is checked against ArticulationRef::propagateVelocity.
+PX_FORCE_INLINE Cm::SpatialVector propagateVelocity(const FsRow& row, const FsJointVectors& jv,
+													const PxVec3& SZ, const Cm::SpatialVector& v,
+													const FsRowAux& aux)
+{
+	typedef ArticulationFnsScalar Fns;
+
+	// translate the incoming motion by the negated parent offset
+	Cm::SpatialVector shifted = Fns::translateMotion(-getParentOffset(jv), v);
+	// joint-space term: DSI . shifted + D * SZ
+	PxVec3 jointTerm = Fns::axisDot(getDSI(row), shifted) + Fns::multiply(row.D, SZ);
+
+	Cm::SpatialVector propagated = shifted - Cm::SpatialVector(getJointOffset(jv).cross(jointTerm), jointTerm);
+#if DY_ARTICULATION_DEBUG_VERIFY
+	Cm::SpatialVector reference = ArticulationRef::propagateVelocity(row, jv, SZ, v, aux);
+	PX_ASSERT((propagated-reference).magnitude()<1e-5*PxMax(reference.magnitude(), 1.0f));
+#endif
+	return propagated;
+}
+
+// Scalar impulse propagation across one joint row: writes SZ, returns the force translated by the parent offset; both are checked against ArticulationRef.
+PX_FORCE_INLINE Cm::SpatialVector propagateImpulse(const FsRow& row, const FsJointVectors& jv,
+												   PxVec3& SZ, const Cm::SpatialVector& Z,
+												   const FsRowAux& aux)
+{
+	typedef ArticulationFnsScalar Fns;
+
+	SZ = Z.angular + Z.linear.cross(getJointOffset(jv));
+	Cm::SpatialVector remainder = Z - Fns::axisMultiply(getDSI(row), SZ);
+	Cm::SpatialVector toParent = Fns::translateForce(getParentOffset(jv), remainder);
+#if DY_ARTICULATION_DEBUG_VERIFY
+	PxVec3 refSZ;
+	Cm::SpatialVector refImpulse = ArticulationRef::propagateImpulse(row, jv, refSZ, Z, aux);
+	PX_ASSERT((toParent-refImpulse).magnitude()<1e-5*PxMax(refImpulse.magnitude(), 1.0f));
+	PX_ASSERT((SZ-refSZ).magnitude()<1e-5*PxMax(refSZ.magnitude(), 1.0f));
+#endif
+	return toParent;
+}
+
+}
+#endif
+
+#endif //DY_ARTICULATION_REFERENCE_H
diff --git a/PhysX_3.4/Source/LowLevelDynamics/src/DyArticulationSIMD.cpp b/PhysX_3.4/Source/LowLevelDynamics/src/DyArticulationSIMD.cpp
new file mode 100644
index 00000000..e138c192
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelDynamics/src/DyArticulationSIMD.cpp
@@ -0,0 +1,306 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+#include "foundation/PxPreprocessor.h"
+#include "DySpatial.h"
+#include "DyArticulation.h"
+#include "DyArticulationScalar.h"
+#include "DyArticulationFnsScalar.h"
+#include "DyArticulationReference.h"
+#include "DyArticulationFnsSimd.h"
+
+
+namespace physx
+{
+namespace Dy
+{
+
+#if DY_ARTICULATION_DEBUG_VERIFY
+namespace	// file-local helpers, only compiled when DY_ARTICULATION_DEBUG_VERIFY is set
+{
+	Cm::SpatialVector SpV(Vec3V linear, Vec3V angular)	// reinterprets the storage of both SIMD vectors as PxVec3
+	{
+		return Cm::SpatialVector(reinterpret_cast<PxVec3&>(linear), reinterpret_cast<PxVec3&>(angular));
+	}
+}
+#endif
+
+// Applies the impulse (linear, angular) at link linkID without touching link velocities: walks from the
+// link towards the root, adding each joint's SZ term to the deferred-SZ array, adds what is left to
+// matrix.deferredZ, and flags the link's path-to-root as dirty.
+void PxcFsApplyImpulse(FsData &matrix, PxU32 linkID, Vec3V linear, Vec3V angular)
+{
+#if DY_ARTICULATION_DEBUG_VERIFY
+	{
+		// apply the same impulse to the scalar reference velocities used by the debug checks
+		Cm::SpatialVectorV refImpulse(linear, angular);
+		ArticulationRef::applyImpulse(matrix, reinterpret_cast<Cm::SpatialVector *>(getRefVelocity(matrix)), linkID, reinterpret_cast<Cm::SpatialVector&>(refImpulse));
+	}
+#endif
+
+	const FsRow *rows = getFsRows(matrix);
+	const FsJointVectors* jointVectors = getJointVectors(matrix);
+#if DY_ARTICULATION_DEBUG_VERIFY
+	const FsRowAux *aux = getAux(matrix);
+#endif
+	Vec3V *deferredSZ = getDeferredSZ(matrix);
+
+	// the quantity carried up the chain is the negated impulse
+	Vec3V linZ = V3Neg(linear);
+	Vec3V angZ = V3Neg(angular);
+
+	PxU32 link = linkID;
+	while(link != 0)
+	{
+		const FsRow &row = rows[link];
+		const FsJointVectors& jv = jointVectors[link];
+
+#if DY_ARTICULATION_DEBUG_VERIFY
+		PxVec3 SZcheck;
+		Cm::SpatialVector Zcheck = ArticulationRef::propagateImpulse(row, jv, SZcheck, SpV(linZ, angZ), aux[link]);
+#endif
+
+		Vec3V SZ = V3Add(angZ, V3Cross(linZ, jv.jointOffset));
+		// DSI * SZ, linear and angular halves (same summation order for both)
+		const Vec3V dsiLinear = V3ScaleAdd(row.DSI[0].linear, V3GetX(SZ),
+									V3ScaleAdd(row.DSI[1].linear, V3GetY(SZ), V3Scale(row.DSI[2].linear, V3GetZ(SZ))));
+		const Vec3V dsiAngular = V3ScaleAdd(row.DSI[0].angular, V3GetX(SZ),
+									V3ScaleAdd(row.DSI[1].angular, V3GetY(SZ), V3Scale(row.DSI[2].angular, V3GetZ(SZ))));
+
+		linZ = V3Sub(linZ, dsiLinear);
+		angZ = V3Add(V3Sub(angZ, dsiAngular), V3Cross(jv.parentOffset, linZ));
+		deferredSZ[link] = V3Add(deferredSZ[link], SZ);
+
+		PX_ASSERT(Ps::aos::isFiniteVec3V(linZ));
+		PX_ASSERT(Ps::aos::isFiniteVec3V(angZ));
+
+#if DY_ARTICULATION_DEBUG_VERIFY
+		Cm::SpatialVector Z = SpV(linZ,angZ);
+		PX_ASSERT((Z - Zcheck).magnitude()<1e-4*PxMax(Zcheck.magnitude(), 1.0f));
+		PX_ASSERT(((PxVec3&)SZ-SZcheck).magnitude()<1e-4*PxMax(SZcheck.magnitude(), 1.0f));
+#endif
+		link = matrix.parent[link];
+	}
+
+	matrix.deferredZ.linear = V3Add(matrix.deferredZ.linear, linZ);
+	matrix.deferredZ.angular = V3Add(matrix.deferredZ.angular, angZ);
+
+	matrix.dirty |= rows[linkID].pathToRoot;
+}
+
+Cm::SpatialVectorV PxcFsGetVelocity(FsData &matrix,								  
+								PxU32 linkID)	// returns V[linkID] after applying the deferred updates that lie on linkID's path to the root
+{
+	const FsRow *rows = getFsRows(matrix);
+	const FsJointVectors* jointVectors = getJointVectors(matrix);
+
+#if DY_ARTICULATION_DEBUG_VERIFY
+	const FsRowAux *aux = getAux(matrix);
+#endif
+	Cm::SpatialVectorV* PX_RESTRICT V = getVelocity(matrix);
+
+	// find the dirty node on the path (including the root) with the lowest index
+	ArticulationBitField toUpdate = rows[linkID].pathToRoot & matrix.dirty;
+	// toUpdate holds the dirty links on linkID's path to the root; when it is empty V[linkID] is returned unchanged
+
+	if(toUpdate)
+	{
+		// store the dV elements densely and use an array map to decode - hopefully cache friendlier
+		PxU32 indexToStackLoc[DY_ARTICULATION_MAX_SIZE], count = 0;
+		Cm::SpatialVectorV dVStack[DY_ARTICULATION_MAX_SIZE];
+		// ignoreNodes: every bit below the lowest set bit of toUpdate; path: linkID's path to the root with those lower links removed
+		ArticulationBitField ignoreNodes = (toUpdate & (0-toUpdate))-1;
+		ArticulationBitField path = rows[linkID].pathToRoot & ~ignoreNodes, p = path;
+		ArticulationBitField newDirty = 0;
+
+		Vec3V ldV = V3Zero(), adV = V3Zero();
+		Cm::SpatialVectorV* PX_RESTRICT defV = getDeferredVel(matrix);
+		Vec3V* PX_RESTRICT SZ = getDeferredSZ(matrix);
+		// bit 0 is the root: turn the accumulated deferredZ into a root velocity change using the root inverse inertia
+		if(p & 1)
+		{
+			const FsInertia &m = getRootInverseInertia(matrix);
+			Vec3V lZ = V3Neg(matrix.deferredZ.linear);
+			Vec3V aZ = V3Neg(matrix.deferredZ.angular);
+
+			ldV = V3Add(M33MulV3(m.ll,lZ),		 M33MulV3(m.la,aZ));
+			adV = V3Add(M33TrnspsMulV3(m.la,lZ), M33MulV3(m.aa,aZ));
+
+			V[0].linear = V3Add(V[0].linear, ldV);
+			V[0].angular = V3Add(V[0].angular, adV);
+
+			matrix.deferredZ.linear = V3Zero();
+			matrix.deferredZ.angular = V3Zero();
+
+			indexToStackLoc[0] = count;
+			Cm::SpatialVectorV &e = dVStack[count++];
+
+			e.linear = ldV;
+			e.angular = adV;
+
+			newDirty = rows[0].children;
+			p--;	// clears bit 0, which is known to be set here
+		}
+
+		// visit the remaining path links in increasing index order, carrying (ldV, adV) from one link to the next
+		while(p)	// using "for(;p;p &= (p-1))" here generates LHSs from the ArticulationLowestSetBit
+		{
+			PxU32 i = ArticulationLowestSetBit(p);
+			const FsJointVectors& jv = jointVectors[i];
+
+			p &= (p-1);	// drop the bit that was just taken
+	
+			const FsRow* PX_RESTRICT row = rows + i;
+			// add this link's own deferred velocity to the change carried over from the previous path link
+			ldV = V3Add(ldV, defV[i].linear);
+			adV = V3Add(adV, defV[i].angular);
+
+#if DY_ARTICULATION_DEBUG_VERIFY
+			Cm::SpatialVector dVcheck = ArticulationRef::propagateVelocity(*row, jv, (PxVec3&)SZ[i], SpV(ldV,adV), aux[i]);
+#endif
+			// NOTE(review): the arithmetic below appears to be propagateVelocitySIMD written out inline - confirm against Fns::axisDot
+			Vec3V DSZ = M33MulV3(row->D, SZ[i]);
+
+			Vec3V lW = V3Add(ldV, V3Cross(adV,jv.parentOffset));
+			Vec3V aW = adV;
+
+			const Cm::SpatialVectorV*PX_RESTRICT DSI = row->DSI;
+			Vec3V lN = V3Merge(V3Dot(DSI[0].linear, lW),   V3Dot(DSI[1].linear, lW),  V3Dot(DSI[2].linear, lW));
+			Vec3V aN = V3Merge(V3Dot(DSI[0].angular, aW),  V3Dot(DSI[1].angular, aW), V3Dot(DSI[2].angular, aW));
+
+			Vec3V n = V3Add(V3Add(lN, aN), DSZ);
+
+			ldV = V3Sub(lW, V3Cross(jv.jointOffset,n));
+			adV = V3Sub(aW, n);
+
+#if DY_ARTICULATION_DEBUG_VERIFY
+			Cm::SpatialVector dV = SpV(ldV,adV);
+			PX_ASSERT((dV-dVcheck).magnitude()<1e-4*PxMax(dVcheck.magnitude(), 1.0f));
+#endif
+
+			V[i].linear = V3Add(V[i].linear, ldV);
+			V[i].angular = V3Add(V[i].angular, adV);
+			// the change has been applied to this link: clear its deferred velocity and deferred SZ
+			defV[i].linear = V3Zero();
+			defV[i].angular = V3Zero();
+			SZ[i] = V3Zero();
+			// record this link's change so that children off the path can pick it up after the loop
+			indexToStackLoc[i] = count;
+			Cm::SpatialVectorV &e = dVStack[count++];
+			newDirty |= rows[i].children;
+
+			e.linear = ldV;
+			e.angular = adV;
+		}
+		// children of updated links that are not themselves on the path receive their parent's change as deferred velocity
+		for(ArticulationBitField defer = newDirty&~path; defer; defer &= (defer-1))
+		{
+			PxU32 i = ArticulationLowestSetBit(defer);
+			PxU32 parent = indexToStackLoc[matrix.parent[i]];
+
+			defV[i].linear = V3Add(defV[i].linear, dVStack[parent].linear);
+			defV[i].angular = V3Add(defV[i].angular, dVStack[parent].angular);
+		}
+		// those children are added to the dirty set; every link on the path is removed from it
+		matrix.dirty = (matrix.dirty | newDirty)&~path;
+	}
+#if DY_ARTICULATION_DEBUG_VERIFY
+	Cm::SpatialVector v = reinterpret_cast<Cm::SpatialVector&>(V[linkID]);
+	Cm::SpatialVector rv = reinterpret_cast<Cm::SpatialVector&>(getRefVelocity(matrix)[linkID]);
+	PX_ASSERT((v-rv).magnitude()<1e-4f * PxMax(rv.magnitude(),1.0f));
+#endif
+
+	return V[linkID];
+}
+
+// SIMD velocity propagation across one joint row: shifts v by the parent offset, then subtracts the
+// joint-space term (DSI . w + D * SZ). aux is only consumed by the debug comparison.
+PX_FORCE_INLINE Cm::SpatialVectorV propagateVelocitySIMD(const FsRow& row, const FsJointVectors& jv,
+														const Vec3V& SZ, const Cm::SpatialVectorV& v,
+														const FsRowAux& aux)
+{
+	typedef ArticulationFnsSimd<ArticulationFnsSimdBase> Fns;
+	PX_UNUSED(aux);
+
+	const Vec3V shiftedLinear = V3Add(v.linear, V3Cross(v.angular, jv.parentOffset));
+	Cm::SpatialVectorV w(shiftedLinear, v.angular);
+
+	const Vec3V jointTerm = V3Add(Fns::axisDot(row.DSI, w), M33MulV3(row.D, SZ));
+	const Vec3V linearTerm = V3Cross(jv.jointOffset, jointTerm);
+	Cm::SpatialVectorV propagated = w - Cm::SpatialVectorV(linearTerm, jointTerm);
+
+#if DY_ARTICULATION_DEBUG_VERIFY
+	Cm::SpatialVector reference = ArticulationRef::propagateVelocity(row, jv, reinterpret_cast<const PxVec3&>(SZ), reinterpret_cast<const Cm::SpatialVector&>(v), aux);
+	PX_ASSERT((reinterpret_cast<const Cm::SpatialVector&>(propagated)-reference).magnitude()<1e-4*PxMax(reference.magnitude(), 1.0f));
+#endif
+
+	return propagated;
+}
+
+// Applies every pending update in one pass: matrix.deferredZ becomes a root velocity change, then links 1..linkCount-1 are
+// visited in index order, each adding its propagated change to its own velocity and to its children's deferred velocity.
+void PxcFsFlushVelocity(FsData& matrix)
+{
+	typedef ArticulationFnsSimd<ArticulationFnsSimdBase> Fns;
+	const FsRow* PX_RESTRICT rows = getFsRows(matrix);
+	const FsRowAux* PX_RESTRICT aux = getAux(matrix);
+	const FsJointVectors*PX_RESTRICT jointVectors = getJointVectors(matrix);
+	Cm::SpatialVectorV* velocities = getVelocity(matrix);
+	Cm::SpatialVectorV* deferredVel = getDeferredVel(matrix);
+
+	Cm::SpatialVectorV rootDelta = Fns::multiply(getRootInverseInertia(matrix), -matrix.deferredZ);
+	matrix.deferredZ = Cm::SpatialVectorV(PxZero);
+	velocities[0] += rootDelta;
+	for(ArticulationBitField kids = rows[0].children; kids; kids &= (kids-1))
+		deferredVel[ArticulationLowestSetBit(kids)] += rootDelta;
+
+	for(PxU32 link = 1; link<matrix.linkCount; link++)
+	{
+		Cm::SpatialVectorV delta = propagateVelocitySIMD(rows[link], jointVectors[link], getDeferredSZ(matrix)[link], deferredVel[link], aux[link]);
+		deferredVel[link] = Cm::SpatialVectorV(PxZero);
+		getDeferredSZ(matrix)[link] = V3Zero();
+		velocities[link] += delta;
+		for(ArticulationBitField kids = rows[link].children; kids; kids &= (kids-1))
+			deferredVel[ArticulationLowestSetBit(kids)] += delta;
+	}
+
+#if DY_ARTICULATION_DEBUG_VERIFY
+	for(PxU32 i=0;i<matrix.linkCount;i++)
+	{
+		Cm::SpatialVector v = velocityRef(matrix,i), rv = reinterpret_cast<Cm::SpatialVector&>(getRefVelocity(matrix)[i]);
+		PxReal m = rv.magnitude();
+		PX_UNUSED(m);
+		PX_ASSERT((v-rv).magnitude()<1e-4*PxMax(1.0f,m));
+	}
+#endif
+	matrix.dirty = 0;
+}
+}
+}
diff --git a/PhysX_3.4/Source/LowLevelDynamics/src/DyArticulationScalar.cpp b/PhysX_3.4/Source/LowLevelDynamics/src/DyArticulationScalar.cpp
new file mode 100644
index 00000000..af00a367
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelDynamics/src/DyArticulationScalar.cpp
@@ -0,0 +1,575 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+#include "DyArticulationUtils.h"
+#include "DyArticulationScalar.h"
+#include "DyArticulationReference.h"
+#include "DyArticulationFnsDebug.h"
+
+namespace physx
+{
+namespace Dy
+{
+namespace ArticulationRef
+{
+	Cm::SpatialVector propagateImpulse(const FsRow& row,			// supplies DSI for this link
+									   const FsJointVectors& j,		// supplies the parent offset
+									   PxVec3& SZ,					// out: axisDot(aux.S, Z)
+									   const Cm::SpatialVector& Z,	// spatial impulse to propagate
+									   const FsRowAux& aux)			// supplies the axes S
+	{	// Writes SZ, subtracts axisMultiply(DSI, SZ) from Z and returns the remainder shifted by the parent offset.
+		const Cm::SpatialVector* axes = reinterpret_cast<const Cm::SpatialVector*>(aux.S);
+		SZ = ArticulationFnsScalar::axisDot(axes, Z);
+		const Cm::SpatialVector remainder = Z - ArticulationFnsScalar::axisMultiply(getDSI(row), SZ);
+		return ArticulationFnsScalar::translateForce(getParentOffset(j), remainder);
+	}
+
+	Cm::SpatialVector propagateVelocity(const FsRow& row,				// supplies D and DSI for this link
+										const FsJointVectors& j,		// supplies the parent offset
+										const PxVec3& SZ,				// value produced by propagateImpulse for this link
+										const Cm::SpatialVector& v,		// velocity to propagate
+										const FsRowAux& aux)			// supplies the axes S
+	{
+		// Shift v by the negated parent offset, then subtract the S-weighted joint term (D*SZ + axisDot(DSI, shifted)).
+		const Cm::SpatialVector* axes = reinterpret_cast<const Cm::SpatialVector*>(aux.S);
+		const Cm::SpatialVector shifted = ArticulationFnsScalar::translateMotion(-getParentOffset(j), v);
+		const PxVec3 scaledSZ = ArticulationFnsScalar::multiply(row.D, SZ);
+
+		return shifted - ArticulationFnsScalar::axisMultiply(axes, scaledSZ + ArticulationFnsScalar::axisDot(getDSI(row), shifted));
+	}
+
+	void applyImpulse(const FsData& matrix, 	// rows, aux data, joint vectors and the parent table are read from it
+					  Cm::SpatialVector* velocity,	// in/out: the computed delta is added to entries [0, linkCount)
+					  PxU32 linkID, 	// link the impulse is applied to
+					  const Cm::SpatialVector& impulse)	// impulse to apply; it is negated before propagation
+	{	// Scalar reference path: propagates the impulse to the root, then propagates the velocity change to every link (nothing is deferred).
+		typedef ArticulationFnsScalar Fns;
+
+		PX_ASSERT(matrix.linkCount<=DY_ARTICULATION_MAX_SIZE);	// the stack arrays below are sized by DY_ARTICULATION_MAX_SIZE
+
+		const FsRow* rows = getFsRows(matrix);
+		const FsRowAux* aux = getAux(matrix);
+		const FsJointVectors* jointVectors = getJointVectors(matrix);
+
+		Cm::SpatialVector dV[DY_ARTICULATION_MAX_SIZE];
+		PxVec3 SZ[DY_ARTICULATION_MAX_SIZE];
+
+		for(PxU32 i=0;i<matrix.linkCount;i++)	// links off the path to the root keep SZ == 0
+			SZ[i] = PxVec3(0);
+
+		Cm::SpatialVector Z = -impulse;
+
+		for(;linkID!=0; linkID = matrix.parent[linkID])	// walk from the link up to (excluding) the root, filling SZ on the way
+			Z = ArticulationRef::propagateImpulse(rows[linkID], jointVectors[linkID], SZ[linkID], Z, aux[linkID]);
+
+		dV[0] = Fns::getRootDeltaV(matrix,-Z);
+
+		for(PxU32 i=1;i<matrix.linkCount; i++)	// NOTE(review): reads dV[parent[i]], so parent[i] < i is assumed - confirm
+			dV[i] = ArticulationRef::propagateVelocity(rows[i], jointVectors[i], SZ[i], dV[matrix.parent[i]], aux[i]);
+
+		for(PxU32 i=0;i<matrix.linkCount;i++)
+			velocity[i] += dV[i];
+	}
+
+	void ltbFactor(FsData& m)	// In-place scalar factorization of the LtbRow data: overwrites inertia, j0, j1 and jResponse of the rows.
+	{
+		typedef ArticulationFnsScalar Fns;
+		LtbRow* rows = getLtbRows(m);
+
+		SpInertia inertia[DY_ARTICULATION_MAX_SIZE];	// scalar working copies of the row inertias
+		for(PxU32 i=0;i<m.linkCount;i++)
+			inertia[i] = ArticulationFnsDebug::unsimdify(rows[i].inertia);
+
+		Cm::SpatialVector j[3];	// scratch for the three transformed jacobian rows
+		for(PxU32 i=m.linkCount; --i>0;)	// i = linkCount-1 down to 1; NOTE(review): linkCount == 0 would wrap around - presumably never happens
+		{
+			LtbRow& b = rows[i];
+			inertia[i] = Fns::invertInertia(inertia[i]);
+			PxU32 p = m.parent[i];
+
+			Cm::SpatialVector* j0 = &reinterpret_cast<Cm::SpatialVector&>(*b.j0),	// scalar views of the row's SIMD jacobians
+							 * j1 = &reinterpret_cast<Cm::SpatialVector&>(*b.j1);
+
+			Fns::multiply(j, inertia[i], j1);
+			PxMat33 jResponse = Fns::invertSym33(-Fns::multiplySym(j, j1));
+			j1[0] = j[0]; j1[1] = j[1]; j1[2] = j[2];	// store the transformed j1 back into the row
+
+			b.jResponse = Mat33V_From_PxMat33(jResponse);
+			Fns::multiply(j, j0, jResponse);
+			inertia[p] = Fns::multiplySubtract(inertia[p], j, j0);	// fold this link's contribution into its parent before the parent is visited
+			j0[0] = j[0]; j0[1] = j[1]; j0[2] = j[2];	// store the transformed j0 back into the row
+		}
+
+		rows[0].inertia = Fns::invertInertia(inertia[0]);	// the root is inverted here; links 1.. were inverted inside the loop
+		for(PxU32 i=1;i<m.linkCount;i++)
+			rows[i].inertia = inertia[i];
+	}
+
+
+}
+
+#if 0
+
+
+void ltbSolve(const FsData& m, 	// factorized LtbRow data is read from here (compiled out by the enclosing #if 0)
+			  Vec3V* c,					// rhs error to solve for; modified in place through the PxVec4 view b
+			  Cm::SpatialVector* y)		// velocity delta output; zeroed on entry for linkCount entries
+{
+	typedef ArticulationFnsScalar Fns;
+
+	PxVec4* b = reinterpret_cast<PxVec4*>(c);
+	const LtbRow* rows = getLtbRows(m);
+	PxMemZero(y, m.linkCount*sizeof(Cm::SpatialVector));
+
+	for(PxU32 i=m.linkCount;i-->1;)	// first pass: i = linkCount-1 down to 1, accumulating into the parent entry y[p]
+	{
+		PxU32 p = m.parent[i];
+		const LtbRow& r = rows[i];
+		b[i] -= PxVec4(Fns::axisDot(&static_cast<const Cm::SpatialVector&>(*r.j1), y[i]),0);	// NOTE(review): the active copies of this code use reinterpret_cast for the same view
+		y[p] -= Fns::axisMultiply(&static_cast<const Cm::SpatialVector&>(*r.j0), b[i].getXYZ());
+	}
+
+	y[0] = Fns::multiply(rows[0].inertia,y[0]);
+
+	for(PxU32 i=1; i<m.linkCount; i++)	// second pass: i = 1 up to linkCount-1, reading the already final y[p]
+	{
+		PxU32 p = m.parent[i];
+		const LtbRow& r = rows[i];
+		PxVec3 t = Fns::multiply(r.jResponse, b[i].getXYZ()) - Fns::axisDot(&static_cast<const Cm::SpatialVector&>(*r.j0), y[p]);
+		y[i] = Fns::multiply(r.inertia, y[i]) - Fns::axisMultiply(&static_cast<const Cm::SpatialVector&>(*r.j1), t);
+	}
+}
+
+void PxcFsPropagateDrivenInertiaScalar(FsData& matrix,	// out: D and DSI of rows 1.. and the root inverse inertia (compiled out by the enclosing #if 0)
+								 const FsInertia* baseInertia,	// per-link inertia, converted to scalar form before use
+								 const PxReal* isf,	// per-link scale applied to load[i]
+								 const Mat33V* load)	// per-link 3x3 load added to the joint-space matrix
+{
+	typedef ArticulationFnsScalar Fns;
+
+	Cm::SpatialVector IS[3], DSI[3];
+	PxMat33 D;
+
+	FsRow* rows = getFsRows(matrix);
+	const FsRowAux* aux = getAux(matrix);
+	const FsJointVectors* jointVectors = getJointVectors(matrix);
+
+	SpInertia inertia[DY_ARTICULATION_MAX_SIZE];
+	for(PxU32 i=0;i<matrix.linkCount;i++)
+		inertia[i] = ArticulationFnsDebug::unsimdify(baseInertia[i]);
+
+	for(PxU32 i=matrix.linkCount; --i>0;)	// i = linkCount-1 down to 1; each link adds its reduced inertia to its parent
+	{
+		FsRow& r = rows[i];
+		const FsRowAux& a = aux[i];
+		const FsJointVectors& jv = jointVectors[i];
+
+		Fns::multiply(IS, inertia[i], &static_cast<const Cm::SpatialVector&>(*a.S));	// NOTE(review): the active copy of this function uses reinterpret_cast here
+
+		PX_ALIGN(16, PxMat33) L;
+		PxMat33_From_Mat33V(load[i], L);
+		D = Fns::invertSym33(Fns::multiplySym(&static_cast<const Cm::SpatialVector&>(*a.S), IS) + L*isf[i]);	// D = inverse of (multiplySym(S, IS) + isf * load)
+
+		Fns::multiply(DSI, IS, D);
+
+		r.D = Mat33V_From_PxMat33(D);
+		static_cast<Cm::SpatialVector&>(r.DSI[0]) = DSI[0];
+		static_cast<Cm::SpatialVector&>(r.DSI[1]) = DSI[1];
+		static_cast<Cm::SpatialVector&>(r.DSI[2]) = DSI[2];
+
+		inertia[matrix.parent[i]] += Fns::translate(getParentOffset(jv), Fns::multiplySubtract(inertia[i], DSI, IS));
+	}
+
+	FsInertia& m = getRootInverseInertia(matrix);
+	m = FsInertia(Fns::invertInertia(inertia[0]));	// store the inverted accumulated root inertia
+}
+
+// No need to compile this except for verification: it consumes huge amounts of stack space (five DY_ARTICULATION_MAX_SIZE arrays).
+void PxcFsComputeJointLoadsScalar(const FsData& matrix,					// aux data, joint vectors and the parent table are read from it
+							      const FsInertia*PX_RESTRICT baseInertia,	// per-link inertia; reloaded at the start of every iteration
+							      Mat33V*PX_RESTRICT load,					// out: entries [1, linkCount) receive the final loads
+							      const PxReal*PX_RESTRICT isf,				// per-link scale forwarded to Fns::propagate
+							      PxU32 linkCount,							// number of links to process
+							      PxU32 maxIterations)						// number of refinement passes
+{
+	typedef ArticulationFnsScalar Fns;
+
+	// the childward S
+	SpInertia leafwardInertia[DY_ARTICULATION_MAX_SIZE];
+	SpInertia rootwardInertia[DY_ARTICULATION_MAX_SIZE];
+	SpInertia inertia[DY_ARTICULATION_MAX_SIZE];
+	SpInertia contribToParent[DY_ARTICULATION_MAX_SIZE];
+
+	// total articulated inertia assuming the articulation is rooted here
+
+	const FsRow* row = getFsRows(matrix);
+	const FsRowAux* aux = getAux(matrix);
+	const FsJointVectors* jointVectors = getJointVectors(matrix);
+
+	PX_UNUSED(row);
+
+	PxMat33 load_[DY_ARTICULATION_MAX_SIZE];	// NOTE(review): read by Fns::propagate in the first iteration before anything here writes it - confirm the intended initial value
+
+	for(PxU32 iter=0;iter<maxIterations;iter++)
+	{
+		for(PxU32 i=0;i<linkCount;i++)
+			inertia[i] = ArticulationFnsDebug::unsimdify(baseInertia[i]);
+
+		for(PxU32 i=linkCount;i-->1;)	// leaf-to-root pass: accumulate each link's contribution into its parent
+		{
+			const FsJointVectors& j = jointVectors[i];
+
+			leafwardInertia[i] = inertia[i];
+			contribToParent[i] = Fns::propagate(inertia[i], &static_cast<const Cm::SpatialVector&>(*aux[i].S), load_[i], isf[i]);
+			inertia[matrix.parent[i]] += Fns::translate(getParentOffset(j), contribToParent[i]);
+		}
+
+		for(PxU32 i=1;i<linkCount;i++)	// root-to-leaf pass: parent inertia minus this link's own contribution
+		{
+			rootwardInertia[i] = Fns::translate(-getParentOffset(jointVectors[i]), inertia[matrix.parent[i]]) - contribToParent[i];
+			inertia[i] += Fns::propagate(rootwardInertia[i], &static_cast<const Cm::SpatialVector&>(*aux[i].S), load_[i], isf[i]);
+		}
+
+		for(PxU32 i=1;i<linkCount;i++)
+		{
+			load_[i] = Fns::computeDriveInertia(leafwardInertia[i], rootwardInertia[i], &static_cast<const Cm::SpatialVector&>(*aux[i].S));
+			PX_ASSERT(load_[i][0].isFinite() && load_[i][1].isFinite() && load_[i][2].isFinite());	// all three columns of load_[i] (was load_[2][i])
+		}
+	}
+	for(PxU32 i=1;i<linkCount;i++)
+		load[i] = Mat33V_From_PxMat33(load_[i]);
+}
+
+
+void PxcFsApplyImpulse(const FsData& matrix, 	// rows and aux data are read from it (compiled out by the enclosing #if 0)
+					   PxU32 linkID, 	// link the impulse is applied to
+					   const Cm::SpatialVector& impulse)	// impulse to apply; it is negated before propagation
+{	// NOTE(review): 'state', matrix.row and matrix.aux are not declared anywhere in the visible code - this looks like a leftover from an older layout
+#if DY_ARTICULATION_DEBUG_VERIFY
+	PxcFsRefApplyImpulse(matrix, state.refVelocity, linkID, impulse);
+#endif
+
+	Cm::SpatialVector Z = -impulse;
+
+	for(PxU32 i = linkID; i!=0; i = matrix.row[i].parent)	// walk from the link up to (excluding) the root
+	{
+		PxVec3 SZ;
+		Z = propagateImpulse(matrix.row[i], SZ, Z, matrix.aux[i]);
+		deferredSZRef(state,i) += SZ;	// accumulated rather than applied immediately
+	}
+
+	static_cast<Cm::SpatialVector &>(state.deferredZ) += Z;	// the impulse that reaches the root is deferred as well
+	state.dirty |= matrix.row[linkID].pathToRoot;	// mark every link on the path as needing an update
+}
+
+Cm::SpatialVector PxcFsGetVelocity(const FsData& matrix,	// rows and aux data are read from it (compiled out by the enclosing #if 0)
+								  PxU32 linkID)	// link whose up-to-date velocity is returned
+{	// NOTE(review): 'state', matrix.row and matrix.aux are not declared anywhere in the visible code - this looks like a leftover from an older layout
+	// find the dirty node on the path (including the root) with the lowest index
+	ArticulationBitField toUpdate = matrix.row[linkID].pathToRoot & state.dirty;
+
+	if(toUpdate)
+	{
+		ArticulationBitField ignoreNodes = (toUpdate & (0-toUpdate))-1;	// every bit below the lowest dirty bit
+		ArticulationBitField path = matrix.row[linkID].pathToRoot & ~ignoreNodes, p = path;
+		ArticulationBitField newDirty = 0;
+
+		Cm::SpatialVector dV = Cm::SpatialVector::zero();
+		if(p & 1)	// bit 0 set: the root itself has to be flushed first
+		{
+			dV = getRootDeltaV(matrix, -deferredZ(state));
+
+			velocityRef(state, 0) += dV;
+			for(ArticulationBitField defer = matrix.row[0].children & ~path; defer; defer &= (defer-1))	// children off the path only receive a deferred delta
+				deferredVelRef(state, ArticulationLowestSetBit(defer)) += dV;
+
+			deferredZRef(state) = Cm::SpatialVector::zero();
+			newDirty = matrix.row[0].children;
+			p--;	// clears bit 0, which is known to be set here
+		}
+
+		for(; p; p &= (p-1))	// remaining path links, lowest index first
+		{
+			PxU32 i = ArticulationLowestSetBit(p);
+
+			dV = propagateVelocity(matrix.row[i], deferredSZ(state,i), dV + state.deferredVel[i], matrix.aux[i]);
+
+			velocityRef(state,i) += dV;
+			for(ArticulationBitField defer = matrix.row[i].children & ~path; defer; defer &= (defer-1))
+				deferredVelRef(state,ArticulationLowestSetBit(defer)) += dV;
+
+			newDirty |= matrix.row[i].children;
+			deferredVelRef(state,i) = Cm::SpatialVector::zero();
+			deferredSZRef(state,i) = PxVec3(0);
+		}
+
+		state.dirty = (state.dirty | newDirty)&~path;	// the path is now clean; children of flushed links become dirty
+	}
+#if DY_ARTICULATION_DEBUG_VERIFY
+	Cm::SpatialVector v = state.velocity[linkID];
+	Cm::SpatialVector rv = state.refVelocity[linkID];
+	PX_ASSERT((v-rv).magnitude()<1e-4f * PxMax(1.0f, rv.magnitude()));	// floor of 1 keeps the check valid when rv is zero, as in the other verify asserts
+#endif
+
+	return state.velocity[linkID];
+}
+
+void PxcFsFlushVelocity(const FsData& matrix)	// Scalar flush of all deferred terms (compiled out by the enclosing #if 0). NOTE(review): 'state' is not declared in the visible code.
+{
+	Cm::SpatialVector V = getRootDeltaV(matrix, -deferredZ(state));
+	deferredZRef(state) = Cm::SpatialVector::zero();
+	velocityRef(state,0) += V;
+	for(ArticulationBitField defer = matrix.row[0].children; defer; defer &= (defer-1))	// one pass per set bit; defer &= (defer-1) clears the lowest set bit
+		deferredVelRef(state,ArticulationLowestSetBit(defer)) += V;
+
+	for(PxU32 i = 1; i<matrix.linkCount; i++)
+	{
+		Cm::SpatialVector V = propagateVelocity(matrix.row[i], deferredSZ(state,i), state.deferredVel[i], matrix.aux[i]);	// shadows the outer V (the root delta)
+		deferredVelRef(state,i) = Cm::SpatialVector::zero();
+		deferredSZRef(state,i) = PxVec3(0);
+		velocityRef(state,i) += V;
+		for(ArticulationBitField defer = matrix.row[i].children; defer; defer &= (defer-1))
+			deferredVelRef(state,ArticulationLowestSetBit(defer)) += V;	// pass this link's delta on to its children
+	}
+
+	state.dirty = 0;
+}
+
+void PxcFsPropagateDrivenInertiaScalar(FsData& matrix,						// out: D and DSI of rows 1.. and the root inverse inertia (compiled out by the enclosing #if 0)
+									   const FsInertia* baseInertia,		// per-link inertia; copied into scratch memory before use
+									   const PxReal* isf,					// per-link scale applied to load[i]
+									   const Mat33V* load,					// per-link 3x3 load added to the joint-space matrix
+									   PxcFsScratchAllocator allocator)		// provides the working copy of the inertias
+{
+	typedef ArticulationFnsSimd<ArticulationFnsSimdBase> Fns;
+
+	Cm::SpatialVectorV IS[3];
+	// D is declared per link inside the loop; the unused outer PxMat33 D it used to shadow was removed.
+
+	FsRow* rows = getFsRows(matrix);
+	const FsRowAux* aux = getAux(matrix);
+	const FsJointVectors* jointVectors = getJointVectors(matrix);
+
+	FsInertia *inertia = allocator.alloc<FsInertia>(matrix.linkCount);
+	PxMemCopy(inertia, baseInertia, matrix.linkCount*sizeof(FsInertia));
+
+	for(PxU32 i=matrix.linkCount; --i>0;)	// i = linkCount-1 down to 1; each link adds its reduced inertia to its parent
+	{
+		FsRow& r = rows[i];
+		const FsRowAux& a = aux[i];
+		const FsJointVectors& jv = jointVectors[i];
+
+		Mat33V m = Fns::computeSIS(inertia[i], a.S, IS);
+		FloatV f = FLoad(isf[i]);
+
+		Mat33V D = Fns::invertSym33(Mat33V(V3ScaleAdd(load[i].col0, f, m.col0),	// column-wise load * isf + m
+										   V3ScaleAdd(load[i].col1, f, m.col1),
+										   V3ScaleAdd(load[i].col2, f, m.col2)));
+		r.D = D;
+
+		inertia[matrix.parent[i]] = Fns::addInertia(inertia[matrix.parent[i]], 
+													Fns::translateInertia(jv.parentOffset, Fns::multiplySubtract(inertia[i], D,  IS,  r.DSI)));
+	}
+
+	getRootInverseInertia(matrix) = Fns::invertInertia(inertia[0]);	// store the inverted accumulated root inertia
+}
+
+void PxcLtbSolve(const FsData& m, 	// factorized LtbRow data is read from here (compiled out by the enclosing #if 0)
+				 Vec3V* c,					// rhs error to solve for; modified in place through the PxVec4 view b
+				 Cm::SpatialVector* y)		// velocity delta output; zeroed on entry for linkCount entries
+{
+	typedef ArticulationFnsScalar Fns;
+
+	PxVec4* b = reinterpret_cast<PxVec4*>(c);
+	const LtbRow* rows = getLtbRows(m);
+	PxMemZero(y, m.linkCount*sizeof(Cm::SpatialVector));
+
+	for(PxU32 i=m.linkCount;i-->1;)	// first pass: i = linkCount-1 down to 1, accumulating into the parent entry y[p]
+	{
+		PxU32 p = m.parent[i];
+		const LtbRow& r = rows[i];
+		b[i] -= PxVec4(Fns::axisDot(&static_cast<const Cm::SpatialVector&>(*r.j1), y[i]),0);	// NOTE(review): the active copies of this code use reinterpret_cast for the same view
+		y[p] -= Fns::axisMultiply(&static_cast<const Cm::SpatialVector&>(*r.j0), b[i].getXYZ());
+	}
+
+	y[0] = Fns::multiply(rows[0].inertia,y[0]);
+
+	for(PxU32 i=1; i<m.linkCount; i++)	// second pass: i = 1 up to linkCount-1, reading the already final y[p]
+	{
+		PxU32 p = m.parent[i];
+		const LtbRow& r = rows[i];
+		PxVec3 t = Fns::multiply(r.jResponse, b[i].getXYZ()) - Fns::axisDot(&static_cast<const Cm::SpatialVector&>(*r.j0), y[p]);
+		y[i] = Fns::multiply(r.inertia, y[i]) - Fns::axisMultiply(&static_cast<const Cm::SpatialVector&>(*r.j1), t);
+	}
+}
+
+
+#endif
+
+
+#if DY_ARTICULATION_DEBUG_VERIFY
+void PxcLtbFactorScalar(FsData& m)	// In-place scalar factorization of the LtbRow data: overwrites inertia, j0, j1 and jResponse of the rows.
+{
+	typedef ArticulationFnsScalar Fns;
+	LtbRow* rows = getLtbRows(m);
+
+	SpInertia inertia[DY_ARTICULATION_MAX_SIZE];	// scalar working copies of the row inertias
+	for(PxU32 i=0;i<m.linkCount;i++)
+		inertia[i] = ArticulationFnsDebug::unsimdify(rows[i].inertia);
+
+	Cm::SpatialVector j[3];	// scratch for the three transformed jacobian rows
+	for(PxU32 i=m.linkCount; --i>0;)	// i = linkCount-1 down to 1; NOTE(review): linkCount == 0 would wrap around - presumably never happens
+	{
+		LtbRow& b = rows[i];
+		inertia[i] = Fns::invertInertia(inertia[i]);
+		PxU32 p = m.parent[i];
+
+		Cm::SpatialVector* j0 = &reinterpret_cast<Cm::SpatialVector&>(*b.j0),	// scalar views of the row's SIMD jacobians
+						 * j1 = &reinterpret_cast<Cm::SpatialVector&>(*b.j1);
+
+		Fns::multiply(j, inertia[i], j1);
+		PxMat33 jResponse = Fns::invertSym33(-Fns::multiplySym(j, j1));
+		j1[0] = j[0]; j1[1] = j[1]; j1[2] = j[2];	// store the transformed j1 back into the row
+
+		b.jResponse = Mat33V_From_PxMat33(jResponse);
+		Fns::multiply(j, j0, jResponse);
+		inertia[p] = Fns::multiplySubtract(inertia[p], j, j0);	// fold this link's contribution into its parent before the parent is visited
+		j0[0] = j[0]; j0[1] = j[1]; j0[2] = j[2];	// store the transformed j0 back into the row
+	}
+
+	rows[0].inertia = Fns::invertInertia(inertia[0]);	// the root is inverted here; links 1.. were inverted inside the loop
+	for(PxU32 i=1;i<m.linkCount;i++)
+		rows[i].inertia = inertia[i];
+}
+
+void PxcFsPropagateDrivenInertiaScalar(FsData& matrix,	// out: D and DSI of rows 1.. and the root inverse inertia
+									   const FsInertia* baseInertia,	// per-link inertia, converted to scalar form before use
+									   const PxReal* isf,	// per-link scale applied to load[i]
+									   const Mat33V* load)	// per-link 3x3 load added to the joint-space matrix
+{
+	typedef ArticulationFnsScalar Fns;
+
+	Cm::SpatialVector IS[3], DSI[3];
+	PxMat33 D;
+
+	FsRow* rows = getFsRows(matrix);
+	const FsRowAux* aux = getAux(matrix);
+	const FsJointVectors* jointVectors = getJointVectors(matrix);
+
+	SpInertia inertia[DY_ARTICULATION_MAX_SIZE];
+	for(PxU32 i=0;i<matrix.linkCount;i++)
+		inertia[i] = ArticulationFnsDebug::unsimdify(baseInertia[i]);
+
+	for(PxU32 i=matrix.linkCount; --i>0;)	// i = linkCount-1 down to 1; each link adds its reduced inertia to its parent
+	{
+		FsRow& r = rows[i];
+		const FsRowAux& a = aux[i];
+		const FsJointVectors& jv = jointVectors[i];
+
+		Fns::multiply(IS, inertia[i], &reinterpret_cast<const Cm::SpatialVector&>(*a.S));	// scalar view of the SIMD axes S
+
+		PX_ALIGN(16, PxMat33) L;
+		PxMat33_From_Mat33V(load[i], L);
+		D = Fns::invertSym33(Fns::multiplySym(&reinterpret_cast<const Cm::SpatialVector&>(*a.S), IS) + L*isf[i]);	// D = inverse of (multiplySym(S, IS) + isf * load)
+
+		Fns::multiply(DSI, IS, D);
+
+		r.D = Mat33V_From_PxMat33(D);
+		reinterpret_cast<Cm::SpatialVector&>(r.DSI[0]) = DSI[0];	// write the scalar results through a scalar view of the SIMD storage
+		reinterpret_cast<Cm::SpatialVector&>(r.DSI[1]) = DSI[1];
+		reinterpret_cast<Cm::SpatialVector&>(r.DSI[2]) = DSI[2];
+
+		inertia[matrix.parent[i]] += Fns::translate(getParentOffset(jv), Fns::multiplySubtract(inertia[i], DSI, IS));
+	}
+
+	FsInertia& m = getRootInverseInertia(matrix);
+	m = FsInertia(Fns::invertInertia(inertia[0]));	// store the inverted accumulated root inertia
+}
+
+// No need to compile this except for verification: it consumes huge amounts of stack space (five DY_ARTICULATION_MAX_SIZE arrays).
+void PxcFsComputeJointLoadsScalar(const FsData& matrix,						// aux data, joint vectors and the parent table are read from it
+								  const FsInertia*PX_RESTRICT baseInertia,	// per-link inertia; reloaded at the start of every iteration
+								  Mat33V*PX_RESTRICT load,					// out: entries [1, linkCount) receive the final loads
+								  const PxReal*PX_RESTRICT isf,				// per-link scale forwarded to Fns::propagate
+								  PxU32 linkCount,							// number of links to process
+								  PxU32 maxIterations)						// number of refinement passes
+{
+	typedef ArticulationFnsScalar Fns;
+
+	// the childward S
+	SpInertia leafwardInertia[DY_ARTICULATION_MAX_SIZE];
+	SpInertia rootwardInertia[DY_ARTICULATION_MAX_SIZE];
+	SpInertia inertia[DY_ARTICULATION_MAX_SIZE];
+	SpInertia contribToParent[DY_ARTICULATION_MAX_SIZE];
+
+	// total articulated inertia assuming the articulation is rooted here
+
+	const FsRow* row = getFsRows(matrix);
+	const FsRowAux* aux = getAux(matrix);
+	const FsJointVectors* jointVectors = getJointVectors(matrix);
+
+	PX_UNUSED(row);
+
+	PxMat33 load_[DY_ARTICULATION_MAX_SIZE];	// NOTE(review): read by Fns::propagate in the first iteration before anything here writes it - confirm the intended initial value
+
+	for(PxU32 iter=0;iter<maxIterations;iter++)
+	{
+		for(PxU32 i=0;i<linkCount;i++)
+			inertia[i] = ArticulationFnsDebug::unsimdify(baseInertia[i]);
+
+		for(PxU32 i=linkCount;i-->1;)	// leaf-to-root pass: accumulate each link's contribution into its parent
+		{
+			const FsJointVectors& j = jointVectors[i];
+
+			leafwardInertia[i] = inertia[i];
+			contribToParent[i] = Fns::propagate(inertia[i], &reinterpret_cast<const Cm::SpatialVector&>(*aux[i].S), load_[i], isf[i]);
+			inertia[matrix.parent[i]] += Fns::translate(getParentOffset(j), contribToParent[i]);
+		}
+
+		for(PxU32 i=1;i<linkCount;i++)	// root-to-leaf pass: parent inertia minus this link's own contribution
+		{
+			rootwardInertia[i] = Fns::translate(-getParentOffset(jointVectors[i]), inertia[matrix.parent[i]]) - contribToParent[i];
+			inertia[i] += Fns::propagate(rootwardInertia[i], &reinterpret_cast<const Cm::SpatialVector&>(*aux[i].S), load_[i], isf[i]);
+		}
+
+		for(PxU32 i=1;i<linkCount;i++)
+		{
+			load_[i] = Fns::computeDriveInertia(leafwardInertia[i], rootwardInertia[i], &reinterpret_cast<const Cm::SpatialVector&>(*aux[i].S));
+			PX_ASSERT(load_[i][0].isFinite() && load_[i][1].isFinite() && load_[i][2].isFinite());	// all three columns of load_[i] (was load_[2][i])
+		}
+	}
+	for(PxU32 i=1;i<linkCount;i++)
+		load[i] = Mat33V_From_PxMat33(load_[i]);
+}
+#endif
+
+}
+
+}
diff --git a/PhysX_3.4/Source/LowLevelDynamics/src/DyArticulationScalar.h b/PhysX_3.4/Source/LowLevelDynamics/src/DyArticulationScalar.h
new file mode 100644
index 00000000..8d639de3
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelDynamics/src/DyArticulationScalar.h
@@ -0,0 +1,101 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+
+#ifndef DY_ARTICULATION_SCALAR_H
+#define DY_ARTICULATION_SCALAR_H
+
+// Scalar helpers for articulations
+
+#include "foundation/PxUnionCast.h"
+#include "DyArticulationUtils.h"
+#include "DySpatial.h"
+#include "PsFPU.h"
+
+namespace physx
+{
+
+namespace Dy
+{
+
+PX_FORCE_INLINE Cm::SpatialVector&	velocityRef(FsData &m, PxU32 i)
+{	// Mutable scalar (Cm::SpatialVector) view of entry i of the velocity array.
+	return unsimdRef(getVelocity(m)[i]);
+}
+
+PX_FORCE_INLINE Cm::SpatialVector&	deferredVelRef(FsData &m, PxU32 i)
+{	// Mutable scalar (Cm::SpatialVector) view of entry i of the deferred-velocity array.
+	return unsimdRef(getDeferredVel(m)[i]);
+}
+
+PX_FORCE_INLINE PxVec3& deferredSZRef(FsData &m, PxU32 i)
+{	// Mutable PxVec3 view of entry i of the deferred-SZ array.
+	return *reinterpret_cast<PxVec3*>(getDeferredSZ(m) + i);
+}
+
+PX_FORCE_INLINE const PxVec3& deferredSZ(const FsData &s, PxU32 i)
+{	// Read-only PxVec3 view of entry i of the deferred-SZ array.
+	return *reinterpret_cast<const PxVec3*>(getDeferredSZ(s) + i);
+}
+
+PX_FORCE_INLINE Cm::SpatialVector& deferredZRef(FsData &s)
+{	// Mutable scalar view of FsData::deferredZ.
+	return reinterpret_cast<Cm::SpatialVector&>(s.deferredZ);
+}
+
+
+PX_FORCE_INLINE const Cm::SpatialVector& deferredZ(const FsData &s)
+{	// Read-only scalar view of FsData::deferredZ.
+	return reinterpret_cast<const Cm::SpatialVector&>(s.deferredZ);
+}
+
+PX_FORCE_INLINE const PxVec3& getJointOffset(const FsJointVectors& j)
+{	// Read-only PxVec3 view of the SIMD jointOffset member.
+	return *reinterpret_cast<const PxVec3*>(&j.jointOffset);
+}
+
+PX_FORCE_INLINE const PxVec3& getParentOffset(const FsJointVectors& j)
+{	// Read-only PxVec3 view of the SIMD parentOffset member.
+	return *reinterpret_cast<const PxVec3*>(&j.parentOffset);
+}
+
+
+
+
+PX_FORCE_INLINE const Cm::SpatialVector* getDSI(const FsRow& row)	// Scalar view of the three DSI entries of a row.
+{	
+	return PxUnionCast<const Cm::SpatialVector*,const Cm::SpatialVectorV*>(row.DSI); // previously: reinterpret_cast<const Cm::SpatialVector*>(row.DSI); NOTE(review): presumably switched to PxUnionCast to avoid an aliasing warning - confirm
+}
+
+}
+
+}
+
+#endif //DY_ARTICULATION_SCALAR_H
diff --git a/PhysX_3.4/Source/LowLevelDynamics/src/DyArticulationUtils.h b/PhysX_3.4/Source/LowLevelDynamics/src/DyArticulationUtils.h
new file mode 100644
index 00000000..67c4270d
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelDynamics/src/DyArticulationUtils.h
@@ -0,0 +1,317 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+
+#ifndef DY_ARTICULATION_H
+#define DY_ARTICULATION_H
+
+#include "PsVecMath.h"
+#include "CmSpatialVector.h"
+#include "DySpatial.h"
+#include "PsBitUtils.h"
+#include "DyArticulation.h"
+#include "DyArticulationHelper.h"
+
+namespace physx
+{
+
+namespace Dy
+{
+	struct ArticulationCore;
+	struct ArticulationLink;
+	typedef size_t ArticulationLinkHandle;
+	class Articulation;
+
+#define DY_ARTICULATION_DEBUG_VERIFY 0
+
+PX_FORCE_INLINE PxU32 ArticulationLowestSetBit(ArticulationBitField val)	// Returns the index of the lowest set bit of val; the asserts below require val to have at least one bit set.
+{
+	PxU32 low = PxU32(val&0xffffffff), high = PxU32(val>>32);	// split into two 32-bit halves
+	PxU32 mask = PxU32((!low)-1);	// all ones when low != 0, zero when low == 0
+	PxU32 result = (mask&Ps::lowestSetBitUnsafe(low)) | ((~mask)&(Ps::lowestSetBitUnsafe(high)+32));	// branch-free select; NOTE(review): both calls run even when their argument is 0, the unused result is masked out
+	PX_ASSERT(val & (PxU64(1)<<result));	// the reported bit is set
+	PX_ASSERT(!(val & ((PxU64(1)<<result)-1)));	// and no lower bit is set
+	return result;
+}
+
+using namespace Ps::aos;
+
+
+
+PX_FORCE_INLINE Cm::SpatialVector& unsimdRef(Cm::SpatialVectorV& v)				{ return *reinterpret_cast<Cm::SpatialVector*>(&v); }		// mutable scalar view of a SIMD spatial vector
+PX_FORCE_INLINE const Cm::SpatialVector& unsimdRef(const Cm::SpatialVectorV& v) { return *reinterpret_cast<const Cm::SpatialVector*>(&v); }	// read-only scalar view of a SIMD spatial vector
+
+
+PX_ALIGN_PREFIX(16)
+struct FsJointVectors	// Per-link joint offsets; one entry per link, reached through getJointVectors()
+{
+	Vec3V					parentOffset;		// 16 bytes world-space offset from parent to child
+	Vec3V					jointOffset;		// 16 bytes world-space offset from child to joint
+}
+PX_ALIGN_SUFFIX(16);
+
+PX_ALIGN_PREFIX(16)
+struct FsRow	// Per-link row of the factorization; the array is reached through getFsRows()
+{	
+	Cm::SpatialVectorV			DSI[3];				// 96 bytes
+	Mat33V					D;					// 48 bytes
+	ArticulationBitField	children;			// 8 bytes bitmap of children
+	ArticulationBitField	pathToRoot;			// 8 bytes bitmap of nodes to root, including self and root
+}
+PX_ALIGN_SUFFIX(16);
+
+PX_COMPILE_TIME_ASSERT(sizeof(FsRow)==160);	// 96 + 48 + 8 + 8, matching the per-member sizes noted above
+
+
+
+PX_ALIGN_PREFIX(16)
+struct FsInertia	// SIMD counterpart of SpInertia, stored as three 3x3 blocks
+{
+	Mat33V ll, la, aa;	// built from SpInertia::mLL, mLA and mAA respectively (see the converting constructor below)
+	PX_FORCE_INLINE FsInertia(const Mat33V& _ll, const Mat33V& _la, const Mat33V& _aa): ll(_ll), la(_la), aa(_aa) {}
+	PX_FORCE_INLINE FsInertia(const SpInertia& I)	// not explicit, so an SpInertia converts implicitly wherever an FsInertia is expected
+	: ll(Mat33V_From_PxMat33(I.mLL)), la(Mat33V_From_PxMat33(I.mLA)), aa(Mat33V_From_PxMat33(I.mAA)) {}
+	PX_FORCE_INLINE FsInertia() {}	// assigns none of the three blocks
+
+	PX_FORCE_INLINE void operator=(const FsInertia& other)	// column-by-column copy; returns void, so assignments cannot be chained
+	{
+		ll.col0 = other.ll.col0;	ll.col1 = other.ll.col1;	ll.col2 = other.ll.col2;
+		la.col0 = other.la.col0;	la.col1 = other.la.col1;	la.col2 = other.la.col2;
+		aa.col0 = other.aa.col0;	aa.col1 = other.aa.col1;	aa.col2 = other.aa.col2;
+	}
+
+	PX_FORCE_INLINE FsInertia(const FsInertia& other)	// same column-by-column copy as operator=
+	{
+		ll.col0 = other.ll.col0;	ll.col1 = other.ll.col1;	ll.col2 = other.ll.col2;
+		la.col0 = other.la.col0;	la.col1 = other.la.col1;	la.col2 = other.la.col2;
+		aa.col0 = other.aa.col0;	aa.col1 = other.aa.col1;	aa.col2 = other.aa.col2;
+	}
+
+}PX_ALIGN_SUFFIX(16);
+
+PX_ALIGN_PREFIX(16)
+struct LtbRow	// Per-link row of the LTB data; the array is reached through getLtbRows()
+{
+	FsInertia		inertia;			// body inertia in world space
+	Cm::SpatialVectorV		j0[3], j1[3];		// jacobians
+	Mat33V				jResponse;			// inverse response matrix of joint
+	Vec3V				jC;	// NOTE(review): not referenced anywhere in the visible code - meaning to be confirmed
+} PX_ALIGN_SUFFIX(16);
+
+PX_ALIGN_PREFIX(16)
+struct FsRowAux	// Per-link auxiliary data; the array is reached through getAux()
+{
+	Cm::SpatialVectorV		S[3];				// motion subspace
+}PX_ALIGN_SUFFIX(16);
+
+
+struct FsData	// Header of one articulation's data block; the arrays returned by the get*() accessors live at byte offsets relative to this object
+{
+	Articulation*	articulationX;																//4
+	
+#if !PX_P64_FAMILY
+	PxU32				pad0;																		//8	
+#endif
+	PxU16				linkCount;						// number of links							//10
+	PxU16				jointVectorOffset;				// byte offset to the FsJointVectors array	//12
+	PxU16				maxSolverNormalProgress;													//14
+	PxU16				maxSolverFrictionProgress;													//16
+
+	PxU64				dirty;																		//24
+	PxU16				ltbDataOffset;					// byte offset to the LtbRow array			//26
+	PxU16				fsDataOffset;					// byte offset to root inverse inertia + FsRows	//28
+	PxU32				solverProgress;																//32
+	
+
+	Cm::SpatialVectorV		deferredZ;																	//64
+	PxU8				parent[DY_ARTICULATION_MAX_SIZE];											//128
+};	// the trailing //N markers give the running byte total after each member
+
+PX_COMPILE_TIME_ASSERT(0 == (sizeof(FsData) & 0x0f));	// size must be a multiple of 16; the velocity array starts at sizeof(FsData)
+
+#define SOLVER_BODY_SOLVER_PROGRESS_OFFSET 28	// byte offset of FsData::solverProgress according to the markers above
+#define SOLVER_BODY_MAX_SOLVER_PROGRESS_OFFSET 12	// byte offset of FsData::maxSolverNormalProgress according to the markers above
+
+namespace	// file-local helpers: advance an untyped address by a byte count and cast the result to T
+{
+	template<class T> PX_FORCE_INLINE T addAddr(void* addr, PxU32 increment)
+	{
+		return reinterpret_cast<T>(static_cast<char*>(addr)+increment);
+	}
+
+	template<class T> PX_FORCE_INLINE T addAddr(const void* addr, PxU32 increment)
+	{
+		return reinterpret_cast<T>(static_cast<const char*>(addr)+increment);
+	}
+}
+
+// Velocity array: starts sizeof(FsData) bytes past the header.
+PX_FORCE_INLINE const Cm::SpatialVectorV* getVelocity(const FsData& matrix)
+{
+	return addAddr<const Cm::SpatialVectorV*>(&matrix, sizeof(FsData));
+}
+
+// Mutable overload: same address as the const overload, with constness removed.
+PX_FORCE_INLINE Cm::SpatialVectorV* getVelocity(FsData& matrix)
+{
+	return const_cast<Cm::SpatialVectorV*>(getVelocity(static_cast<const FsData&>(matrix)));
+}
+
+
+PX_FORCE_INLINE const Cm::SpatialVectorV* getDeferredVel(const FsData& matrix)	// starts linkCount SpatialVectorV entries past getVelocity()
+{
+	return addAddr<const Cm::SpatialVectorV*>(getVelocity(matrix), sizeof(Cm::SpatialVectorV) * matrix.linkCount);
+}
+
+PX_FORCE_INLINE Cm::SpatialVectorV* getDeferredVel(FsData& matrix)	// mutable overload: same address as the const overload
+{
+	return const_cast<Cm::SpatialVectorV*>(getDeferredVel(static_cast<const FsData&>(matrix)));
+}
+
+PX_FORCE_INLINE const Vec3V* getDeferredSZ(const FsData& matrix)	// starts linkCount SpatialVectorV entries past getDeferredVel()
+{
+	return addAddr<const Vec3V*>(getDeferredVel(matrix), sizeof(Cm::SpatialVectorV) * matrix.linkCount);
+}
+
+PX_FORCE_INLINE Vec3V* getDeferredSZ(FsData& matrix)	// mutable overload: same address as the const overload
+{
+	return const_cast<Vec3V*>(getDeferredSZ(static_cast<const FsData&>(matrix)));
+}
+
+PX_FORCE_INLINE const PxReal* getMaxPenBias(const FsData& matrix)	// starts linkCount Vec3V entries past getDeferredSZ()
+{
+	return addAddr<const PxReal*>(getDeferredSZ(matrix), sizeof(Vec3V) * matrix.linkCount);
+}
+
+PX_FORCE_INLINE PxReal* getMaxPenBias(FsData& matrix)	// mutable overload: same address as the const overload
+{
+	return const_cast<PxReal*>(getMaxPenBias(static_cast<const FsData&>(matrix)));
+}
+
+
+PX_FORCE_INLINE const FsJointVectors* getJointVectors(const FsData& matrix)	// located jointVectorOffset bytes past the header
+{
+	return addAddr<const FsJointVectors *>(&matrix,matrix.jointVectorOffset);
+}
+
+PX_FORCE_INLINE FsJointVectors* getJointVectors(FsData& matrix)	// mutable overload: same address as the const overload
+{
+	return const_cast<FsJointVectors*>(getJointVectors(static_cast<const FsData&>(matrix)));
+}
+
+// Root inverse inertia: located fsDataOffset bytes past the header.
+PX_FORCE_INLINE const FsInertia& getRootInverseInertia(const FsData& matrix)
+{
+	return *addAddr<const FsInertia*>(&matrix,matrix.fsDataOffset);
+}
+
+PX_FORCE_INLINE FsInertia& getRootInverseInertia(FsData& matrix)	// mutable overload: same object as the const overload
+{
+	return const_cast<FsInertia&>(getRootInverseInertia(static_cast<const FsData&>(matrix)));
+}
+
+PX_FORCE_INLINE const FsRow* getFsRows(const FsData& matrix)	// the rows start directly after the root inverse inertia
+{
+	return addAddr<const FsRow*>(&getRootInverseInertia(matrix),sizeof(FsInertia));
+}
+
+PX_FORCE_INLINE FsRow* getFsRows(FsData& matrix)	// mutable overload: same address as the const overload
+{
+	return const_cast<FsRow*>(getFsRows(static_cast<const FsData&>(matrix)));
+}
+
+
+PX_FORCE_INLINE const LtbRow* getLtbRows(const FsData& matrix)	// located ltbDataOffset bytes past the header
+{
+	return addAddr<const LtbRow*>(&matrix,matrix.ltbDataOffset);
+}
+
+PX_FORCE_INLINE LtbRow* getLtbRows(FsData& matrix)	// mutable overload: same address as the const overload
+{
+	return const_cast<LtbRow*>(getLtbRows(static_cast<const FsData&>(matrix)));
+}
+
+
+PX_FORCE_INLINE const Cm::SpatialVectorV* getRefVelocity(const FsData& matrix)	// starts linkCount LtbRow entries past getLtbRows()
+{
+	return addAddr<const Cm::SpatialVectorV*>(getLtbRows(matrix), sizeof(LtbRow)*matrix.linkCount);
+}
+
+PX_FORCE_INLINE Cm::SpatialVectorV* getRefVelocity(FsData& matrix)	// mutable overload: same address as the const overload
+{
+	return const_cast<Cm::SpatialVectorV*>(getRefVelocity(static_cast<const FsData&>(matrix)));
+}
+
+PX_FORCE_INLINE const FsRowAux* getAux(const FsData& matrix)	// starts linkCount SpatialVectorV entries past getRefVelocity()
+{
+	return addAddr<const FsRowAux*>(getRefVelocity(matrix),sizeof(Cm::SpatialVectorV)*matrix.linkCount);
+}
+
+PX_FORCE_INLINE FsRowAux* getAux(FsData& matrix)	// mutable overload: same address as the const overload
+{
+	return const_cast<FsRowAux*>(getAux(static_cast<const FsData&>(matrix)));
+}
+
+void PxcFsApplyImpulse(FsData& matrix,	// articulation data; taken by non-const reference. The definition is not part of this header.
+					   PxU32 linkID,	// link the impulse is applied to
+					   Vec3V linear,	// linear part of the impulse
+					   Vec3V angular);	// angular part of the impulse
+
+Cm::SpatialVectorV PxcFsGetVelocity(FsData& matrix,	// returns the velocity of one link; NOTE(review): takes non-const data, presumably because deferred terms are flushed on demand - confirm
+							    PxU32 linkID);	// link whose velocity is requested
+
+
+#if DY_ARTICULATION_DEBUG_VERIFY
+namespace ArticulationRef 	// scalar reference routines, declared only in verification builds
+{	
+	Cm::SpatialVector propagateVelocity(const FsRow& row,	// returns the propagated velocity for one link
+										const FsJointVectors& jv,
+										const PxVec3& SZ, 	// per-link value written earlier by propagateImpulse
+										const Cm::SpatialVector& v, 	// velocity to propagate
+										const FsRowAux& aux); 
+
+	Cm::SpatialVector propagateImpulse(const FsRow& row, 	// returns the propagated impulse for one link
+									   const FsJointVectors& jv,
+									   PxVec3& SZ, 	// out parameter
+									   const Cm::SpatialVector& Z,	// impulse to propagate
+									   const FsRowAux& aux); 
+
+	void applyImpulse(const FsData& matrix,	// adds the velocity change caused by the impulse to 'velocity'
+					  Cm::SpatialVector* velocity,	// in/out: one entry per link
+					  PxU32 linkID, 	// link the impulse is applied to
+					  const Cm::SpatialVector& impulse);
+
+}
+#endif
+
+}
+}
+
+#endif //DY_ARTICULATION_H
diff --git a/PhysX_3.4/Source/LowLevelDynamics/src/DyBodyCoreIntegrator.h b/PhysX_3.4/Source/LowLevelDynamics/src/DyBodyCoreIntegrator.h
new file mode 100644
index 00000000..3e842341
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelDynamics/src/DyBodyCoreIntegrator.h
@@ -0,0 +1,405 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+#ifndef DY_BODYCORE_INTEGRATOR_H
+#define DY_BODYCORE_INTEGRATOR_H
+
+#include "CmPhysXCommon.h"
+#include "PxvDynamics.h"
+#include "PsMathUtils.h"
+#include "PxsRigidBody.h"
+#include "DySolverBody.h"
+#include "DySleepingConfigulation.h"
+#include "PxsIslandSim.h"
+
+namespace physx
+{
+
+namespace Dy
+{
+
+PX_FORCE_INLINE void bodyCoreComputeUnconstrainedVelocity
+(const PxVec3& gravity, const PxReal dt, const PxReal linearDamping, const PxReal angularDamping, const PxReal accelScale, 
+const PxReal maxLinearVelocitySq, const PxReal maxAngularVelocitySq, PxVec3& inOutLinearVelocity, PxVec3& inOutAngularVelocity,
+bool disableGravity)
+{	// Updates the two in/out velocities in place: adds gravity*dt*accelScale (unless disableGravity), applies damping, then clamps each squared magnitude to its maximum.
+
+	//Pre-multiply everything that needs to be multiplied by dt to improve code generation.
+
+	PxVec3 linearVelocity = inOutLinearVelocity;
+	PxVec3 angularVelocity = inOutAngularVelocity;
+	
+	const PxReal linearDampingTimesDT=linearDamping*dt;
+	const PxReal angularDampingTimesDT=angularDamping*dt;
+	const PxReal oneMinusLinearDampingTimesDT=1.0f-linearDampingTimesDT;
+	const PxReal oneMinusAngularDampingTimesDT=1.0f-angularDampingTimesDT;
+
+	//TODO context-global gravity
+	if (!disableGravity)
+	{
+		const PxVec3 linearAccelTimesDT = gravity*dt *accelScale;
+		linearVelocity += linearAccelTimesDT;	// gravity is added before damping, so the gravity contribution is damped too
+	}
+
+	//Apply damping. The multiplier is (1 - damping*dt); fsel presumably returns its 2nd argument when the 1st is >= 0 and the 3rd otherwise, which would clamp the multiplier at 0 - confirm.
+	const PxReal linVelMultiplier = physx::intrinsics::fsel(oneMinusLinearDampingTimesDT, oneMinusLinearDampingTimesDT, 0.0f);
+	const PxReal angVelMultiplier = physx::intrinsics::fsel(oneMinusAngularDampingTimesDT, oneMinusAngularDampingTimesDT, 0.0f);
+	linearVelocity*=linVelMultiplier;
+	angularVelocity*=angVelMultiplier;
+
+	// Clamp velocity: rescale so the squared magnitude equals the maximum when it is exceeded
+	const PxReal linVelSq = linearVelocity.magnitudeSquared();
+	if(linVelSq > maxLinearVelocitySq)
+	{
+		linearVelocity *= PxSqrt(maxLinearVelocitySq / linVelSq);
+	}
+	const PxReal angVelSq = angularVelocity.magnitudeSquared();
+	if(angVelSq > maxAngularVelocitySq)
+	{
+		angularVelocity *= PxSqrt(maxAngularVelocitySq / angVelSq);
+	}
+
+	inOutLinearVelocity = linearVelocity;
+	inOutAngularVelocity = angularVelocity;
+}
+
+
+PX_FORCE_INLINE void integrateCore(PxVec3& motionLinearVelocity, PxVec3& motionAngularVelocity, PxSolverBody& solverBody, PxSolverBodyData& solverBodyData, const PxF32 dt)
+{	// Applies per-axis lock flags, advances solverBodyData.body2World by dt, accumulates the solver velocities into solverBodyData, and writes the velocities used for the step back into the two motion*Velocity arguments.
+	PxU32 lockFlags = solverBodyData.lockFlags;
+	if (lockFlags)
+	{
+		// For each locked axis, zero both the motion velocity and the solver velocity component.
+		if (lockFlags & PxRigidDynamicLockFlag::eLOCK_LINEAR_X)
+		{
+			motionLinearVelocity.x = 0.f;
+			solverBody.linearVelocity.x = 0.f;
+		}
+		if (lockFlags & PxRigidDynamicLockFlag::eLOCK_LINEAR_Y)
+		{
+			motionLinearVelocity.y = 0.f;
+			solverBody.linearVelocity.y = 0.f;
+		}
+		if (lockFlags & PxRigidDynamicLockFlag::eLOCK_LINEAR_Z)
+		{
+			motionLinearVelocity.z = 0.f;
+			solverBody.linearVelocity.z = 0.f;
+		}
+		
+		//The angular velocity should be 0 because it is now impossible to make it rotate around that axis!
+		if (lockFlags & PxRigidDynamicLockFlag::eLOCK_ANGULAR_X)
+		{
+			motionAngularVelocity.x = 0.f;
+			solverBody.angularState.x = 0.f;
+		}
+		if (lockFlags & PxRigidDynamicLockFlag::eLOCK_ANGULAR_Y)
+		{
+			motionAngularVelocity.y = 0.f;
+			solverBody.angularState.y = 0.f;
+		}
+		if (lockFlags & PxRigidDynamicLockFlag::eLOCK_ANGULAR_Z)
+		{
+			motionAngularVelocity.z = 0.f;
+			solverBody.angularState.z = 0.f;
+		}
+	}
+
+	// Integrate linear part: the position advances by (stored velocity + motion velocity) * dt
+	PxVec3 linearMotionVel = solverBodyData.linearVelocity + motionLinearVelocity;
+	PxVec3 delta = linearMotionVel * dt;
+	PxVec3 angularMotionVel = solverBodyData.angularVelocity + solverBodyData.sqrtInvInertia * motionAngularVelocity;	// the angular motion velocity is multiplied by sqrtInvInertia before being added
+	PxReal w = angularMotionVel.magnitudeSquared();	// squared angular speed; the square root is only taken below when it is non-zero
+	solverBodyData.body2World.p += delta;
+	PX_ASSERT(solverBodyData.body2World.p.isFinite());
+
+	//Store back the linear and angular velocities (note: this happens after linearMotionVel/angularMotionVel were computed from the old stored values)
+	//core.linearVelocity += solverBody.linearVelocity * solverBodyData.sqrtInvMass;
+	solverBodyData.linearVelocity += solverBody.linearVelocity;
+	solverBodyData.angularVelocity += solverBodyData.sqrtInvInertia * solverBody.angularState;
+	
+	// Integrate the rotation using closed form quaternion integrator
+	if (w != 0.0f)	// with zero angular velocity the orientation is left untouched (this also guards the division by w below)
+	{
+		w = PxSqrt(w);
+		// Perform a post-solver clamping
+		// TODO(dsequeira): ignore this for the moment
+		//just clamp motionVel to half float-range
+		const PxReal maxW = 1e+7f;		//Should be about sqrt(PX_MAX_REAL/2) or smaller
+		if (w > maxW)
+		{
+			angularMotionVel = angularMotionVel.getNormalized() * maxW;
+			w = maxW;
+		}
+		const PxReal v = dt * w * 0.5f;	// half of the rotation angle covered during dt
+		PxReal s, q;
+		Ps::sincos(v, s, q);	// presumably s = sin(v), q = cos(v) - confirm against Ps::sincos
+		s /= w;	// so that angularMotionVel * s has length sin(v)
+
+		const PxVec3 pqr = angularMotionVel * s;
+		const PxQuat quatVel(pqr.x, pqr.y, pqr.z, 0);
+		PxQuat result = quatVel * solverBodyData.body2World.q;
+
+		result += solverBodyData.body2World.q * q;	// result = (pqr, 0)*q_old + q*q_old, i.e. the quaternion (pqr, q) multiplied onto the old orientation from the left
+
+		solverBodyData.body2World.q = result.getNormalized();
+		PX_ASSERT(solverBodyData.body2World.q.isSane());
+		PX_ASSERT(solverBodyData.body2World.q.isFinite());
+	}
+
+	motionLinearVelocity = linearMotionVel;	// outputs: the (possibly clamped) velocities that were used for this integration step
+	motionAngularVelocity = angularMotionVel;
+}
+
+
+PX_FORCE_INLINE PxReal updateWakeCounter(PxsRigidBody* originalBody, PxReal dt, PxReal /*invDt*/, const bool enableStabilization, const bool useAdaptiveForce, Cm::SpatialVector& motionVelocity,
+	bool hasStaticTouch)
+{	// Updates the body's freeze/accel-scale/sleep bookkeeping from this step's motionVelocity and returns the new wake counter, which is also written to bodyCore.solverWakeCounter. The third parameter is unused.
+	//KS - at most one of these features can be enabled at any time
+	PX_ASSERT(!useAdaptiveForce || !enableStabilization);
+	PxsBodyCore& bodyCore = originalBody->getCore();
+
+	// update the body's sleep state and wake counter
+	PxReal wakeCounterResetTime = 20.0f*0.02f;	// = 0.4 (presumably 20 steps of 0.02s - confirm)
+
+	PxReal wc = bodyCore.wakeCounter;
+
+	{
+		if (enableStabilization)
+		{
+			bool freeze = false;
+			const PxTransform& body2World = bodyCore.body2World;
+
+			// calculate normalized energy: kinetic energy divided by mass
+
+			const PxVec3 t = bodyCore.inverseInertia;
+			const PxVec3 inertia(t.x > 0.f ? 1.0f / t.x : 1.f, t.y > 0.f ? 1.0f / t.y : 1.f, t.z > 0.f ? 1.0f / t.z : 1.f);	// per-axis reciprocal of inverseInertia; components that are not > 0 fall back to 1
+
+
+			PxVec3 sleepLinVelAcc = motionVelocity.linear;
+			PxVec3 sleepAngVelAcc = body2World.q.rotateInv(motionVelocity.angular);	// angular velocity rotated by the inverse body orientation, to match the per-axis inertia above
+
+			// scale threshold by cluster factor (more contacts => higher sleep threshold)
+			//const PxReal clusterFactor = PxReal(1u + getNumUniqueInteractions());
+
+			PxReal invMass = bodyCore.inverseMass;
+			if (invMass == 0.f)
+				invMass = 1.f;	// zero inverse mass: use 1 so the angular term below is not zeroed out
+
+			const PxReal angular = sleepAngVelAcc.multiply(sleepAngVelAcc).dot(inertia) * invMass;	// sum_i(w_i^2 * inertia_i) * invMass
+			const PxReal linear = sleepLinVelAcc.magnitudeSquared();
+			PxReal frameNormalizedEnergy = 0.5f * (angular + linear);	// energy of this step only (not accumulated)
+
+			const PxReal cf = hasStaticTouch ? PxReal(PxMin(10u, bodyCore.numBodyInteractions)) : 0.f;	// interaction count capped at 10; 0 without a static touch
+			const PxReal freezeThresh = cf*bodyCore.freezeThreshold;	// 0 when cf == 0, in which case the >= test below succeeds for any non-negative energy and the body is never "settled"
+
+			originalBody->freezeCount = PxMax(originalBody->freezeCount - dt, 0.0f);	// count down, clamped at zero
+			bool settled = true;
+
+			PxReal accelScale = PxMin(1.f, originalBody->accelScale + dt);	// accelScale grows by dt per step, capped at 1
+
+			if (!hasStaticTouch)
+				accelScale = 1.f;
+
+			if (frameNormalizedEnergy >= freezeThresh)
+			{
+				settled = false;
+				originalBody->freezeCount = PXD_FREEZE_INTERVAL;	// restart the freeze countdown
+			}
+
+			if (settled)
+			{
+				//Dampen bodies that are just about to go to sleep
+				if (cf > 1.f)
+				{
+					const PxReal sleepDamping = PXD_SLEEP_DAMPING;
+					const PxReal sleepDampingTimesDT = sleepDamping*dt;
+					const PxReal d = 1.0f - sleepDampingTimesDT;	// NOTE(review): not clamped at 0, unlike the damping multiplier in bodyCoreComputeUnconstrainedVelocity
+					bodyCore.linearVelocity = bodyCore.linearVelocity * d;
+					bodyCore.angularVelocity = bodyCore.angularVelocity * d;
+					accelScale = PXD_FREEZE_SCALE;
+				}
+				freeze = originalBody->freezeCount == 0.f && frameNormalizedEnergy < (bodyCore.freezeThreshold * PXD_FREEZE_TOLERANCE);	// freeze only once the countdown has expired and the energy is below the tolerance-scaled threshold
+			}
+
+			originalBody->accelScale = accelScale;
+
+			if (freeze)
+			{
+				//current flag isn't frozen but freeze flag raise so we need to raise the frozen flag in this frame
+				bool wasNotFrozen = (originalBody->mInternalFlags & PxsRigidBody::eFROZEN) == 0;
+				PxU16 flags = PxU16((originalBody->mInternalFlags & PxsRigidBody::eDISABLE_GRAVITY) | PxsRigidBody::eFROZEN);	// keeps only eDISABLE_GRAVITY from the old flags and sets eFROZEN
+				if (wasNotFrozen)
+				{
+					flags |= PxsRigidBody::eFREEZE_THIS_FRAME;
+				}
+				originalBody->mInternalFlags = flags;
+				bodyCore.body2World = originalBody->getLastCCDTransform();	// the pose is overwritten with the body's last CCD transform while frozen
+			}
+			else
+			{
+				PxU16 flags = PxU16(originalBody->mInternalFlags & PxsRigidBody::eDISABLE_GRAVITY);	// keeps only eDISABLE_GRAVITY; all other internal flags are cleared
+				bool wasFrozen = (originalBody->mInternalFlags & PxsRigidBody::eFROZEN) != 0;
+				if (wasFrozen)
+				{
+					flags |= PxsRigidBody::eUNFREEZE_THIS_FRAME;
+				}
+				originalBody->mInternalFlags = flags;
+			}
+
+			/*KS: New algorithm for sleeping when using stabilization:
+			* Energy *this frame* must be higher than sleep threshold and accumulated energy over previous frames
+			* must be higher than clusterFactor*energyThreshold for the wake counter to be reset.
+			*/
+			if (wc < wakeCounterResetTime * 0.5f || wc < dt)
+			{
+				//Accumulate energy
+				originalBody->sleepLinVelAcc += sleepLinVelAcc;
+				originalBody->sleepAngVelAcc += sleepAngVelAcc;
+
+				//If energy this frame is high
+				if (frameNormalizedEnergy >= bodyCore.sleepThreshold)
+				{
+					//Compute energy over sleep preparation time
+					const PxReal sleepAngular = originalBody->sleepAngVelAcc.multiply(originalBody->sleepAngVelAcc).dot(inertia) * invMass;
+					const PxReal sleepLinear = originalBody->sleepLinVelAcc.magnitudeSquared();
+					PxReal normalizedEnergy = 0.5f * (sleepAngular + sleepLinear);
+					const PxReal sleepClusterFactor = PxReal(1u + bodyCore.numCountedInteractions);
+					// scale threshold by cluster factor (more contacts => higher sleep threshold)
+					const PxReal threshold = sleepClusterFactor*bodyCore.sleepThreshold;
+
+					//If energy over sleep preparation time is high
+					if (normalizedEnergy >= threshold)
+					{
+						//Wake up
+						//PX_ASSERT(isActive());
+						originalBody->sleepAngVelAcc = PxVec3(0);
+						originalBody->sleepLinVelAcc = PxVec3(0);
+
+						const float factor = bodyCore.sleepThreshold == 0.f ? 2.0f : PxMin(normalizedEnergy / threshold, 2.0f);	// energy/threshold ratio capped at 2; the zero test avoids dividing by a zero threshold
+						PxReal oldWc = wc;
+						wc = factor * 0.5f * wakeCounterResetTime + dt * (sleepClusterFactor - 1.0f);	// between 0.5x and 1x the reset time, plus dt per counted interaction
+						bodyCore.solverWakeCounter = wc;
+						//if (oldWc == 0.0f)  // for the case where a sleeping body got activated by the system (not the user) AND got processed by the solver as well
+						//	notifyNotReadyForSleeping(bodyCore.nodeIndex);
+
+						if (oldWc == 0.0f)
+							originalBody->mInternalFlags |= PxsRigidBody::eACTIVATE_THIS_FRAME;
+
+						return wc;	// early out: the counter was reset, so the count-down at the end of the function is skipped
+					}
+				}
+			}
+
+		}
+		else 
+		{
+			if (useAdaptiveForce)
+			{
+				if (hasStaticTouch && bodyCore.numBodyInteractions > 1)
+					originalBody->accelScale = 1.f / PxReal(bodyCore.numBodyInteractions);	// acceleration scale is the reciprocal of the interaction count
+				else
+					originalBody->accelScale = 1.f;
+			}
+			if (wc < wakeCounterResetTime * 0.5f || wc < dt)
+			{
+				const PxTransform& body2World = bodyCore.body2World;
+
+				// calculate normalized energy: kinetic energy divided by mass
+				const PxVec3 t = bodyCore.inverseInertia;
+				const PxVec3 inertia(t.x > 0.f ? 1.0f / t.x : 1.f, t.y > 0.f ? 1.0f / t.y : 1.f, t.z > 0.f ? 1.0f / t.z : 1.f);	// per-axis reciprocal of inverseInertia; components that are not > 0 fall back to 1
+
+				PxVec3 sleepLinVelAcc = motionVelocity.linear;
+				PxVec3 sleepAngVelAcc = body2World.q.rotateInv(motionVelocity.angular);
+
+				originalBody->sleepLinVelAcc += sleepLinVelAcc;	// unlike the stabilization path, the energy test below uses only the accumulated velocities
+				originalBody->sleepAngVelAcc += sleepAngVelAcc;
+
+				PxReal invMass = bodyCore.inverseMass;
+				if (invMass == 0.f)
+					invMass = 1.f;
+
+				const PxReal angular = originalBody->sleepAngVelAcc.multiply(originalBody->sleepAngVelAcc).dot(inertia) * invMass;
+				const PxReal linear = originalBody->sleepLinVelAcc.magnitudeSquared();
+				PxReal normalizedEnergy = 0.5f * (angular + linear);
+
+				// scale threshold by cluster factor (more contacts => higher sleep threshold)
+				const PxReal clusterFactor = PxReal(1 + bodyCore.numCountedInteractions);
+				const PxReal threshold = clusterFactor*bodyCore.sleepThreshold;
+
+				if (normalizedEnergy >= threshold)
+				{
+					//PX_ASSERT(isActive());
+					originalBody->sleepLinVelAcc = PxVec3(0);
+					originalBody->sleepAngVelAcc = PxVec3(0);
+					const float factor = threshold == 0.f ? 2.0f : PxMin(normalizedEnergy / threshold, 2.0f);	// energy/threshold ratio capped at 2; the zero test avoids dividing by a zero threshold
+					PxReal oldWc = wc;
+					wc = factor * 0.5f * wakeCounterResetTime + dt * (clusterFactor - 1.0f);
+					bodyCore.solverWakeCounter = wc;
+					PxU16 flags = PxU16(originalBody->mInternalFlags & PxsRigidBody::eDISABLE_GRAVITY);	// keeps only eDISABLE_GRAVITY; all other internal flags are cleared
+					if (oldWc == 0.0f)  // for the case where a sleeping body got activated by the system (not the user) AND got processed by the solver as well
+					{
+						flags |= PxsRigidBody::eACTIVATE_THIS_FRAME;
+						//notifyNotReadyForSleeping(bodyCore.nodeIndex);
+					}
+
+					originalBody->mInternalFlags = flags;
+
+					return wc;	// early out: the counter was reset, so the count-down below is skipped
+				}
+			}
+		}
+	}
+
+	wc = PxMax(wc - dt, 0.0f);	// no reset this step: count down by dt, clamped at zero
+	bodyCore.solverWakeCounter = wc;
+	return wc;
+}
+
+PX_FORCE_INLINE void sleepCheck(PxsRigidBody* originalBody, const PxReal dt, const PxReal intDt, const bool enableStabilization, bool useAdaptiveForce, Cm::SpatialVector& motionVelocity,
+	bool hasStaticTouch)
+{	// Runs updateWakeCounter() and, if the resulting wake counter is exactly zero, flags the body for deactivation this frame and clears its velocity accumulators.
+
+	PxReal wc = updateWakeCounter(originalBody, dt, intDt, enableStabilization, useAdaptiveForce, motionVelocity, hasStaticTouch);	// intDt lands in updateWakeCounter's unnamed (unused) third parameter
+	bool wakeCounterZero = (wc == 0.0f);
+
+	if (wakeCounterZero)
+	{
+		//PxsBodyCore& bodyCore = originalBody->getCore();
+		originalBody->mInternalFlags |= PxsRigidBody::eDEACTIVATE_THIS_FRAME;
+		//	notifyReadyForSleeping(bodyCore.nodeIndex);
+		originalBody->sleepLinVelAcc = PxVec3(0);
+		originalBody->sleepAngVelAcc = PxVec3(0);
+	}
+}
+
+}
+
+}
+
+#endif //DY_BODYCORE_INTEGRATOR_H
diff --git a/PhysX_3.4/Source/LowLevelDynamics/src/DyConstraintPartition.cpp b/PhysX_3.4/Source/LowLevelDynamics/src/DyConstraintPartition.cpp
new file mode 100644
index 00000000..03751640
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelDynamics/src/DyConstraintPartition.cpp
@@ -0,0 +1,712 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+
+#include "DyConstraintPartition.h"
+#include "DyArticulationUtils.h"
+
+#define INTERLEAVE_SELF_CONSTRAINTS 1
+
+
+namespace physx
+{
+namespace Dy
+{
+
+namespace
+{
+
+PX_FORCE_INLINE PxU32 getArticulationIndex(const uintptr_t eaFsData, const uintptr_t* eas, const PxU32 numEas)
+{	// Linear search: returns the index of the first entry of eas[0..numEas) equal to eaFsData, or 0xffffffff if there is none (which trips the assert below in builds where PX_ASSERT is active).
+	PxU32 index=0xffffffff;
+	for(PxU32 i=0;i<numEas;i++)
+	{
+		if(eas[i]==eaFsData)
+		{
+			index=i;
+			break;
+		}
+	}
+	PX_ASSERT(index!=0xffffffff);
+	return index;
+}
+
+
+#define MAX_NUM_PARTITIONS 32	// partitions handled per pass (one per bit of a PxU32 solverProgress mask); also used below as the "no free partition" sentinel
+
+static PxU32 bitTable[32] = 	// lookup table: bitTable[i] == 1u<<i for i in [0, 32)
+{
+	1u<<0, 1u<<1, 1u<<2, 1u<<3, 1u<<4, 1u<<5, 1u<<6, 1u<<7, 1u<<8, 1u<<9, 1u<<10, 1u<<11, 1u<<12, 1u<<13, 1u<<14, 1u<<15, 1u<<16, 1u<<17,
+	1u<<18, 1u<<19, 1u<<20, 1u<<21, 1u<<22, 1u<<23, 1u<<24, 1u<<25, 1u<<26, 1u<<27, 1u<<28, 1u<<29, 1u<<30, 1u<<31
+};
+
+PxU32 getBit(const PxU32 index)
+{	// Returns the single-bit mask 1u<<index via bitTable; index must be < 32 (only checked by the assert).
+	PX_ASSERT(index < 32);
+	return bitTable[index];
+}
+
+
+class RigidBodyClassification
+{	// Classification policy for constraints between plain solver bodies: maps a descriptor's body pointers to indices into mBodies and maintains the per-body partitioning state.
+	PxSolverBody* PX_RESTRICT mBodies;	// base of the solver body array; not owned (never freed here)
+	PxU32 mNumBodies;	// number of entries of mBodies treated as active
+
+public:
+	RigidBodyClassification(PxSolverBody* PX_RESTRICT bodies, PxU32 numBodies) : mBodies(bodies), mNumBodies(numBodies)
+	{
+	}
+
+	//Returns true if it is a dynamic-dynamic constraint; false if it is a dynamic-static or dynamic-kinematic constraint
+	PX_FORCE_INLINE bool classifyConstraint(const PxSolverConstraintDesc& desc, uintptr_t& indexA, uintptr_t& indexB, bool& activeA, bool& activeB) const
+	{
+		indexA=uintptr_t(desc.bodyA - mBodies);	// element offset from the array base; a pointer below mBodies gives a negative difference that becomes a huge unsigned value
+		indexB=uintptr_t(desc.bodyB - mBodies);
+		activeA = indexA < mNumBodies;	// single unsigned compare: true only for bodies inside [mBodies, mBodies + mNumBodies)
+		activeB = indexB < mNumBodies;
+		return activeA && activeB;
+	}
+
+	PX_FORCE_INLINE void clearState()
+	{	// Resets the partition bitmask (solverProgress) of every body.
+		for(PxU32 a = 0; a < mNumBodies; ++a)
+			mBodies[a].solverProgress = 0;
+	}
+
+	PX_FORCE_INLINE void reserveSpaceForStaticConstraints(Ps::Array<PxU32>& numConstraintsPerPartition)
+	{	// For every body: resets solverProgress and adds one to each of the maxSolverFrictionProgress partition counters that start at index maxSolverNormalProgress, growing the array when needed.
+		for(PxU32 a = 0; a < mNumBodies; ++a)
+		{
+			mBodies[a].solverProgress = 0;
+
+			PxU32 requiredSize = PxU32(mBodies[a].maxSolverNormalProgress + mBodies[a].maxSolverFrictionProgress);
+			if(requiredSize > numConstraintsPerPartition.size())
+			{
+				numConstraintsPerPartition.resize(requiredSize);	// presumably new entries are zero-initialised by resize - confirm against Ps::Array
+			}
+
+			for(PxU32 b = 0; b < mBodies[a].maxSolverFrictionProgress; ++b)
+			{
+				numConstraintsPerPartition[mBodies[a].maxSolverNormalProgress + b]++;
+			}
+		}
+	}
+};
+
+class ExtendedRigidBodyClassification
+{	// Classification policy that handles articulations as well as plain solver bodies: articulations are given indices mNumBodies + (position in mFsDatas).
+
+	PxSolverBody* PX_RESTRICT mBodies;	// base of the solver body array; not owned (never freed here)
+	PxU32 mNumBodies;
+	uintptr_t* PX_RESTRICT mFsDatas;	// articulation addresses stored as integers; reinterpret_cast to FsData* below
+	PxU32 mNumArticulations;
+
+public:
+
+	ExtendedRigidBodyClassification(PxSolverBody* PX_RESTRICT bodies, PxU32 numBodies, uintptr_t* PX_RESTRICT fsDatas, PxU32 numArticulations)
+		: mBodies(bodies), mNumBodies(numBodies), mFsDatas(fsDatas), mNumArticulations(numArticulations)
+	{
+	}
+
+	//Returns true if it is a dynamic-dynamic constraint; false if it is a dynamic-static or dynamic-kinematic constraint
+	PX_FORCE_INLINE bool classifyConstraint(const PxSolverConstraintDesc& desc, uintptr_t& indexA, uintptr_t& indexB, bool& activeA, bool& activeB) const
+	{
+		if(PxSolverConstraintDesc::NO_LINK == desc.linkIndexA)	// NO_LINK: side A is a plain solver body
+		{
+			indexA=uintptr_t(desc.bodyA - mBodies);
+			activeA = indexA < mNumBodies;	// unsigned compare: true only for bodies inside [mBodies, mBodies + mNumBodies)
+		}
+		else
+		{
+			indexA=mNumBodies+getArticulationIndex(uintptr_t(desc.articulationA),mFsDatas,mNumArticulations);	// articulation indices follow the body indices
+			activeA = true;	// an articulation side is always treated as active
+		}
+		if(PxSolverConstraintDesc::NO_LINK == desc.linkIndexB)
+		{
+			indexB=uintptr_t(desc.bodyB - mBodies);
+			activeB = indexB < mNumBodies;
+		}
+		else
+		{
+			indexB=mNumBodies+getArticulationIndex(uintptr_t(desc.articulationB),mFsDatas,mNumArticulations);
+			activeB = true;
+		}
+		return activeA && activeB;
+	}
+
+	PX_FORCE_INLINE void clearState()
+	{	// Resets the partition bitmask (solverProgress) of every body and every articulation.
+		for(PxU32 a = 0; a < mNumBodies; ++a)
+			mBodies[a].solverProgress = 0;
+
+		for(PxU32 a = 0; a < mNumArticulations; ++a)
+			(reinterpret_cast<FsData*>(mFsDatas[a]))->solverProgress = 0;
+	}
+
+	PX_FORCE_INLINE void reserveSpaceForStaticConstraints(Ps::Array<PxU32>& numConstraintsPerPartition)
+	{	// For every body and then every articulation: resets solverProgress and adds one to each of the maxSolverFrictionProgress partition counters that start at index maxSolverNormalProgress, growing the array when needed.
+		for(PxU32 a = 0; a < mNumBodies; ++a)
+		{
+			mBodies[a].solverProgress = 0;
+
+			PxU32 requiredSize = PxU32(mBodies[a].maxSolverNormalProgress + mBodies[a].maxSolverFrictionProgress);
+			if(requiredSize > numConstraintsPerPartition.size())
+			{
+				numConstraintsPerPartition.resize(requiredSize);	// presumably new entries are zero-initialised by resize - confirm against Ps::Array
+			}
+
+			for(PxU32 b = 0; b < mBodies[a].maxSolverFrictionProgress; ++b)
+			{
+				numConstraintsPerPartition[mBodies[a].maxSolverNormalProgress + b]++;
+			}
+		}
+
+		for(PxU32 a = 0; a < mNumArticulations; ++a)
+		{
+			FsData* data = reinterpret_cast<FsData*>(mFsDatas[a]);
+			data->solverProgress = 0;
+
+			PxU32 requiredSize = PxU32(data->maxSolverNormalProgress + data->maxSolverFrictionProgress);
+			if(requiredSize > numConstraintsPerPartition.size())
+			{
+				numConstraintsPerPartition.resize(requiredSize);
+			}
+
+			for(PxU32 b = 0; b < data->maxSolverFrictionProgress; ++b)
+			{
+				numConstraintsPerPartition[data->maxSolverNormalProgress + b]++;
+			}
+		}
+	}
+
+};
+
+template <typename Classification>
+void classifyConstraintDesc(const PxSolverConstraintDesc* PX_RESTRICT descs, const PxU32 numConstraints, Classification& classification, 
+							Ps::Array<PxU32>& numConstraintsPerPartition, PxSolverConstraintDesc* PX_RESTRICT eaTempConstraintDescriptors)
+{	// Counting pass: assigns every constraint between two active bodies to the lowest partition neither body uses yet, counts constraints per partition, and counts single-active-body constraints per body. No descriptors are reordered here.
+	const PxSolverConstraintDesc* _desc = descs;
+	const PxU32 numConstraintsMin1 = numConstraints - 1;	// wraps to 0xffffffff when numConstraints == 0, but then the loop below does not execute
+
+	PxU32 numUnpartitionedConstraints = 0;
+
+	numConstraintsPerPartition.forceSize_Unsafe(32);	// NOTE(review): no reserve is visible here; presumably the caller guarantees capacity >= 32 - confirm
+
+	PxMemZero(numConstraintsPerPartition.begin(), sizeof(PxU32) * 32);
+
+	for(PxU32 i = 0; i < numConstraints; ++i, _desc++)
+	{
+		const PxU32 prefetchOffset = PxMin(numConstraintsMin1 - i, 4u);	// look up to 4 descriptors ahead without indexing past the last one
+		Ps::prefetchLine(_desc[prefetchOffset].constraint);
+		Ps::prefetchLine(_desc[prefetchOffset].bodyA);
+		Ps::prefetchLine(_desc[prefetchOffset].bodyB);
+		Ps::prefetchLine(_desc + 8);
+
+		uintptr_t indexA, indexB;
+		bool activeA, activeB;
+
+		const bool notContainsStatic = classification.classifyConstraint(*_desc, indexA, indexB, activeA, activeB);
+		
+		if(notContainsStatic)
+		{
+			PxU32 partitionsA=_desc->bodyA->solverProgress;	// bitmask of partitions already occupied by each body
+			PxU32 partitionsB=_desc->bodyB->solverProgress;
+			
+			PxU32 availablePartition;
+			{
+				const PxU32 combinedMask = (~partitionsA & ~partitionsB);	// bits that are free in both masks
+				availablePartition = combinedMask == 0 ? MAX_NUM_PARTITIONS : Ps::lowestSetBit(combinedMask);
+				if(availablePartition == MAX_NUM_PARTITIONS)
+				{
+					eaTempConstraintDescriptors[numUnpartitionedConstraints++] = *_desc;	// all 32 partitions clash: defer to the overflow passes below
+					continue;
+				}
+
+				const PxU32 partitionBit = getBit(availablePartition);
+				partitionsA |= partitionBit;
+				partitionsB |= partitionBit;
+			}
+
+			_desc->bodyA->solverProgress = partitionsA;
+			_desc->bodyB->solverProgress = partitionsB;
+			numConstraintsPerPartition[availablePartition]++;
+			availablePartition++;	// from here on: one past the assigned partition index
+			_desc->bodyA->maxSolverNormalProgress = PxMax(_desc->bodyA->maxSolverNormalProgress, PxU16(availablePartition));
+			_desc->bodyB->maxSolverNormalProgress = PxMax(_desc->bodyB->maxSolverNormalProgress, PxU16(availablePartition));
+
+			
+		}
+		else
+		{
+			//Just count the number of static constraints and store in maxSolverFrictionProgress...
+			if(activeA)
+				_desc->bodyA->maxSolverFrictionProgress++;
+			else if(activeB)
+				_desc->bodyB->maxSolverFrictionProgress++;
+		}
+	}
+
+	PxU32 partitionStartIndex = 0;
+
+	while(numUnpartitionedConstraints > 0)	// each iteration opens 32 further partitions for the constraints that did not fit so far
+	{
+		classification.clearState();
+
+		partitionStartIndex += 32;
+		//Keep partitioning the un-partitioned constraints and blat the whole thing to 0!
+		numConstraintsPerPartition.resize(32 + numConstraintsPerPartition.size());
+		PxMemZero(numConstraintsPerPartition.begin() + partitionStartIndex, sizeof(PxU32) * 32);
+
+		PxU32 newNumUnpartitionedConstraints = 0;
+
+		for(PxU32 i = 0; i < numUnpartitionedConstraints; ++i)
+		{
+			const PxSolverConstraintDesc& desc = eaTempConstraintDescriptors[i];
+			
+			PxU32 partitionsA=desc.bodyA->solverProgress;
+			PxU32 partitionsB=desc.bodyB->solverProgress;
+				
+			PxU32 availablePartition;
+			{
+				const PxU32 combinedMask = (~partitionsA & ~partitionsB);
+				availablePartition = combinedMask == 0 ? MAX_NUM_PARTITIONS : Ps::lowestSetBit(combinedMask);
+				if(availablePartition == MAX_NUM_PARTITIONS)
+				{
+					//Need to shuffle around unpartitioned constraints...
+					eaTempConstraintDescriptors[newNumUnpartitionedConstraints++] = desc;	// in-place compaction: the write index never exceeds the read index i
+					continue;
+				}
+
+				const PxU32 partitionBit = getBit(availablePartition);
+				partitionsA |= partitionBit;
+				partitionsB |= partitionBit;
+			}
+
+			desc.bodyA->solverProgress = partitionsA;
+			desc.bodyB->solverProgress = partitionsB;
+			availablePartition += partitionStartIndex;	// convert the per-pass bit index into a global partition index
+			numConstraintsPerPartition[availablePartition]++;
+			availablePartition++;
+			desc.bodyA->maxSolverNormalProgress = PxMax(desc.bodyA->maxSolverNormalProgress, PxU16(availablePartition));
+			desc.bodyB->maxSolverNormalProgress = PxMax(desc.bodyB->maxSolverNormalProgress, PxU16(availablePartition));
+		}
+
+		numUnpartitionedConstraints = newNumUnpartitionedConstraints;
+	}
+
+	classification.reserveSpaceForStaticConstraints(numConstraintsPerPartition);	// adds the per-body static-constraint slots to the counts and resets solverProgress
+
+}
+
+template <typename Classification>
+void writeConstraintDesc(const PxSolverConstraintDesc* PX_RESTRICT descs, const PxU32 numConstraints, Classification& classification,
+						 Ps::Array<PxU32>& accumulatedConstraintsPerPartition, PxSolverConstraintDesc* eaTempConstraintDescriptors,
+							PxSolverConstraintDesc* PX_RESTRICT eaOrderedConstraintDesc)
+{	// Writing pass: repeats the partition assignment of classifyConstraintDesc and copies each descriptor into eaOrderedConstraintDesc at its partition's write cursor, accumulatedConstraintsPerPartition[p], which is post-incremented (presumably it holds each partition's start offset on entry - confirm against the caller).
+	PX_UNUSED(eaTempConstraintDescriptors);	// NOTE(review): the parameter is in fact used below as overflow scratch; this PX_UNUSED looks like a leftover
+	const PxSolverConstraintDesc* _desc = descs;
+	const PxU32 numConstraintsMin1 = numConstraints - 1;	// wraps to 0xffffffff when numConstraints == 0, but then the loop below does not execute
+
+	PxU32 numUnpartitionedConstraints = 0;
+
+	for(PxU32 i = 0; i < numConstraints; ++i, _desc++)
+	{
+		const PxU32 prefetchOffset = PxMin(numConstraintsMin1 - i, 4u);	// look up to 4 descriptors ahead without indexing past the last one
+		Ps::prefetchLine(_desc[prefetchOffset].constraint);
+		Ps::prefetchLine(_desc[prefetchOffset].bodyA);
+		Ps::prefetchLine(_desc[prefetchOffset].bodyB);
+		Ps::prefetchLine(_desc + 8);
+
+		uintptr_t indexA, indexB;
+		bool activeA, activeB;
+		const bool notContainsStatic = classification.classifyConstraint(*_desc, indexA, indexB, activeA, activeB);
+
+		if(notContainsStatic)
+		{
+			PxU32 partitionsA=_desc->bodyA->solverProgress;	// bitmask of partitions already occupied by each body
+			PxU32 partitionsB=_desc->bodyB->solverProgress;
+			
+			PxU32 availablePartition;
+			{
+				const PxU32 combinedMask = (~partitionsA & ~partitionsB);	// bits that are free in both masks
+				availablePartition = combinedMask == 0 ? MAX_NUM_PARTITIONS : Ps::lowestSetBit(combinedMask);
+				if(availablePartition == MAX_NUM_PARTITIONS)
+				{
+					eaTempConstraintDescriptors[numUnpartitionedConstraints++] = *_desc;	// all 32 partitions clash: defer to the overflow passes below
+					continue;
+				}
+
+				const PxU32 partitionBit = getBit(availablePartition);
+
+				partitionsA |= partitionBit;
+				partitionsB |= partitionBit;
+			}
+
+			_desc->bodyA->solverProgress = partitionsA;
+			_desc->bodyB->solverProgress = partitionsB;
+
+			eaOrderedConstraintDesc[accumulatedConstraintsPerPartition[availablePartition]++] = *_desc;
+		}
+		else
+		{
+			//Static constraint: its partition is the active body's maxSolverNormalProgress plus that body's running maxSolverFrictionProgress counter (post-incremented). If neither side is active, index stays 0.
+			PxU32 index = 0;
+			if(activeA)
+				index = PxU32(_desc->bodyA->maxSolverNormalProgress + _desc->bodyA->maxSolverFrictionProgress++);
+			else if(activeB)
+				index = PxU32(_desc->bodyB->maxSolverNormalProgress + _desc->bodyB->maxSolverFrictionProgress++);
+
+			eaOrderedConstraintDesc[accumulatedConstraintsPerPartition[index]++] = *_desc;
+		}
+	}
+
+	PxU32 partitionStartIndex = 0;
+
+	while(numUnpartitionedConstraints > 0)	// each iteration handles the next 32 partitions for the constraints that did not fit so far
+	{
+		classification.clearState();
+
+		partitionStartIndex += 32;	
+		PxU32 newNumUnpartitionedConstraints = 0;
+
+		for(PxU32 i = 0; i < numUnpartitionedConstraints; ++i)
+		{
+			const PxSolverConstraintDesc& desc = eaTempConstraintDescriptors[i];
+			
+			PxU32 partitionsA=desc.bodyA->solverProgress;
+			PxU32 partitionsB=desc.bodyB->solverProgress;
+				
+			PxU32 availablePartition;
+			{
+				const PxU32 combinedMask = (~partitionsA & ~partitionsB);
+				availablePartition = combinedMask == 0 ? MAX_NUM_PARTITIONS : Ps::lowestSetBit(combinedMask);
+				if(availablePartition == MAX_NUM_PARTITIONS)
+				{
+					//Need to shuffle around unpartitioned constraints...
+					eaTempConstraintDescriptors[newNumUnpartitionedConstraints++] = desc;	// in-place compaction: the write index never exceeds the read index i
+					continue;
+				}
+
+				const PxU32 partitionBit = getBit(availablePartition);
+
+				partitionsA |= partitionBit;
+				partitionsB |= partitionBit;
+			}
+
+			desc.bodyA->solverProgress = partitionsA;
+			desc.bodyB->solverProgress = partitionsB;
+			availablePartition += partitionStartIndex;	// convert the per-pass bit index into a global partition index
+			eaOrderedConstraintDesc[accumulatedConstraintsPerPartition[availablePartition]++] = desc;
+		}
+
+		numUnpartitionedConstraints = newNumUnpartitionedConstraints;
+	}
+}
+
+}
+
+#define PX_NORMALIZE_PARTITIONS 1
+
+#if PX_NORMALIZE_PARTITIONS
+
+template<typename Classification>
+PxU32 normalizePartitions(Ps::Array<PxU32>& accumulatedConstraintsPerPartition, PxSolverConstraintDesc* PX_RESTRICT eaOrderedConstraintDescriptors, 
+	const PxU32 numConstraintDescriptors, Ps::Array<PxU32>& bitField, const Classification& classification, const PxU32 numBodies, const PxU32 numArticulations)
+{	// Balances partition sizes: working from the last partition backwards, each partition smaller than the average pulls in constraints from earlier partitions whose bodies it does not already touch. Empty partitions are then removed; the return value is the remaining partition count. Here accumulatedConstraintsPerPartition[p] is used as the END offset of partition p.
+	PxU32 numPartitions = 0;
+	
+	PxU32 prevAccumulation = 0;
+	for(; numPartitions < accumulatedConstraintsPerPartition.size() && accumulatedConstraintsPerPartition[numPartitions] > prevAccumulation; 
+		prevAccumulation = accumulatedConstraintsPerPartition[numPartitions++]);	// empty body: counts the leading partitions whose end offsets strictly increase, i.e. stops at the first empty partition
+
+	PxU32 targetSize = (numPartitions == 0 ? 0 : (numConstraintDescriptors)/numPartitions);	// average partition size (integer division)
+
+	bitField.reserve((numBodies + numArticulations + 31)/32);	// one bit per body/articulation index, rounded up to whole 32-bit words
+	bitField.forceSize_Unsafe((numBodies + numArticulations + 31)/32);
+
+	for(PxU32 i = numPartitions; i > 0; i--)
+	{
+		PxU32 partitionIndex = i-1;
+
+		//Build the partition mask...
+
+		PxU32 startIndex = partitionIndex == 0 ? 0 : accumulatedConstraintsPerPartition[partitionIndex-1];
+		PxU32 endIndex = accumulatedConstraintsPerPartition[partitionIndex];
+
+		//If its greater than target size, there's nothing that will be pulled into it from earlier partitions
+		if((endIndex - startIndex) >= targetSize)
+			continue;
+
+
+		PxMemZero(bitField.begin(), sizeof(PxU32)*bitField.size());
+
+		for(PxU32 a = startIndex; a < endIndex; ++a)	// mark every active body already used by this partition
+		{
+			PxSolverConstraintDesc& desc = eaOrderedConstraintDescriptors[a];
+
+			uintptr_t indexA, indexB;
+			bool activeA, activeB;
+
+			classification.classifyConstraint(desc, indexA, indexB, activeA, activeB);
+
+			if(activeA)
+				bitField[PxU32(indexA)/32] |= getBit(indexA & 31);
+			if(activeB)
+				bitField[PxU32(indexB)/32] |= getBit(indexB & 31);
+		}
+
+		bool bTerm = false;
+		for(PxU32 a = partitionIndex; a > 0 && !bTerm; --a)	// walk the earlier partitions, nearest first; not entered when partitionIndex == 0
+		{
+			PxU32 pInd = a-1;
+
+			PxU32 si = pInd == 0 ? 0 : accumulatedConstraintsPerPartition[pInd-1];
+			PxU32 ei = accumulatedConstraintsPerPartition[pInd];
+
+			for(PxU32 b = ei; b > si && !bTerm; --b)	// scan the donor partition from its last constraint to its first
+			{
+				PxU32 ind = b-1;
+				PxSolverConstraintDesc& desc = eaOrderedConstraintDescriptors[ind];
+
+				uintptr_t indexA, indexB;
+				bool activeA, activeB;
+
+				classification.classifyConstraint(desc, indexA, indexB, activeA, activeB);
+
+				bool canAdd = true;	// movable only if none of its active bodies is already marked in bitField
+
+				if(activeA && (bitField[PxU32(indexA)/32] & (getBit(indexA & 31))))
+					canAdd = false;
+				if(activeB && (bitField[PxU32(indexB)/32] & (getBit(indexB & 31))))
+					canAdd = false;
+
+				if(canAdd)
+				{
+					PxSolverConstraintDesc tmp = eaOrderedConstraintDescriptors[ind];
+
+					if(activeA)
+						bitField[PxU32(indexA)/32] |= (getBit(indexA & 31));
+					if(activeB)
+						bitField[PxU32(indexB)/32] |= (getBit(indexB & 31));
+
+					PxU32 index = ind;	// current "hole" left by the constraint being moved
+					for(PxU32 c = pInd; c < partitionIndex; ++c)	// shift the hole up to the target partition: each partition in between ends one slot earlier and its last element fills the hole
+					{
+						PxU32 newIndex = --accumulatedConstraintsPerPartition[c];
+						if(index != newIndex)
+							eaOrderedConstraintDescriptors[index] = eaOrderedConstraintDescriptors[newIndex];	
+						index = newIndex;
+					}
+
+					if(index != ind)
+						eaOrderedConstraintDescriptors[index] = tmp;	// the final hole is now the first slot of the target partition
+
+					if((accumulatedConstraintsPerPartition[partitionIndex] - accumulatedConstraintsPerPartition[partitionIndex-1]) >= targetSize)	// partitionIndex >= 1 here because the enclosing loop requires a > 0
+					{
+						bTerm = true;
+						break;
+					}
+				}
+			}
+		}
+	}
+		
+	PxU32 partitionCount = 0;
+	PxU32 lastPartitionCount = 0;
+	for (PxU32 a = 0; a < numPartitions; ++a)	// compact the end offsets, dropping partitions that ended up empty (same end offset as the previously kept one)
+	{
+		const PxU32 constraintCount = accumulatedConstraintsPerPartition[a];
+		accumulatedConstraintsPerPartition[partitionCount] = constraintCount;
+		if (constraintCount != lastPartitionCount)
+		{
+			lastPartitionCount = constraintCount;
+			partitionCount++;
+		}
+	}
+
+	accumulatedConstraintsPerPartition.forceSize_Unsafe(partitionCount);
+
+	return partitionCount;
+}
+
+#endif
+
+PxU32 partitionContactConstraints(ConstraintPartitionArgs& args) 
+{
+	PxU32 maxPartition = 0;
+	//Unpack the input data.
+	const PxU32 numBodies=args.mNumBodies;
+	PxSolverBody* PX_RESTRICT eaAtoms=args.mBodies;
+	const PxU32	numArticulations=args.mNumArticulationPtrs;
+	
+	const PxU32 numConstraintDescriptors=args.mNumContactConstraintDescriptors;
+
+	PxSolverConstraintDesc* PX_RESTRICT eaConstraintDescriptors=args.mContactConstraintDescriptors;
+	PxSolverConstraintDesc* PX_RESTRICT eaOrderedConstraintDescriptors=args.mOrderedContactConstraintDescriptors;
+	PxSolverConstraintDesc* PX_RESTRICT eaTempConstraintDescriptors=args.mTempContactConstraintDescriptors;
+
+	Ps::Array<PxU32>& constraintsPerPartition = *args.mConstraintsPerPartition;
+	constraintsPerPartition.forceSize_Unsafe(0);
+
+	for(PxU32 a = 0; a < numBodies; ++a)
+	{
+		PxSolverBody& body = args.mBodies[a];
+		Ps::prefetchLine(&args.mBodies[a], 256);
+		body.solverProgress = 0;
+		//We re-use maxSolverFrictionProgress and maxSolverNormalProgress to record the
+		//maximum partition used by dynamic constraints and the number of static constraints affecting
+		//a body. We use this to make partitioning much cheaper and be able to support 
+		body.maxSolverFrictionProgress = 0;
+		body.maxSolverNormalProgress = 0;
+	}
+
+	PxU32 numOrderedConstraints=0;	
+
+	PxU32 numSelfConstraintBlocks=0;
+
+	if(numArticulations == 0)
+	{
+		RigidBodyClassification classification(eaAtoms, numBodies);
+		classifyConstraintDesc(eaConstraintDescriptors, numConstraintDescriptors, classification, constraintsPerPartition,
+			eaTempConstraintDescriptors);
+		
+		PxU32 accumulation = 0;
+		for(PxU32 a = 0; a < constraintsPerPartition.size(); ++a)
+		{
+			PxU32 count = constraintsPerPartition[a];
+			constraintsPerPartition[a] = accumulation;
+			accumulation += count;
+		}
+
+		for(PxU32 a = 0; a < numBodies; ++a)
+		{
+			PxSolverBody& body = args.mBodies[a];
+			Ps::prefetchLine(&args.mBodies[a], 256);
+			body.solverProgress = 0;
+			//Keep the dynamic constraint count but bump the static constraint count back to 0.
+			//This allows us to place the static constraints in the appropriate place when we see them
+			//because we know the maximum index for the dynamic constraints...
+			body.maxSolverFrictionProgress = 0;
+		}
+
+		writeConstraintDesc(eaConstraintDescriptors, numConstraintDescriptors, classification, constraintsPerPartition, 
+			eaTempConstraintDescriptors, eaOrderedConstraintDescriptors);
+
+		numOrderedConstraints = numConstraintDescriptors;
+
+		if(!args.enhancedDeterminism)
+			maxPartition = normalizePartitions(constraintsPerPartition, eaOrderedConstraintDescriptors, numConstraintDescriptors, *args.mBitField,
+				classification, numBodies, 0);
+
+	}
+	else
+	{
+		
+		const ArticulationSolverDesc* articulationDescs=args.mArticulationPtrs;
+		PX_ALLOCA(_eaFsData, uintptr_t, numArticulations);
+		uintptr_t* eaFsDatas = _eaFsData;
+		for(PxU32 i=0;i<numArticulations;i++)
+		{
+			FsData* data = articulationDescs[i].fsData;
+			eaFsDatas[i]=uintptr_t(data);
+			data->solverProgress = 0;
+			data->maxSolverFrictionProgress = 0;
+			data->maxSolverNormalProgress = 0;
+		}
+		ExtendedRigidBodyClassification classification(eaAtoms, numBodies, eaFsDatas, numArticulations);
+
+		classifyConstraintDesc(eaConstraintDescriptors, numConstraintDescriptors, classification, 
+			constraintsPerPartition, eaTempConstraintDescriptors);
+
+		PxU32 accumulation = 0;
+		for(PxU32 a = 0; a < constraintsPerPartition.size(); ++a)
+		{
+			PxU32 count = constraintsPerPartition[a];
+			constraintsPerPartition[a] = accumulation;
+			accumulation += count;
+		}
+
+		for(PxU32 a = 0; a < numBodies; ++a)
+		{
+			PxSolverBody& body = args.mBodies[a];
+			Ps::prefetchLine(&args.mBodies[a], 256);
+			body.solverProgress = 0;
+			//Keep the dynamic constraint count but bump the static constraint count back to 0.
+			//This allows us to place the static constraints in the appropriate place when we see them
+			//because we know the maximum index for the dynamic constraints...
+			body.maxSolverFrictionProgress = 0;
+		}
+
+		for(PxU32 a = 0; a < numArticulations; ++a)
+		{
+			FsData* data = reinterpret_cast<FsData*>(eaFsDatas[a]);
+			data->solverProgress = 0;
+			data->maxSolverFrictionProgress = 0;
+		}
+
+		writeConstraintDesc(eaConstraintDescriptors, numConstraintDescriptors, classification, constraintsPerPartition, 
+			eaTempConstraintDescriptors, eaOrderedConstraintDescriptors);
+
+		numOrderedConstraints = numConstraintDescriptors;
+
+		if (!args.enhancedDeterminism)
+			maxPartition = normalizePartitions(constraintsPerPartition, eaOrderedConstraintDescriptors,  
+				numConstraintDescriptors, *args.mBitField, classification, numBodies, numArticulations);
+
+	}
+
+
+
+	const PxU32 numConstraintsDifferentBodies=numOrderedConstraints;
+
+	PX_ASSERT(numConstraintsDifferentBodies == numConstraintDescriptors);
+
+	//Now handle the articulated self-constraints.
+	PxU32 totalConstraintCount = numConstraintsDifferentBodies;	
+
+	args.mNumSelfConstraintBlocks=numSelfConstraintBlocks;
+
+	args.mNumDifferentBodyConstraints=numConstraintsDifferentBodies;
+	args.mNumSelfConstraints=totalConstraintCount-numConstraintsDifferentBodies;
+
+	if (args.enhancedDeterminism)
+	{
+		PxU32 prevPartitionSize = 0;
+		maxPartition = 0;
+		for (PxU32 a = 0; a < constraintsPerPartition.size(); ++a, maxPartition++)
+		{
+			if (constraintsPerPartition[a] == prevPartitionSize)
+				break;
+			prevPartitionSize = constraintsPerPartition[a];
+		}
+	}
+
+	return maxPartition;
+}
+
+}
+
+}
diff --git a/PhysX_3.4/Source/LowLevelDynamics/src/DyConstraintPartition.h b/PhysX_3.4/Source/LowLevelDynamics/src/DyConstraintPartition.h
new file mode 100644
index 00000000..ba4c8c29
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelDynamics/src/DyConstraintPartition.h
@@ -0,0 +1,79 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+#ifndef DY_CONSTRAINTPARTITION_H
+#define DY_CONSTRAINTPARTITION_H
+
+#include "DyDynamics.h"
+
+
+
+namespace physx
+{
+
+namespace Dy
+{
+// Argument block for partitionContactConstraints(): the caller fills in the input members and the
+// buffers/arrays pointed to by the output members; the function writes the results back into this struct.
+struct ConstraintPartitionArgs
+{
+	enum
+	{
+		eMAX_NUM_BODIES = 8192
+	};   
+
+	//Input
+	// Solver bodies referenced by the constraints; partitionContactConstraints() resets their
+	// solverProgress / maxSolverFrictionProgress / maxSolverNormalProgress counters.
+	PxSolverBody*							mBodies;
+	PxU32									mNumBodies;
+	// Articulations referenced by the constraints; when the count is 0 the rigid-body-only
+	// classification is used.
+	ArticulationSolverDesc*				mArticulationPtrs;
+	PxU32									mNumArticulationPtrs;
+	// Constraint descriptors to partition.
+	PxSolverConstraintDesc*				mContactConstraintDescriptors;
+	PxU32									mNumContactConstraintDescriptors;
+	//output
+	// Caller-provided buffer that receives the descriptors grouped by partition.
+	PxSolverConstraintDesc*				mOrderedContactConstraintDescriptors;
+	// Caller-provided scratch buffer handed to the classification and write passes.
+	PxSolverConstraintDesc*				mTempContactConstraintDescriptors;
+	// Set to 0 by partitionContactConstraints().
+	PxU32									mNumSelfConstraintBlocks;
+	// Set to mNumContactConstraintDescriptors by partitionContactConstraints().
+	PxU32									mNumDifferentBodyConstraints;
+	// Set to 0 by partitionContactConstraints().
+	PxU32									mNumSelfConstraints;
+	// Cleared on entry, then filled with the accumulated constraint count of each partition.
+	Ps::Array<PxU32>*						mConstraintsPerPartition;
+	//Ps::Array<PxU32>*						mStartIndices;
+	// Scratch bit set (one bit per body/articulation) used while normalizing partition sizes.
+	Ps::Array<PxU32>*						mBitField;
+
+	// When true the partition size normalization pass is skipped.
+	bool									enhancedDeterminism;
+};
+
+// Partitions the contact constraints described by args and returns the number of partitions.
+PxU32 partitionContactConstraints(ConstraintPartitionArgs& args);
+
+} // namespace Dy
+
+} // namespace physx
+
+
+
+#endif // DY_CONSTRAINTPARTITION_H  
+
diff --git a/PhysX_3.4/Source/LowLevelDynamics/src/DyConstraintPrep.h b/PhysX_3.4/Source/LowLevelDynamics/src/DyConstraintPrep.h
new file mode 100644
index 00000000..e7202a78
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelDynamics/src/DyConstraintPrep.h
@@ -0,0 +1,92 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+#ifndef DY_CONSTRAINTSHADER_H
+#define DY_CONSTRAINTSHADER_H
+
+#include "DyConstraint.h"
+
+#include "DySolverConstraintDesc.h"
+#include "PsArray.h"
+
+namespace physx
+{
+
+class PxcConstraintBlockStream;
+class PxsConstraintBlockManager;
+struct PxSolverBody;
+struct PxSolverBodyData;
+struct PxSolverConstraintDesc;
+
+namespace Dy
+{
+
+	// Capacity of the per-row scratch arrays that ConstraintHelper::setupSolverConstraint() places on the stack.
+	static const PxU32 MAX_CONSTRAINT_ROWS = 12;
+
+// Inputs for preparing one shader-driven constraint.
+// NOTE(review): the member meanings below are inferred from their names and types - confirm against the callers.
+struct SolverConstraintShaderPrepDesc
+{
+	const Constraint* constraint;		// presumably the low-level constraint being prepared
+	PxConstraintSolverPrep solverPrep;	// presumably the callback that generates the constraint rows
+	const void* constantBlock;			// presumably opaque data handed to solverPrep
+	PxU32 constantBlockByteSize;		// presumably the size of constantBlock in bytes
+};
+
+// Overload that takes the shader descriptors in addition to the prep descriptors.
+// NOTE(review): presumably prepares a batch of four constraints at once, as the name suggests - confirm.
+SolverConstraintPrepState::Enum setupSolverConstraint4
+	(SolverConstraintShaderPrepDesc* PX_RESTRICT constraintShaderDescs,
+	PxSolverConstraintPrepDesc* PX_RESTRICT constraintDescs,
+		const PxReal dt, const PxReal recipdt, PxU32& totalRows,
+		 PxConstraintAllocator& allocator);
+
+// Overload that works from the prep descriptors only and takes an explicit row limit.
+SolverConstraintPrepState::Enum setupSolverConstraint4
+	(PxSolverConstraintPrepDesc* PX_RESTRICT constraintDescs,
+	const PxReal dt, const PxReal recipdt, PxU32& totalRows,
+	PxConstraintAllocator& allocator, PxU32 maxRows);
+
+// Single-constraint counterpart taking a shader descriptor and a prep descriptor.
+PxU32 SetupSolverConstraint(SolverConstraintShaderPrepDesc& shaderDesc,
+							PxSolverConstraintPrepDesc& prepDesc,
+							   PxConstraintAllocator& allocator,
+							   PxReal dt, PxReal invdt);
+
+
+class ConstraintHelper
+{
+public:
+
+	// Builds the solver constraint for prepDesc in memory reserved from allocator.
+	// Returns 0 when prepDesc has no rows or when the constraint memory could not be reserved.
+	static PxU32 setupSolverConstraint(
+		PxSolverConstraintPrepDesc& prepDesc,
+		PxConstraintAllocator& allocator,
+		PxReal dt, PxReal invdt);
+};
+
+} // namespace Dy
+
+} // namespace physx
+
+#endif //DY_CONSTRAINTSHADER_H
diff --git a/PhysX_3.4/Source/LowLevelDynamics/src/DyConstraintSetup.cpp b/PhysX_3.4/Source/LowLevelDynamics/src/DyConstraintSetup.cpp
new file mode 100644
index 00000000..c5777c12
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelDynamics/src/DyConstraintSetup.cpp
@@ -0,0 +1,594 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+#include "foundation/PxMemory.h"
+#include "DyConstraintPrep.h"
+#include "PxsRigidBody.h"
+#include "DySolverConstraint1D.h"
+#include "PsSort.h"
+#include "DySolverConstraintDesc.h"
+#include "PxcConstraintBlockStream.h"
+#include "DyArticulationContactPrep.h"
+#include "PsFoundation.h"
+
+namespace physx
+{
+namespace Dy
+{
+	// dsequeira:
+	//
+	// we can choose any linear combination of equality constraints and get the same solution
+	// Hence we can orthogonalize the constraints using the inner product given by the
+	// inverse mass matrix, so that when we use PGS, solving a constraint row for a joint 
+	// doesn't disturb the solution of prior rows.
+	//
+	// We also eliminate the equality constraints from the hard inequality constraints - 
+	// (essentially projecting the direction corresponding to the lagrange multiplier 
+	// onto the equality constraint subspace) but 'til I've verified this generates 
+	// exactly the same KKT/complementarity conditions, status is 'experimental'. 	
+	//
+	// since for equality constraints the resulting rows have the property that applying
+	// an impulse along one row doesn't alter the projected velocity along another row, 
+	// all equality constraints (plus one inequality constraint) can be processed in parallel
+	// using SIMD
+	//
+	// Eliminating the inequality constraints from each other would require a solver change
+	// and not give us any more parallelism, although we might get better convergence.
+
+namespace
+{
+	// Short local names for the Vec3V <-> Vec4V conversion routines.
+
+	// Vec4V -> Vec3V through Vec3V_From_Vec4V.
+	PX_FORCE_INLINE Vec3V V3FromV4(Vec4V x)
+	{
+		return Vec3V_From_Vec4V(x);
+	}
+
+	// Vec4V -> Vec3V through Vec3V_From_Vec4V_WUndefined (presumably leaves the W lane unspecified, as the name says).
+	PX_FORCE_INLINE Vec3V V3FromV4Unsafe(Vec4V x)
+	{
+		return Vec3V_From_Vec4V_WUndefined(x);
+	}
+
+	// Vec3V -> Vec4V through Vec4V_From_Vec3V.
+	PX_FORCE_INLINE Vec4V V4FromV3(Vec3V x)
+	{
+		return Vec4V_From_Vec3V(x);
+	}
+	//PX_FORCE_INLINE Vec4V V4ClearW(Vec4V x)			{ return V4SetW(x, FZero()); }
+
+// Mass terms of the two bodies of a constraint, loaded once into SIMD scalars:
+// the inverse masses already multiplied by the constraint's linear scales, and the
+// constraint's angular (inertia) scales.
+struct MassProps
+{
+	FloatV invMass0;			// bd0.invMass * ims.linear0
+	FloatV invMass1;			// bd1.invMass * ims.linear1
+	FloatV invInertiaScale0;	// ims.angular0
+	FloatV invInertiaScale1;	// ims.angular1
+
+	PX_FORCE_INLINE MassProps(const PxSolverBodyData& bd0, const PxSolverBodyData& bd1, const PxConstraintInvMassScale& ims)
+		: invMass0(FLoad(bd0.invMass * ims.linear0)),
+		  invMass1(FLoad(bd1.invMass * ims.linear1)),
+		  invInertiaScale0(FLoad(ims.angular0)),
+		  invInertiaScale1(FLoad(ims.angular1))
+	{
+	}
+};
+
+
+// Inner product of two constraint rows under the inverse-mass metric:
+//
+//   sum over x,y,z of   lin0_a * lin0_b * invMass0          + ang0_a * ang0_b * invInertiaScale0
+//                     + lin1_a * lin1_b * invMass1          + ang1_a * ang1_b * invInertiaScale1
+//
+// row0 / row1                 : supply the linear parts (linear0, linear1).
+// row*AngSqrtInvInertia0 / 1  : the rows' angular parts for body 0 / body 1, already multiplied by the
+//                               bodies' sqrtInvInertia; only x,y,z are used.
+// m                           : mass terms of the two bodies.
+//
+// Each body's angular product is scaled by that body's own inertia scale only, which is the same
+// metric orthogonalize() uses for the squared norm of a row. The result is unchanged whenever
+// invInertiaScale1 is 1.
+PX_FORCE_INLINE PxReal innerProduct(const Px1DConstraint& row0, Px1DConstraint& row1, 
+								 PxVec4& row0AngSqrtInvInertia0, PxVec4& row0AngSqrtInvInertia1, 
+								 PxVec4& row1AngSqrtInvInertia0, PxVec4& row1AngSqrtInvInertia1, const MassProps& m)
+{
+	const Vec3V l0 = V3Mul(V3Scale(V3LoadA(row0.linear0), m.invMass0), V3LoadA(row1.linear0));
+	const Vec3V l1 = V3Mul(V3Scale(V3LoadA(row0.linear1), m.invMass1), V3LoadA(row1.linear1));
+	Vec4V r0ang0 = V4LoadA(&row0AngSqrtInvInertia0.x);
+	Vec4V r1ang0 = V4LoadA(&row1AngSqrtInvInertia0.x);
+	Vec4V r0ang1 = V4LoadA(&row0AngSqrtInvInertia1.x);
+	Vec4V r1ang1 = V4LoadA(&row1AngSqrtInvInertia1.x);
+
+	// body 0: angular product * invInertiaScale0 + linear term
+	const Vec3V i0 = V3ScaleAdd(V3Mul(Vec3V_From_Vec4V(r0ang0), Vec3V_From_Vec4V(r1ang0)), m.invInertiaScale0, l0);
+	// body 1: only its own angular product is multiplied by invInertiaScale1; the body 0 terms gathered in i0
+	// are added unscaled
+	const Vec3V i1 = V3Add(V3ScaleAdd(V3Mul(Vec3V_From_Vec4V(r0ang1), Vec3V_From_Vec4V(r1ang1)), m.invInertiaScale1, i0), l1);
+	PxF32 f;
+	FStore(V3SumElems(i1), &f);
+	return f;
+}
+
+
+// Rotation about one coordinate axis, given the sine (s) and cosine (c) of half the rotation angle.
+// Starts from the quaternion (0,0,0,c) and overwrites the component selected by 'axis' with s
+// (the components are addressed as consecutive PxReal values of the PxQuat).
+PX_FORCE_INLINE PxQuat indexedRotation(PxU32 axis, PxReal s, PxReal c)
+{
+	PxQuat rotation(0,0,0,c);
+	PxReal* const components = reinterpret_cast<PxReal*>(&rotation);
+	components[axis] = s;
+	return rotation;
+}
+
+// Iteratively builds a rotation q such that axes(q)^T * m * axes(q) has small off-diagonal terms.
+// Each iteration picks the largest off-diagonal element and composes q with a rotation about the
+// matching axis. At most MAX_ITERS iterations are run; the loop stops earlier when the chosen
+// off-diagonal element is exactly zero or negligible compared to the difference of the two
+// diagonal elements it couples.
+// Returns the accumulated (normalized) rotation.
+PxQuat diagonalize(const PxMat33& m)	// jacobi rotation using quaternions 
+{
+	const PxU32 MAX_ITERS = 5;
+
+	PxQuat q = PxQuat(PxIdentity);
+
+	PxMat33 d;
+	for(PxU32 i=0; i < MAX_ITERS;i++)
+	{
+		// m expressed in the frame given by the current rotation estimate
+		const PxMat33 axes(q);
+		d = axes.getTranspose() * m * axes;
+
+		const PxReal d0 = PxAbs(d[1][2]), d1 = PxAbs(d[0][2]), d2 = PxAbs(d[0][1]);
+		const PxU32 a = PxU32(d0 > d1 && d0 > d2 ? 0 : d1 > d2 ? 1 : 2);						// rotation axis index, from largest off-diagonal element
+
+		// the two remaining axes, in cyclic order after a
+		const PxU32 a1 = Ps::getNextIndex3(a), a2 = Ps::getNextIndex3(a1);											
+		// nothing left to eliminate, or the ratio computed below would exceed 2e6 in magnitude
+		if(d[a1][a2] == 0.0f || PxAbs(d[a1][a1]-d[a2][a2]) > 2e6f*PxAbs(2.0f*d[a1][a2]))
+			break;
+
+		const PxReal w = (d[a1][a1]-d[a2][a2]) / (2.0f*d[a1][a2]);					// cot(2 * phi), where phi is the rotation angle
+		const PxReal absw = PxAbs(w);
+
+		PxQuat r;
+		if(absw>1000)
+			r = indexedRotation(a, 1.0f/(4.0f*w), 1.f);									// h will be very close to 1, so use small angle approx instead
+		else
+		{
+  			const PxReal t = 1 / (absw + PxSqrt(w*w+1));								// absolute value of tan phi
+			const PxReal h = 1 / PxSqrt(t*t+1);										// absolute value of cos phi
+
+			PX_ASSERT(h!=1);													// |w|<1000 guarantees this with typical IEEE754 machine eps (approx 6e-8)
+			// half-angle sine (carrying the sign of w) and cosine derived from cos phi
+			r = indexedRotation(a, PxSqrt((1-h)/2) * PxSign(w), PxSqrt((1+h)/2));
+		}
+	
+		// compose with the estimate so far; renormalize to keep q a unit quaternion
+		q = (q*r).getNormalized();
+	}
+
+	return q;
+}
+
+
+// Replaces the three vectors by linear combinations of themselves, one per column of m:
+//   a0' = a0*col0.x + a1*col0.y + a2*col0.z
+//   a1' = a0*col1.x + a1*col1.y + a2*col1.z
+//   a2' = a0*col2.x + a1*col2.y + a2*col2.z
+// The vectors are read and written with unaligned loads/stores.
+PX_FORCE_INLINE void rescale(const Mat33V& m, PxVec3& a0, PxVec3& a1, PxVec3& a2)
+{
+	const Vec3V in0 = V3LoadU(a0);
+	const Vec3V in1 = V3LoadU(a1);
+	const Vec3V in2 = V3LoadU(a2);
+
+	// Each combination is accumulated starting from the a2 term, then a1, then a0
+	Vec3V out0 = V3Scale(in2, V3GetZ(m.col0));
+	out0 = V3ScaleAdd(in1, V3GetY(m.col0), out0);
+	out0 = V3ScaleAdd(in0, V3GetX(m.col0), out0);
+
+	Vec3V out1 = V3Scale(in2, V3GetZ(m.col1));
+	out1 = V3ScaleAdd(in1, V3GetY(m.col1), out1);
+	out1 = V3ScaleAdd(in0, V3GetX(m.col1), out1);
+
+	Vec3V out2 = V3Scale(in2, V3GetZ(m.col2));
+	out2 = V3ScaleAdd(in1, V3GetY(m.col2), out2);
+	out2 = V3ScaleAdd(in0, V3GetX(m.col2), out2);
+
+	V3StoreU(out0, a0);
+	V3StoreU(out1, a1);
+	V3StoreU(out2, a2);
+}
+
+// Four-lane variant of rescale(): a0, a1 and a2 each point at four consecutive floats, accessed with
+// aligned loads/stores, and every lane is replaced by the combination given by the columns of m:
+//   a0' = a0*col0.x + a1*col0.y + a2*col0.z   (col1 for a1', col2 for a2')
+PX_FORCE_INLINE void rescale4(const Mat33V& m, PxReal* a0, PxReal* a1, PxReal* a2)
+{
+	const Vec4V in0 = V4LoadA(a0);
+	const Vec4V in1 = V4LoadA(a1);
+	const Vec4V in2 = V4LoadA(a2);
+
+	// Each combination is accumulated starting from the a2 term, then a1, then a0
+	Vec4V out0 = V4Scale(in2, V3GetZ(m.col0));
+	out0 = V4ScaleAdd(in1, V3GetY(m.col0), out0);
+	out0 = V4ScaleAdd(in0, V3GetX(m.col0), out0);
+
+	Vec4V out1 = V4Scale(in2, V3GetZ(m.col1));
+	out1 = V4ScaleAdd(in1, V3GetY(m.col1), out1);
+	out1 = V4ScaleAdd(in0, V3GetX(m.col1), out1);
+
+	Vec4V out2 = V4Scale(in2, V3GetZ(m.col2));
+	out2 = V4ScaleAdd(in1, V3GetY(m.col2), out2);
+	out2 = V4ScaleAdd(in0, V3GetX(m.col2), out2);
+
+	V4StoreA(out0, a0);
+	V4StoreA(out1, a1);
+	V4StoreA(out2, a2);
+}
+
+
+// Generic form of rescale() for any T supporting T*scalar and T+T:
+// replaces (a0, a1, a2) by  a0*m(0,j) + a1*m(1,j) + a2*m(2,j)  for j = 0, 1, 2.
+// All three results are computed from the original values before any of them is written back.
+template<typename T>
+PX_FORCE_INLINE void rescale(const PxMat33& m, T& a0, T& a1, T& a2)
+{
+	const T mixed0 = a0*m(0,0) + a1 * m(1,0) + a2 * m(2,0);
+	const T mixed1 = a0*m(0,1) + a1 * m(1,1) + a2 * m(2,1);
+	const T mixed2 = a0*m(0,2) + a1 * m(1,2) + a2 * m(2,2);
+
+	a0 = mixed0;
+	a1 = mixed1;
+	a2 = mixed2;
+}
+
+// Diagonalizes a group of three constraint rows in place.
+//
+// Builds the symmetric 3x3 matrix of pairwise innerProduct() values of row[0..2], obtains the
+// rotation that diagonalizes it and recombines the three rows, together with their
+// angSqrtInvInertia0/1 entries, using the matrix of the negated quaternion.
+//
+// row, angSqrtInvInertia0, angSqrtInvInertia1 : must each provide three valid entries.
+void diagonalize(Px1DConstraint** row,
+				 PxVec4* angSqrtInvInertia0,
+				 PxVec4* angSqrtInvInertia1,
+				 const MassProps &m)
+{
+	Px1DConstraint& r0 = *row[0];
+	Px1DConstraint& r1 = *row[1];
+	Px1DConstraint& r2 = *row[2];
+
+	PxVec4& angA0 = angSqrtInvInertia0[0];
+	PxVec4& angA1 = angSqrtInvInertia0[1];
+	PxVec4& angA2 = angSqrtInvInertia0[2];
+	PxVec4& angB0 = angSqrtInvInertia1[0];
+	PxVec4& angB1 = angSqrtInvInertia1[1];
+	PxVec4& angB2 = angSqrtInvInertia1[2];
+
+	// Upper triangle of the symmetric inner-product matrix
+	const PxReal p00 = innerProduct(r0, r0, angA0, angB0, angA0, angB0, m);
+	const PxReal p01 = innerProduct(r0, r1, angA0, angB0, angA1, angB1, m);
+	const PxReal p02 = innerProduct(r0, r2, angA0, angB0, angA2, angB2, m);
+	const PxReal p11 = innerProduct(r1, r1, angA1, angB1, angA1, angB1, m);
+	const PxReal p12 = innerProduct(r1, r2, angA1, angB1, angA2, angB2, m);
+	const PxReal p22 = innerProduct(r2, r2, angA2, angB2, angA2, angB2, m);
+
+	const PxMat33 products(PxVec3(p00, p01, p02),
+						   PxVec3(p01, p11, p12),
+						   PxVec3(p02, p12, p22));
+
+	const PxQuat rotation = diagonalize(products);
+
+	const PxMat33 basis(-rotation);
+
+	const Mat33V basisV(V3LoadU(basis.column0), V3LoadU(basis.column1), V3LoadU(basis.column2));
+
+	//KS - linear0 is processed as four floats so that geometricError gets rescaled for free along with it
+	rescale4(basisV, &r0.linear0.x, &r1.linear0.x, &r2.linear0.x);
+	rescale(basisV, r0.linear1, r1.linear1, r2.linear1);
+	//KS - angular0 is processed as four floats so that velocityTarget gets rescaled for free along with it
+	rescale4(basisV, &r0.angular0.x, &r1.angular0.x, &r2.angular0.x);
+	rescale(basisV, r0.angular1, r1.angular1, r2.angular1);
+	rescale4(basisV, &angA0.x, &angA1.x, &angA2.x);
+	rescale4(basisV, &angB0.x, &angB1.x, &angB2.x);
+}
+
+// Orthogonalizes a group of constraint rows in place against the group's leading equality rows.
+//
+// Row i has its projection onto each of the previously processed equality rows j < min(i, eqRowCount)
+// removed, where the projection coefficient is taken under the inverse-mass metric. linear0/angular0 are
+// handled as four floats, so the value stored after each of them (geometric error / velocity target, see
+// the load comments below) is updated together with the vector.
+//
+// row                 : rowCount pointers to the rows of the group; the first eqRowCount are the equality rows.
+// angSqrtInvInertia0/1: per-row angular parts pre-multiplied by the bodies' sqrtInvInertia; updated in step.
+// rowCount            : number of rows in the group.
+// eqRowCount          : number of leading equality rows; at most 6 (size of the local arrays).
+// m                   : mass terms of the two bodies.
+void orthogonalize(Px1DConstraint** row,
+				   PxVec4* angSqrtInvInertia0,
+				   PxVec4* angSqrtInvInertia1,
+				   PxU32 rowCount,
+				   PxU32 eqRowCount,
+				   const MassProps &m)
+{
+	PX_ASSERT(eqRowCount<=6);
+
+	const FloatV zero = FZero();
+
+	// Per equality row: the row itself (lin*/ang*) and the row multiplied by the mass terms and divided by
+	// its squared norm (*m), which is what a later row is dotted with to get its projection coefficient.
+	Vec3V lin1m[6], ang1m[6], lin1[6], ang1[6];	
+	Vec4V lin0m[6], ang0m[6];			// must have 0 in the W-field
+	Vec4V lin0AndG[6], ang0AndT[6];
+
+	for(PxU32 i=0;i<rowCount;i++)
+	{
+		Vec4V l0AndG = V4LoadA(&row[i]->linear0.x);		// linear0 and geometric error
+		Vec4V a0AndT = V4LoadA(&row[i]->angular0.x);	// angular0 and velocity target
+
+		Vec3V l1 = V3FromV4(V4LoadA(&row[i]->linear1.x));
+		Vec3V a1 = V3FromV4(V4LoadA(&row[i]->angular1.x));
+
+		Vec4V angSqrtL0 = V4LoadA(&angSqrtInvInertia0[i].x);
+		Vec4V angSqrtL1 = V4LoadA(&angSqrtInvInertia1[i].x);
+
+		// Equality rows are only reduced against the equality rows before them; the remaining rows
+		// are reduced against all equality rows.
+		PxU32 eliminationRows = PxMin<PxU32>(i, eqRowCount);
+		for(PxU32 j=0;j<eliminationRows;j++)
+		{
+			// t = projection coefficient of the current row onto equality row j
+			const Vec3V s0 = V3MulAdd(l1, lin1m[j], V3FromV4Unsafe(V4Mul(l0AndG, lin0m[j])));
+			const Vec3V s1 = V3MulAdd(V3FromV4Unsafe(angSqrtL1), ang1m[j], V3FromV4Unsafe(V4Mul(angSqrtL0, ang0m[j])));
+			FloatV t = V3SumElems(V3Add(s0, s1));
+
+			// remove t times row j from every component of the current row
+			// (V4NegScaleSub(a, b, c) presumably evaluates c - a*b - TODO confirm)
+			l0AndG = V4NegScaleSub(lin0AndG[j], t, l0AndG);
+			a0AndT = V4NegScaleSub(ang0AndT[j], t, a0AndT);
+			l1 = V3NegScaleSub(lin1[j], t, l1);
+			a1 = V3NegScaleSub(ang1[j], t, a1);
+			angSqrtL0 = V4NegScaleSub(V4LoadA(&angSqrtInvInertia0[j].x), t, angSqrtL0);
+			angSqrtL1 = V4NegScaleSub(V4LoadA(&angSqrtInvInertia1[j].x), t, angSqrtL1);
+		}
+
+		// write the reduced row back
+		V4StoreA(l0AndG, &row[i]->linear0.x);
+		V4StoreA(a0AndT, &row[i]->angular0.x);
+		V3StoreA(l1, row[i]->linear1);
+		V3StoreA(a1, row[i]->angular1);
+		V4StoreA(angSqrtL0, &angSqrtInvInertia0[i].x);
+		V4StoreA(angSqrtL1, &angSqrtInvInertia1[i].x);
+
+		// an equality row becomes part of the basis that later rows are reduced against
+		if(i<eqRowCount)
+		{
+			lin0AndG[i] = l0AndG;	
+			ang0AndT[i] = a0AndT;
+			lin1[i] = l1;	
+			ang1[i] = a1;	
+			
+			const Vec3V l0 = V3FromV4(l0AndG);
+
+			const Vec3V l0m = V3Scale(l0, m.invMass0);
+			const Vec3V l1m = V3Scale(l1, m.invMass1);
+			const Vec4V a0m = V4Scale(angSqrtL0, m.invInertiaScale0);
+			const Vec4V a1m = V4Scale(angSqrtL1, m.invInertiaScale1);
+
+			// s = squared norm of the row under the inverse-mass metric, a = its reciprocal
+			const Vec3V s0 = V3MulAdd(l0, l0m, V3Mul(l1, l1m));
+			const Vec4V s1 = V4MulAdd(a0m, angSqrtL0, V4Mul(a1m, angSqrtL1));
+			const FloatV s = V3SumElems(V3Add(s0, V3FromV4Unsafe(s1)));
+			const FloatV a = FSel(FIsGrtr(s, zero), FRecip(s), zero);	// with mass scaling, it's possible for the inner product of a row to be zero
+
+			// W is cleared so the fourth lane of a later row never contributes to its coefficient
+			lin0m[i] = V4Scale(V4ClearW(V4FromV3(l0m)), a);	
+			ang0m[i] = V4Scale(V4ClearW(a0m), a);
+			lin1m[i] = V3Scale(l1m, a);
+			ang1m[i] = V3Scale(V3FromV4Unsafe(a1m), a);
+		}
+	}
+}
+}
+
+
+// Sorts the rows of a constraint by solveHint and prepares them for the solver.
+//
+// sorted               : out - rowCount pointers into rows, ordered by ascending solveHint.
+// rows                 : the constraint rows; forInternalUse is set on each, and the rows may be recombined
+//                        in place by the preprocessing below.
+// angSqrtInvInertia0/1 : out - 16-byte aligned arrays; entry i receives the body's sqrtInvInertia applied to
+//                        sorted[i]'s angular0 / angular1.
+// rowCount             : number of rows; 0 is tolerated.
+// bd0, bd1             : solver data of the two bodies.
+// ims                  : inverse mass/inertia scales of the constraint.
+// disablePreprocessing : when true, return after the sort and the angular pre-multiplication.
+// diagonalizeDrive     : when true, groups with major id 1 are diagonalized.
+//
+// Rows are processed in groups that share the same major id (solveHint >> 8): groups with major id 4 are
+// orthogonalized, treating the leading rows whose low solveHint byte is 0 as equality rows.
+void preprocessRows(Px1DConstraint** sorted, 
+					Px1DConstraint* rows,
+					PxVec4* angSqrtInvInertia0,
+					PxVec4* angSqrtInvInertia1,
+					PxU32 rowCount,
+					const PxSolverBodyData& bd0,
+					const PxSolverBodyData& bd1,
+					const PxConstraintInvMassScale& ims,
+					bool disablePreprocessing,
+					bool diagonalizeDrive)
+{
+	// j is maxed at 12, typically around 7, so insertion sort is fine
+	for(PxU32 i=0; i<rowCount; i++)
+	{
+		Px1DConstraint* r = rows+i;
+		
+		PxU32 j = i;
+		for(;j>0 && r->solveHint < sorted[j-1]->solveHint; j--)
+			sorted[j] = sorted[j-1];
+
+		sorted[j] = r;
+	}
+
+	// i+1<rowCount rather than i<rowCount-1: the latter wraps around for rowCount == 0
+	for(PxU32 i=0;i+1<rowCount;i++)
+		PX_ASSERT(sorted[i]->solveHint <= sorted[i+1]->solveHint);
+
+	// rows flagged eKEEPBIAS remember their geometric error, all others get 0
+	for (PxU32 i = 0; i<rowCount; i++)
+		rows[i].forInternalUse = rows[i].flags & Px1DConstraintFlag::eKEEPBIAS ? rows[i].geometricError : 0;
+
+
+	const Mat33V sqrtInvInertia0 = Mat33V(V3LoadU(bd0.sqrtInvInertia.column0), V3LoadU(bd0.sqrtInvInertia.column1),
+		V3LoadU(bd0.sqrtInvInertia.column2));
+
+	const Mat33V sqrtInvInertia1 = Mat33V(V3LoadU(bd1.sqrtInvInertia.column0), V3LoadU(bd1.sqrtInvInertia.column1),
+		V3LoadU(bd1.sqrtInvInertia.column2));
+
+	// the aligned stores below require 16-byte aligned output arrays
+	PX_ASSERT(((uintptr_t(angSqrtInvInertia0)) & 0xF) == 0);
+	PX_ASSERT(((uintptr_t(angSqrtInvInertia1)) & 0xF) == 0);
+
+	for(PxU32 i = 0; i < rowCount; ++i)
+	{
+		const Vec3V angDelta0 = M33MulV3(sqrtInvInertia0, V3LoadU(sorted[i]->angular0));
+		const Vec3V angDelta1 = M33MulV3(sqrtInvInertia1, V3LoadU(sorted[i]->angular1));
+		V4StoreA(Vec4V_From_Vec3V(angDelta0), &angSqrtInvInertia0[i].x);
+		V4StoreA(Vec4V_From_Vec3V(angDelta1), &angSqrtInvInertia1[i].x);
+	}
+
+	if(disablePreprocessing)
+		return;
+
+	MassProps m(bd0, bd1, ims);
+	for(PxU32 i=0;i<rowCount;)
+	{
+		// [start, i) becomes the run of rows sharing the same major id
+		const PxU32 groupMajorId = PxU32(sorted[i]->solveHint>>8), start = i++;
+		while(i<rowCount && PxU32(sorted[i]->solveHint>>8) == groupMajorId)
+			i++;
+
+		if(groupMajorId == 4)
+		{
+			PxU32 bCount = start;		// count of bilateral constraints 
+			for(; bCount<i && (sorted[bCount]->solveHint&255)==0; bCount++)
+				;
+			orthogonalize(sorted+start, angSqrtInvInertia0+start, angSqrtInvInertia1+start, i-start, bCount-start, m);
+		}
+
+		if(groupMajorId == 1 && diagonalizeDrive)
+		{			
+			// index of the first row in the group whose low solveHint byte is 2
+			// (presumably the slerp drive rows, going by the variable name - TODO confirm)
+			PxU32 slerp = start;
+			for(; slerp<i && (sorted[slerp]->solveHint&255)!=2; slerp++)
+				;
+			if(slerp+3 == i)
+				diagonalize(sorted+slerp, angSqrtInvInertia0+slerp, angSqrtInvInertia1+slerp, m);
+
+			PX_ASSERT(i-start==3);
+			// diagonalize() reads three consecutive rows. The assert vanishes in release builds, so make sure
+			// the group really provides them instead of reading past it (and possibly past sorted[]).
+			if(i-start >= 3)
+				diagonalize(sorted+start, angSqrtInvInertia0+start, angSqrtInvInertia1+start, m);
+		}
+	}
+}
+
+
+
+
+
+PxU32 ConstraintHelper::setupSolverConstraint(
+PxSolverConstraintPrepDesc& prepDesc,
+PxConstraintAllocator& allocator,
+PxReal dt, PxReal invdt)
+{
+	if (prepDesc .numRows== 0)
+		return 0;
+
+	PxSolverConstraintDesc& desc = *prepDesc.desc;
+
+	bool isExtended = desc.linkIndexA != PxSolverConstraintDesc::NO_LINK
+		|| desc.linkIndexB != PxSolverConstraintDesc::NO_LINK;
+
+	PxU32 stride = isExtended ? sizeof(SolverConstraint1DExt) : sizeof(SolverConstraint1D);
+	const PxU32 constraintLength = sizeof(SolverConstraint1DHeader) + stride * prepDesc.numRows;
+	
+	//KS - +16 is for the constraint progress counter, which needs to be the last element in the constraint (so that we
+	//know SPU DMAs have completed)
+	PxU8* ptr = allocator.reserveConstraintData(constraintLength + 16u);
+	if(NULL == ptr || (reinterpret_cast<PxU8*>(-1))==ptr)
+	{
+		if(NULL==ptr)
+		{
+			PX_WARN_ONCE(
+				"Reached limit set by PxSceneDesc::maxNbContactDataBlocks - ran out of buffer space for constraint prep. "
+				"Either accept joints detaching/exploding or increase buffer size allocated for constraint prep by increasing PxSceneDesc::maxNbContactDataBlocks.");
+			return 0;
+		}
+		else
+		{
+			PX_WARN_ONCE(
+				"Attempting to allocate more than 16K of constraint data. "
+				"Either accept joints detaching/exploding or simplify constraints.");
+			ptr=NULL;
+			return 0;
+		}
+	}
+	desc.constraint = ptr;
+
+	setConstraintLength(desc,constraintLength);
+
+	desc.writeBack = prepDesc.writeback;
+	setWritebackLength(desc, sizeof(ConstraintWriteback));
+
+	memset(desc.constraint, 0, constraintLength);
+
+	SolverConstraint1DHeader* header = reinterpret_cast<SolverConstraint1DHeader*>(desc.constraint);
+	PxU8* constraints = desc.constraint + sizeof(SolverConstraint1DHeader);
+	init(*header, Ps::to8(prepDesc.numRows), isExtended, prepDesc.mInvMassScales);
+	header->body0WorldOffset = prepDesc.body0WorldOffset;
+	header->linBreakImpulse = prepDesc.linBreakForce * dt;
+	header->angBreakImpulse = prepDesc.angBreakForce * dt;
+	header->breakable = PxU8((prepDesc.linBreakForce != PX_MAX_F32) || (prepDesc.angBreakForce != PX_MAX_F32));
+	header->invMass0D0 = prepDesc.data0->invMass * prepDesc.mInvMassScales.linear0;
+	header->invMass1D1 = prepDesc.data1->invMass * prepDesc.mInvMassScales.linear1;
+
+
+	PX_ALIGN(16, PxVec4) angSqrtInvInertia0[MAX_CONSTRAINT_ROWS];
+	PX_ALIGN(16, PxVec4) angSqrtInvInertia1[MAX_CONSTRAINT_ROWS];
+	
+	Px1DConstraint* sorted[MAX_CONSTRAINT_ROWS];
+
+	preprocessRows(sorted, prepDesc.rows, angSqrtInvInertia0, angSqrtInvInertia1, prepDesc.numRows, *prepDesc.data0, *prepDesc.data1, prepDesc.mInvMassScales,
+		isExtended || prepDesc.disablePreprocessing, prepDesc.improvedSlerp);
+
+	const PxReal erp = 1.0f;
+	for (PxU32 i = 0; i<prepDesc.numRows; i++)
+	{
+		Ps::prefetchLine(constraints, 128);
+		SolverConstraint1D &s = *reinterpret_cast<SolverConstraint1D *>(constraints);
+		Px1DConstraint& c = *sorted[i];
+
+		PxReal driveScale = c.flags&Px1DConstraintFlag::eHAS_DRIVE_LIMIT && prepDesc.driveLimitsAreForces ? PxMin(dt, 1.0f) : 1.0f;
+
+		PxReal unitResponse;
+		PxReal normalVel = 0.0f;
+		PxReal initVel = 0.f;
+
+		if(!isExtended)
+		{
+			init(s, c.linear0, c.linear1, PxVec3(angSqrtInvInertia0[i].x, angSqrtInvInertia0[i].y, angSqrtInvInertia0[i].z),
+				PxVec3(angSqrtInvInertia1[i].x, angSqrtInvInertia1[i].y, angSqrtInvInertia1[i].z), c.minImpulse * driveScale, c.maxImpulse * driveScale);
+			s.ang0Writeback = c.angular0;
+			PxReal resp0 = s.lin0.magnitudeSquared() * prepDesc.data0->invMass * prepDesc.mInvMassScales.linear0 + s.ang0.magnitudeSquared() * prepDesc.mInvMassScales.angular0;
+			PxReal resp1 = s.lin1.magnitudeSquared() * prepDesc.data1->invMass * prepDesc.mInvMassScales.linear1 + s.ang1.magnitudeSquared() * prepDesc.mInvMassScales.angular1;
+			unitResponse = resp0 + resp1;
+			initVel = normalVel = prepDesc.data0->projectVelocity(c.linear0, c.angular0) - prepDesc.data1->projectVelocity(c.linear1, c.angular1);
+		}
+		else
+		{
+			init(s, c.linear0, c.linear1, c.angular0, c.angular1, c.minImpulse * driveScale, c.maxImpulse * driveScale);
+			SolverConstraint1DExt& e = static_cast<SolverConstraint1DExt&>(s);
+
+			const SolverExtBody eb0(reinterpret_cast<const void*>(prepDesc.body0), prepDesc.data0, desc.linkIndexA);
+			const SolverExtBody eb1(reinterpret_cast<const void*>(prepDesc.body1), prepDesc.data1, desc.linkIndexB);
+
+			const Cm::SpatialVector resp0 = createImpulseResponseVector(e.lin0, e.ang0, eb0);
+			const Cm::SpatialVector resp1 = createImpulseResponseVector(-e.lin1, -e.ang1, eb1);
+			unitResponse = getImpulseResponse(eb0, resp0, unsimdRef(e.deltaVA), prepDesc.mInvMassScales.linear0, prepDesc.mInvMassScales.angular0,
+				eb1, resp1, unsimdRef(e.deltaVB), prepDesc.mInvMassScales.linear1, prepDesc.mInvMassScales.angular1, true);
+
+			s.ang0Writeback = c.angular0;
+			s.lin0 = resp0.linear;
+			s.ang0 = resp0.angular;
+			s.lin1 = -resp1.linear;
+			s.ang1 = -resp1.angular;
+			PxReal vel0, vel1;
+			if(needsNormalVel(c) || eb0.mLinkIndex == PxSolverConstraintDesc::NO_LINK || eb1.mLinkIndex == PxSolverConstraintDesc::NO_LINK)
+			{
+				vel0 = eb0.projectVelocity(c.linear0, c.angular0);
+				vel1 = eb1.projectVelocity(c.linear1, c.angular1);
+
+				normalVel = vel0 - vel1;
+
+				//normalVel = eb0.projectVelocity(s.lin0, s.ang0) - eb1.projectVelocity(s.lin1, s.ang1);
+				if(eb0.mLinkIndex == PxSolverConstraintDesc::NO_LINK)
+					initVel = vel0;
+				else if(eb1.mLinkIndex == PxSolverConstraintDesc::NO_LINK)
+					initVel = -vel1;
+
+			}
+		}
+
+		setSolverConstants(s.constant, s.unbiasedConstant, s.velMultiplier, s.impulseMultiplier, 
+			c, normalVel, unitResponse, prepDesc.minResponseThreshold, erp, dt, invdt);
+
+		//s.targetVelocity = initVel;
+		const PxReal velBias = initVel * s.velMultiplier;
+		s.constant += velBias;
+		s.unbiasedConstant += velBias;
+
+		if(c.flags & Px1DConstraintFlag::eOUTPUT_FORCE)
+			s.flags |= DY_SC_FLAG_OUTPUT_FORCE;
+
+		constraints += stride;
+	}
+
+	//KS - Set the solve count at the end to 0 
+	*(reinterpret_cast<PxU32*>(constraints)) = 0;
+	*(reinterpret_cast<PxU32*>(constraints + 4)) = 0;
+	PX_ASSERT(desc.constraint + getConstraintLength(desc) == constraints);
+	return prepDesc.numRows;
+}
+
+// Generates the 1D rows of a single constraint by running its solver-prep shader and then hands
+// them to ConstraintHelper::setupSolverConstraint, whose result is returned.
+// Returns 0, leaving the descriptor's constraint length at 0, when the constraint has no shader.
+PxU32 SetupSolverConstraint(SolverConstraintShaderPrepDesc& shaderDesc,
+	PxSolverConstraintPrepDesc& prepDesc,
+	PxConstraintAllocator& allocator,
+	PxReal dt, PxReal invdt)
+{
+	// The low level is not expected to be handed a constraint that is already broken.
+	PX_ASSERT(!(reinterpret_cast<ConstraintWriteback*>(prepDesc.writeback)->broken));
+
+	// Start from an empty constraint so that the early-out below leaves a zero length behind.
+	setConstraintLength(*prepDesc.desc, 0);
+	if (!shaderDesc.solverPrep)
+		return 0;
+
+	// Every row is put into a known default state before the shader runs: all fields zero and the
+	// widest possible impulse range. This gives sensible defaults and keeps shaders working (albeit
+	// with a recompile) if the row format changes. It is a little wasteful, because all rows are
+	// initialised even when the shader only generates one; the shader could instead report the
+	// maximum number of rows it needs, or prep each row itself before filling it in.
+	Px1DConstraint rows[MAX_CONSTRAINT_ROWS];
+	PxMemZero(rows, sizeof(Px1DConstraint)*MAX_CONSTRAINT_ROWS);
+	for (Px1DConstraint* row = rows; row != rows + MAX_CONSTRAINT_ROWS; ++row)
+	{
+		row->minImpulse = -PX_MAX_REAL;
+		row->maxImpulse = PX_MAX_REAL;
+	}
+
+	// Inverse mass/inertia scales start out at 1 and are handed to the shader below.
+	prepDesc.mInvMassScales.linear0 = 1.f;
+	prepDesc.mInvMassScales.linear1 = 1.f;
+	prepDesc.mInvMassScales.angular0 = 1.f;
+	prepDesc.mInvMassScales.angular1 = 1.f;
+
+	// Run the shader. It is given the rows, the body 0 world offset and the mass scales, and its
+	// return value is used as the number of rows.
+	PxVec3 worldOffset0(0.f);
+	const PxU32 rowCount = (*shaderDesc.solverPrep)(rows,
+		worldOffset0,
+		MAX_CONSTRAINT_ROWS,
+		prepDesc.mInvMassScales,
+		shaderDesc.constantBlock,
+		prepDesc.bodyFrame0, prepDesc.bodyFrame1);
+
+	// Publish the shader output through the prep descriptor for the generic setup code.
+	prepDesc.body0WorldOffset = worldOffset0;
+	prepDesc.rows = rows;
+	prepDesc.numRows = rowCount;
+
+	return ConstraintHelper::setupSolverConstraint(prepDesc, allocator, dt, invdt);
+}
+
+}
+
+}
diff --git a/PhysX_3.4/Source/LowLevelDynamics/src/DyConstraintSetupBlock.cpp b/PhysX_3.4/Source/LowLevelDynamics/src/DyConstraintSetupBlock.cpp
new file mode 100644
index 00000000..5c72f36e
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelDynamics/src/DyConstraintSetupBlock.cpp
@@ -0,0 +1,535 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+#include "foundation/PxMemory.h"
+#include "DyConstraintPrep.h"
+#include "PxsRigidBody.h"
+#include "DySolverConstraint1D.h"
+#include "DySolverConstraint1D4.h"
+#include "PsSort.h"
+#include "PxcConstraintBlockStream.h"
+#include "DyArticulationContactPrep.h"
+#include "PsFoundation.h"
+namespace physx
+{
+
+namespace Dy
+{
+
+void preprocessRows(Px1DConstraint** sorted, 	// Forward declaration only (not defined in this file). sorted: output array of row pointers, read back by the caller after the call
+					Px1DConstraint* rows,	// the constraint's rows (callers here pass PxSolverConstraintPrepDesc::rows)
+					PxVec4* angSqrtInvInertia0,	// per-row output for body 0; the caller stores it as the solver row's ang0 axis
+					PxVec4* angSqrtInvInertia1,	// per-row output for body 1; the caller stores it as the solver row's ang1 axis
+					PxU32 rowCount,	// number of entries in rows (callers here pass numRows)
+					const PxSolverBodyData& bd0,	// solver data of body 0
+					const PxSolverBodyData& bd1,	// solver data of body 1
+					const PxConstraintInvMassScale& ims,	// inverse mass/inertia scales of the constraint
+					bool disablePreprocessing,
+					bool diagonalizeDrive);	// callers in this file pass PxSolverConstraintPrepDesc::improvedSlerp here
+
+
+namespace
+{
+// Fills in the solver constants of one lane of a 4-wide row. A lane whose constraint has run out of rows
+// (finished) gets all four outputs zeroed; any other lane is forwarded to setSolverConstants, with the
+// bodies' relative velocity along the row supplied only when needsNormalVel(c) asks for it (0 otherwise).
+void setConstants(PxReal& constant, PxReal& unbiasedConstant, PxReal& velMultiplier, PxReal& impulseMultiplier,
+				  const Px1DConstraint& c, PxReal unitResponse, PxReal minRowResponse, PxReal erp, PxReal dt, PxReal recipdt,
+				  const PxSolverBodyData& b0, const PxSolverBodyData& b1, const bool finished)
+{
+	if(!finished)
+	{
+		PxReal relVel = 0.0f;
+		if(needsNormalVel(c))
+			relVel = b0.projectVelocity(c.linear0, c.angular0) - b1.projectVelocity(c.linear1, c.angular1);
+		setSolverConstants(constant, unbiasedConstant, velMultiplier, impulseMultiplier,
+						   c, relVel, unitResponse, minRowResponse, erp, dt, recipdt);
+		return;
+	}
+	constant = unbiasedConstant = velMultiplier = impulseMultiplier = 0.f;
+}
+}
+
+SolverConstraintPrepState::Enum setupSolverConstraint4	// forward declaration of the row-based overload defined later in this file
+		(PxSolverConstraintPrepDesc* PX_RESTRICT constraintDescs,	// the 4 prep descriptors of the batch
+		const PxReal dt, const PxReal recipdt, PxU32& totalRows,	// totalRows is an output
+		PxConstraintAllocator& allocator, PxU32 maxRows);	// maxRows: number of 4-wide rows to build
+
+// Shader-driven overload: runs the solver-prep shader of each of the 4 constraints, packing their rows
+// back to back, then forwards to the row-based overload. Returns eUNBATCHABLE if any constraint has
+// no shader or generates no rows; totalRows is reset to 0 on entry.
+SolverConstraintPrepState::Enum setupSolverConstraint4
+(SolverConstraintShaderPrepDesc* PX_RESTRICT constraintShaderDescs,
+PxSolverConstraintPrepDesc* PX_RESTRICT constraintDescs,
+const PxReal dt, const PxReal recipdt, PxU32& totalRows,
+PxConstraintAllocator& allocator)
+{
+	//KS - we will never get here with constraints involving articulations so we don't need to stress about those in here
+	totalRows = 0;
+	Px1DConstraint allRows[MAX_CONSTRAINT_ROWS * 4];
+	PxU32 numRows = 0;	// rows generated so far, i.e. where the current constraint's rows start
+	PxU32 maxRows = 0;	// largest row count generated by any of the constraints
+
+	// Rows [0, preppedIndex) of the current window are still in the default state written for the
+	// previous constraint (consecutive windows overlap), so only the remainder needs resetting.
+	PxU32 preppedIndex = 0;
+
+	for (PxU32 a = 0; a < 4; ++a)
+	{
+		Px1DConstraint* rows = allRows + numRows;
+		SolverConstraintShaderPrepDesc& shaderDesc = constraintShaderDescs[a];
+		PxSolverConstraintPrepDesc& desc = constraintDescs[a];
+
+		if (!shaderDesc.solverPrep)
+			return SolverConstraintPrepState::eUNBATCHABLE;
+
+		// Reset only rows [preppedIndex, MAX_CONSTRAINT_ROWS): zeroing a full MAX_CONSTRAINT_ROWS rows
+		// from rows + preppedIndex would clear memory beyond this window that is reset again later anyway.
+		PxMemZero(rows + preppedIndex, sizeof(Px1DConstraint)*(MAX_CONSTRAINT_ROWS - preppedIndex));
+		for (PxU32 b = preppedIndex; b < MAX_CONSTRAINT_ROWS; ++b)
+		{
+			Px1DConstraint& c = rows[b];
+			c.minImpulse = -PX_MAX_REAL;
+			c.maxImpulse = PX_MAX_REAL;
+		}
+
+		desc.mInvMassScales.linear0 = desc.mInvMassScales.linear1 = desc.mInvMassScales.angular0 = desc.mInvMassScales.angular1 = 1.f;
+		desc.body0WorldOffset = PxVec3(0.f);
+
+		PxU32 constraintCount = (*shaderDesc.solverPrep)(rows,
+			desc.body0WorldOffset,
+			MAX_CONSTRAINT_ROWS,
+			desc.mInvMassScales,
+			shaderDesc.constantBlock,
+			desc.bodyFrame0, desc.bodyFrame1);
+
+		// The next window starts constraintCount rows on; its leading rows are this window's tail (assumed untouched by the shader).
+		preppedIndex = MAX_CONSTRAINT_ROWS - constraintCount;
+		maxRows = PxMax(constraintCount, maxRows);
+		if (constraintCount == 0)
+			return SolverConstraintPrepState::eUNBATCHABLE;
+
+		desc.rows = rows;
+		desc.numRows = constraintCount;
+		numRows += constraintCount;
+	}
+
+	return setupSolverConstraint4(constraintDescs, dt, recipdt, totalRows, allocator, maxRows);
+}
+
+SolverConstraintPrepState::Enum setupSolverConstraint4	// row-based overload: packs the rows of 4 constraints into one 4-wide block constraint
+(PxSolverConstraintPrepDesc* PX_RESTRICT constraintDescs,	// 4 prep descriptors whose rows/numRows are already filled in
+const PxReal dt, const PxReal recipdt, PxU32& totalRows,	// totalRows: set to the summed row count once the allocation has succeeded
+PxConstraintAllocator& allocator, PxU32 maxRows)	// maxRows: number of 4-wide rows to emit (the shader overload passes the largest numRows)
+{	// Returns eOUT_OF_MEMORY if the constraint data cannot be reserved, eSUCCESS otherwise.
+	const Vec4V zero = V4Zero();
+	Px1DConstraint* allSorted[MAX_CONSTRAINT_ROWS * 4];
+	PxU32 startIndex[4];
+	PX_ALIGN(16, PxVec4) angSqrtInvInertia0[MAX_CONSTRAINT_ROWS * 4];
+	PX_ALIGN(16, PxVec4) angSqrtInvInertia1[MAX_CONSTRAINT_ROWS * 4];
+	// The three arrays above hold the preprocessRows output of all 4 constraints back to back; startIndex[a] is constraint a's first slot.
+	PxU32 numRows = 0;
+
+	for (PxU32 a = 0; a < 4; ++a)
+	{
+		startIndex[a] = numRows;
+		PxSolverConstraintPrepDesc& desc = constraintDescs[a];
+		Px1DConstraint** sorted = allSorted + numRows;
+
+		preprocessRows(sorted, desc.rows, angSqrtInvInertia0 + numRows, angSqrtInvInertia1 + numRows, desc.numRows, *desc.data0, *desc.data1, desc.mInvMassScales,
+			desc.disablePreprocessing, desc.improvedSlerp);
+
+		numRows += desc.numRows;
+	}
+
+	// Output layout: one SolverConstraint1DHeader4, then maxRows SolverConstraint1DDynamic4 rows, then 16 extra bytes.
+	PxU32 stride = sizeof(SolverConstraint1DDynamic4);
+
+	
+	const PxU32 constraintLength = sizeof(SolverConstraint1DHeader4) + stride * maxRows;
+
+	//KS - +16 is for the constraint progress counter, which needs to be the last element in the constraint (so that we
+	//know SPU DMAs have completed)
+	PxU8* ptr = allocator.reserveConstraintData(constraintLength + 16u);
+	if(NULL == ptr || (reinterpret_cast<PxU8*>(-1))==ptr)	// both values are treated as failure; the warnings below tell them apart
+	{
+		for(PxU32 a = 0; a < 4; ++a)	// on failure all 4 descriptors are left without constraint data
+		{
+			PxSolverConstraintPrepDesc& desc = constraintDescs[a];
+			desc.desc->constraint = NULL;
+			setConstraintLength(*desc.desc, 0);
+			desc.desc->writeBack = desc.writeback;
+		}
+
+		if(NULL==ptr)
+		{
+			PX_WARN_ONCE(
+				"Reached limit set by PxSceneDesc::maxNbContactDataBlocks - ran out of buffer space for constraint prep. "
+				"Either accept joints detaching/exploding or increase buffer size allocated for constraint prep by increasing PxSceneDesc::maxNbContactDataBlocks.");
+			return SolverConstraintPrepState::eOUT_OF_MEMORY;
+		}
+		else
+		{
+			PX_WARN_ONCE(
+				"Attempting to allocate more than 16K of constraint data. "
+				"Either accept joints detaching/exploding or simplify constraints.");
+			ptr=NULL;
+			return SolverConstraintPrepState::eOUT_OF_MEMORY;
+		}
+	}
+	//desc.constraint = ptr;
+
+	totalRows = numRows;
+	// All 4 descriptors reference the same shared block and report its full length.
+	for(PxU32 a = 0; a < 4; ++a)
+	{
+		PxSolverConstraintPrepDesc& desc = constraintDescs[a];
+		desc.desc->constraint = ptr;
+		setConstraintLength(*desc.desc, constraintLength);
+		desc.desc->writeBack = desc.writeback;
+	}
+
+	const PxReal erp[4] = { 1.0f, 1.0f, 1.0f, 1.0f};
+	//OK, now we build all 4 constraints into a single set of rows
+
+	{
+		PxU8* currPtr = ptr;
+		SolverConstraint1DHeader4* header = reinterpret_cast<SolverConstraint1DHeader4*>(currPtr);
+		currPtr += sizeof(SolverConstraint1DHeader4);
+		// bdMN: solver body data of body M (0 or 1) of constraint N.
+		const PxSolverBodyData& bd00 = *constraintDescs[0].data0;
+		const PxSolverBodyData& bd01 = *constraintDescs[1].data0;
+		const PxSolverBodyData& bd02 = *constraintDescs[2].data0;
+		const PxSolverBodyData& bd03 = *constraintDescs[3].data0;
+
+		const PxSolverBodyData& bd10 = *constraintDescs[0].data1;
+		const PxSolverBodyData& bd11 = *constraintDescs[1].data1;
+		const PxSolverBodyData& bd12 = *constraintDescs[2].data1;
+		const PxSolverBodyData& bd13 = *constraintDescs[3].data1;
+
+		//Load up masses, invInertia, velocity etc.
+
+		const Vec4V invMassScale0 = V4LoadXYZW(constraintDescs[0].mInvMassScales.linear0, constraintDescs[1].mInvMassScales.linear0, 
+			constraintDescs[2].mInvMassScales.linear0, constraintDescs[3].mInvMassScales.linear0);
+		const Vec4V invMassScale1 = V4LoadXYZW(constraintDescs[0].mInvMassScales.linear1, constraintDescs[1].mInvMassScales.linear1, 
+			constraintDescs[2].mInvMassScales.linear1, constraintDescs[3].mInvMassScales.linear1);
+
+
+		const Vec4V iMass0 = V4LoadXYZW(bd00.invMass, bd01.invMass, bd02.invMass, bd03.invMass);
+
+		const Vec4V iMass1 = V4LoadXYZW(bd10.invMass, bd11.invMass, bd12.invMass, bd13.invMass);
+		// Per-lane inverse masses with the constraint's linear mass scales applied.
+		const Vec4V invMass0 = V4Mul(iMass0, invMassScale0);
+		const Vec4V invMass1 = V4Mul(iMass1, invMassScale1);
+
+
+		const Vec4V invInertiaScale0 = V4LoadXYZW(constraintDescs[0].mInvMassScales.angular0, constraintDescs[1].mInvMassScales.angular0, 
+			constraintDescs[2].mInvMassScales.angular0, constraintDescs[3].mInvMassScales.angular0);
+		const Vec4V invInertiaScale1 = V4LoadXYZW(constraintDescs[0].mInvMassScales.angular1, constraintDescs[1].mInvMassScales.angular1, 
+			constraintDescs[2].mInvMassScales.angular1, constraintDescs[3].mInvMassScales.angular1);
+
+		//Velocities (linVelNM / angVelNM: constraint N, body M)
+		Vec4V linVel00 = V4LoadA(&bd00.linearVelocity.x);
+		Vec4V linVel01 = V4LoadA(&bd10.linearVelocity.x);
+		Vec4V angVel00 = V4LoadA(&bd00.angularVelocity.x);
+		Vec4V angVel01 = V4LoadA(&bd10.angularVelocity.x);
+
+		Vec4V linVel10 = V4LoadA(&bd01.linearVelocity.x);
+		Vec4V linVel11 = V4LoadA(&bd11.linearVelocity.x);
+		Vec4V angVel10 = V4LoadA(&bd01.angularVelocity.x);
+		Vec4V angVel11 = V4LoadA(&bd11.angularVelocity.x);
+
+		Vec4V linVel20 = V4LoadA(&bd02.linearVelocity.x);
+		Vec4V linVel21 = V4LoadA(&bd12.linearVelocity.x);
+		Vec4V angVel20 = V4LoadA(&bd02.angularVelocity.x);
+		Vec4V angVel21 = V4LoadA(&bd12.angularVelocity.x);
+
+		Vec4V linVel30 = V4LoadA(&bd03.linearVelocity.x);
+		Vec4V linVel31 = V4LoadA(&bd13.linearVelocity.x);
+		Vec4V angVel30 = V4LoadA(&bd03.angularVelocity.x);
+		Vec4V angVel31 = V4LoadA(&bd13.angularVelocity.x);
+
+		// Transposed velocities of body 0 / body 1: T0/T1/T2 are paired with the X/Y/Z row components further down.
+		Vec4V linVel0T0, linVel0T1, linVel0T2;
+		Vec4V linVel1T0, linVel1T1, linVel1T2;
+		Vec4V angVel0T0, angVel0T1, angVel0T2;
+		Vec4V angVel1T0, angVel1T1, angVel1T2;
+
+
+		PX_TRANSPOSE_44_34(linVel00, linVel10, linVel20, linVel30, linVel0T0, linVel0T1, linVel0T2);
+		PX_TRANSPOSE_44_34(linVel01, linVel11, linVel21, linVel31, linVel1T0, linVel1T1, linVel1T2);
+		PX_TRANSPOSE_44_34(angVel00, angVel10, angVel20, angVel30, angVel0T0, angVel0T1, angVel0T2);
+		PX_TRANSPOSE_44_34(angVel01, angVel11, angVel21, angVel31, angVel1T0, angVel1T1, angVel1T2);
+
+
+
+		//body world offsets
+		Vec4V workOffset0 = Vec4V_From_Vec3V(V3LoadU(constraintDescs[0].body0WorldOffset));
+		Vec4V workOffset1 = Vec4V_From_Vec3V(V3LoadU(constraintDescs[1].body0WorldOffset));
+		Vec4V workOffset2 = Vec4V_From_Vec3V(V3LoadU(constraintDescs[2].body0WorldOffset));
+		Vec4V workOffset3 = Vec4V_From_Vec3V(V3LoadU(constraintDescs[3].body0WorldOffset));
+
+		Vec4V workOffsetX, workOffsetY, workOffsetZ;
+
+		PX_TRANSPOSE_44_34(workOffset0, workOffset1, workOffset2, workOffset3, workOffsetX, workOffsetY, workOffsetZ);
+
+		const FloatV dtV = FLoad(dt);
+		Vec4V linBreakForce = V4LoadXYZW(constraintDescs[0].linBreakForce, constraintDescs[1].linBreakForce,
+			constraintDescs[2].linBreakForce, constraintDescs[3].linBreakForce);
+		Vec4V angBreakForce = V4LoadXYZW(constraintDescs[0].angBreakForce, constraintDescs[1].angBreakForce,
+			constraintDescs[2].angBreakForce, constraintDescs[3].angBreakForce);
+
+		// A constraint is flagged breakable when either of its break forces differs from PX_MAX_F32.
+		header->break0 = PxU8((constraintDescs[0].linBreakForce != PX_MAX_F32) || (constraintDescs[0].angBreakForce != PX_MAX_F32));
+		header->break1 = PxU8((constraintDescs[1].linBreakForce != PX_MAX_F32) || (constraintDescs[1].angBreakForce != PX_MAX_F32));
+		header->break2 = PxU8((constraintDescs[2].linBreakForce != PX_MAX_F32) || (constraintDescs[2].angBreakForce != PX_MAX_F32));
+		header->break3 = PxU8((constraintDescs[3].linBreakForce != PX_MAX_F32) || (constraintDescs[3].angBreakForce != PX_MAX_F32));
+
+
+		//OK, I think that's everything loaded in
+
+		header->invMass0D0 = invMass0;
+		header->invMass1D1 = invMass1;
+		header->angD0 = invInertiaScale0;
+		header->angD1 = invInertiaScale1;
+		header->body0WorkOffsetX = workOffsetX;
+		header->body0WorkOffsetY = workOffsetY;
+		header->body0WorkOffsetZ = workOffsetZ;
+		// count is the number of 4-wide rows; count0..count3 below are the row counts of the individual constraints.
+		header->count = maxRows;
+		header->type = DY_SC_TYPE_BLOCK_1D;
+		header->linBreakImpulse = V4Scale(linBreakForce, dtV);	// break forces scaled by dt
+		header->angBreakImpulse = V4Scale(angBreakForce, dtV);
+		header->count0 = Ps::to8(constraintDescs[0].numRows);
+		header->count1 = Ps::to8(constraintDescs[1].numRows);
+		header->count2 = Ps::to8(constraintDescs[2].numRows);
+		header->count3 = Ps::to8(constraintDescs[3].numRows);
+
+		//Now we loop over the constraints and build the results...
+		// indexN is the cursor into constraint N's sorted rows and endIndexN its last valid slot.
+		PxU32 index0 = 0;
+		PxU32 endIndex0 = constraintDescs[0].numRows - 1;	// NOTE(review): wraps around if numRows == 0; the shader-driven overload rejects that case before calling - confirm for other callers
+		PxU32 index1 = startIndex[1];
+		PxU32 endIndex1 = index1 + constraintDescs[1].numRows - 1;
+		PxU32 index2 = startIndex[2];
+		PxU32 endIndex2 = index2 + constraintDescs[2].numRows - 1;
+		PxU32 index3 = startIndex[3];
+		PxU32 endIndex3 = index3 + constraintDescs[3].numRows - 1;
+
+		const FloatV one = FOne();
+
+		for(PxU32 a = 0; a < maxRows; ++a)
+		{	
+			SolverConstraint1DDynamic4* c = reinterpret_cast<SolverConstraint1DDynamic4*>(currPtr);
+			currPtr += stride;
+			// Current source row of each of the 4 constraints (one per lane).
+			Px1DConstraint* con0 = allSorted[index0];
+			Px1DConstraint* con1 = allSorted[index1];
+			Px1DConstraint* con2 = allSorted[index2];
+			Px1DConstraint* con3 = allSorted[index3];
+
+			Vec4V cangDelta00 = V4LoadA(&angSqrtInvInertia0[index0].x);
+			Vec4V cangDelta01 = V4LoadA(&angSqrtInvInertia0[index1].x);
+			Vec4V cangDelta02 = V4LoadA(&angSqrtInvInertia0[index2].x);
+			Vec4V cangDelta03 = V4LoadA(&angSqrtInvInertia0[index3].x);
+
+			Vec4V cangDelta10 = V4LoadA(&angSqrtInvInertia1[index0].x);
+			Vec4V cangDelta11 = V4LoadA(&angSqrtInvInertia1[index1].x);
+			Vec4V cangDelta12 = V4LoadA(&angSqrtInvInertia1[index2].x);
+			Vec4V cangDelta13 = V4LoadA(&angSqrtInvInertia1[index3].x);
+			// Advance each cursor but stop at the last row, so a constraint with fewer than maxRows rows keeps re-reading its last row.
+			index0 = index0 == endIndex0 ? index0 : index0 + 1;
+			index1 = index1 == endIndex1 ? index1 : index1 + 1;
+			index2 = index2 == endIndex2 ? index2 : index2 + 1;
+			index3 = index3 == endIndex3 ? index3 : index3 + 1;
+			// Rows flagged eHAS_DRIVE_LIMIT on constraints with driveLimitsAreForces get their impulse range scaled by min(1, dt).
+			Vec4V driveScale = V4Splat(one);
+			if (con0->flags&Px1DConstraintFlag::eHAS_DRIVE_LIMIT && constraintDescs[0].driveLimitsAreForces)
+				driveScale = V4SetX(driveScale, FMin(one, dtV));
+			if (con1->flags&Px1DConstraintFlag::eHAS_DRIVE_LIMIT && constraintDescs[1].driveLimitsAreForces)
+				driveScale = V4SetY(driveScale, FMin(one, dtV));
+			if (con2->flags&Px1DConstraintFlag::eHAS_DRIVE_LIMIT && constraintDescs[2].driveLimitsAreForces)
+				driveScale = V4SetZ(driveScale, FMin(one, dtV));
+			if (con3->flags&Px1DConstraintFlag::eHAS_DRIVE_LIMIT && constraintDescs[3].driveLimitsAreForces)
+				driveScale = V4SetW(driveScale, FMin(one, dtV));
+
+
+			Vec4V clin00 = V4LoadA(&con0->linear0.x);
+			Vec4V clin01 = V4LoadA(&con1->linear0.x);
+			Vec4V clin02 = V4LoadA(&con2->linear0.x);
+			Vec4V clin03 = V4LoadA(&con3->linear0.x);
+
+			Vec4V cang00 = V4LoadA(&con0->angular0.x);
+			Vec4V cang01 = V4LoadA(&con1->angular0.x);
+			Vec4V cang02 = V4LoadA(&con2->angular0.x);
+			Vec4V cang03 = V4LoadA(&con3->angular0.x);
+
+			Vec4V clin0X, clin0Y, clin0Z;
+			Vec4V cang0X, cang0Y, cang0Z;
+			
+			PX_TRANSPOSE_44_34(clin00, clin01, clin02, clin03, clin0X, clin0Y, clin0Z);
+			PX_TRANSPOSE_44_34(cang00, cang01, cang02, cang03, cang0X, cang0Y, cang0Z);
+			
+			const Vec4V maxImpulse = V4LoadXYZW(con0->maxImpulse, con1->maxImpulse, con2->maxImpulse, con3->maxImpulse);
+			const Vec4V minImpulse = V4LoadXYZW(con0->minImpulse, con1->minImpulse, con2->minImpulse, con3->minImpulse);
+
+			Vec4V angDelta0X, angDelta0Y, angDelta0Z;
+
+			PX_TRANSPOSE_44_34(cangDelta00, cangDelta01, cangDelta02, cangDelta03, angDelta0X, angDelta0Y, angDelta0Z);
+
+			c->flags[0] = 0;
+			c->flags[1] = 0;
+			c->flags[2] = 0;
+			c->flags[3] = 0;
+
+			c->lin0X = clin0X;
+			c->lin0Y = clin0Y;
+			c->lin0Z = clin0Z;
+			c->ang0X = angDelta0X;	// the solver's angular axis is the preprocessRows output (angSqrtInvInertia0) ...
+			c->ang0Y = angDelta0Y;
+			c->ang0Z = angDelta0Z;
+			c->ang0WritebackX = cang0X;	// ... while the row's own angular0 is stored in the writeback fields
+			c->ang0WritebackY = cang0Y;
+			c->ang0WritebackZ = cang0Z;
+
+			c->minImpulse = V4Mul(minImpulse, driveScale);
+			c->maxImpulse = V4Mul(maxImpulse, driveScale);
+			c->appliedForce = zero;
+			// Body 0 part of the unit response: |lin0|^2 * invMass0 + |angDelta0|^2 * invInertiaScale0 (despite its name, cang0DotAngDelta is angDelta0 . angDelta0).
+			const Vec4V lin0MagSq = V4MulAdd(clin0Z, clin0Z, V4MulAdd(clin0Y, clin0Y, V4Mul(clin0X, clin0X)));
+			const Vec4V cang0DotAngDelta = V4MulAdd(angDelta0Z, angDelta0Z, V4MulAdd(angDelta0Y, angDelta0Y, V4Mul(angDelta0X, angDelta0X)));
+			c->flags[0] = 0;	// NOTE(review): these four stores look redundant - the flags were already cleared above
+			c->flags[1] = 0;
+			c->flags[2] = 0;
+			c->flags[3] = 0;
+
+			Vec4V unitResponse = V4MulAdd(lin0MagSq, invMass0, V4Mul(cang0DotAngDelta, invInertiaScale0));
+
+			Vec4V clin10 = V4LoadA(&con0->linear1.x);
+			Vec4V clin11 = V4LoadA(&con1->linear1.x);
+			Vec4V clin12 = V4LoadA(&con2->linear1.x);
+			Vec4V clin13 = V4LoadA(&con3->linear1.x);
+
+			Vec4V cang10 = V4LoadA(&con0->angular1.x);
+			Vec4V cang11 = V4LoadA(&con1->angular1.x);
+			Vec4V cang12 = V4LoadA(&con2->angular1.x);
+			Vec4V cang13 = V4LoadA(&con3->angular1.x);
+
+			Vec4V clin1X, clin1Y, clin1Z;
+			Vec4V cang1X, cang1Y, cang1Z;
+			PX_TRANSPOSE_44_34(clin10, clin11, clin12, clin13, clin1X, clin1Y, clin1Z);
+			PX_TRANSPOSE_44_34(cang10, cang11, cang12, cang13, cang1X, cang1Y, cang1Z);
+
+			Vec4V angDelta1X, angDelta1Y, angDelta1Z;
+
+			PX_TRANSPOSE_44_34(cangDelta10, cangDelta11, cangDelta12, cangDelta13, angDelta1X, angDelta1Y, angDelta1Z);
+
+			const Vec4V lin1MagSq = V4MulAdd(clin1Z, clin1Z, V4MulAdd(clin1Y, clin1Y, V4Mul(clin1X, clin1X)));
+			const Vec4V cang1DotAngDelta = V4MulAdd(angDelta1Z, angDelta1Z, V4MulAdd(angDelta1Y, angDelta1Y, V4Mul(angDelta1X, angDelta1X)));
+
+			c->lin1X = clin1X;
+			c->lin1Y = clin1Y;
+			c->lin1Z = clin1Z;
+
+			c->ang1X = angDelta1X;
+			c->ang1Y = angDelta1Y;
+			c->ang1Z = angDelta1Z;
+			// Add the body 1 part of the unit response.
+			unitResponse = V4Add(unitResponse, V4MulAdd(lin1MagSq, invMass1, V4Mul(cang1DotAngDelta, invInertiaScale1)));
+			// Relative velocity along the row for all 4 lanes, projected onto the rows' own linear/angular axes (cang*, not angDelta*).
+			Vec4V linProj0(V4Mul(clin0X, linVel0T0));
+			Vec4V linProj1(V4Mul(clin1X, linVel1T0));
+			Vec4V angProj0(V4Mul(cang0X, angVel0T0));
+			Vec4V angProj1(V4Mul(cang1X, angVel1T0));
+
+			linProj0 = V4MulAdd(clin0Y, linVel0T1, linProj0);
+			linProj1 = V4MulAdd(clin1Y, linVel1T1, linProj1);
+			angProj0 = V4MulAdd(cang0Y, angVel0T1, angProj0);
+			angProj1 = V4MulAdd(cang1Y, angVel1T1, angProj1);
+			
+			linProj0 = V4MulAdd(clin0Z, linVel0T2, linProj0);
+			linProj1 = V4MulAdd(clin1Z, linVel1T2, linProj1);
+			angProj0 = V4MulAdd(cang0Z, angVel0T2, angProj0);
+			angProj1 = V4MulAdd(cang1Z, angVel1T2, angProj1);
+
+			const Vec4V projectVel0 = V4Add(linProj0, angProj0);
+			const Vec4V projectVel1 = V4Add(linProj1, angProj1);
+			
+			const Vec4V normalVel = V4Sub(projectVel0, projectVel1);
+
+			// The solver constants are computed lane by lane with scalar code, writing straight into the x/y/z/w lanes of the row's vector members.
+			{
+				const PxVec4& ur				= reinterpret_cast<const PxVec4&>(unitResponse);
+				PxVec4& cConstant				= reinterpret_cast<PxVec4&>(c->constant);
+				PxVec4& cUnbiasedConstant		= reinterpret_cast<PxVec4&>(c->unbiasedConstant);
+				PxVec4& cVelMultiplier			= reinterpret_cast<PxVec4&>(c->velMultiplier);
+				PxVec4& cImpulseMultiplier		= reinterpret_cast<PxVec4&>(c->impulseMultiplier);
+				// The last argument ("finished") is true once row a lies beyond the constraint's own row count; such lanes get zeroed constants.
+				setConstants(cConstant.x, cUnbiasedConstant.x, cVelMultiplier.x, cImpulseMultiplier.x, 
+							 *con0, ur.x, constraintDescs[0].minResponseThreshold, erp[0], dt, recipdt, 
+							 *constraintDescs[0].data0, *constraintDescs[0].data1, a >= constraintDescs[0].numRows);
+
+				setConstants(cConstant.y, cUnbiasedConstant.y, cVelMultiplier.y, cImpulseMultiplier.y, 
+							 *con1, ur.y, constraintDescs[1].minResponseThreshold, erp[1], dt, recipdt, 
+							 *constraintDescs[1].data0, *constraintDescs[1].data1, a >= constraintDescs[1].numRows);
+				
+				setConstants(cConstant.z, cUnbiasedConstant.z, cVelMultiplier.z, cImpulseMultiplier.z, 
+							 *con2, ur.z, constraintDescs[2].minResponseThreshold, erp[2], dt, recipdt, 
+							 *constraintDescs[2].data0, *constraintDescs[2].data1, a >= constraintDescs[2].numRows);
+
+				setConstants(cConstant.w, cUnbiasedConstant.w, cVelMultiplier.w, cImpulseMultiplier.w, 
+							 *con3, ur.w, constraintDescs[3].minResponseThreshold, erp[3], dt, recipdt, 
+							 *constraintDescs[3].data0, *constraintDescs[3].data1, a >= constraintDescs[3].numRows);
+			}
+			// Fold velMultiplier * normalVel into both constants; finished lanes have velMultiplier == 0.
+			const Vec4V velBias = V4Mul(c->velMultiplier, normalVel);
+			c->constant = V4Add(c->constant, velBias);
+			c->unbiasedConstant = V4Add(c->unbiasedConstant, velBias);
+			// Copy each source row's eOUTPUT_FORCE request into the matching lane's flags.
+			if(con0->flags & Px1DConstraintFlag::eOUTPUT_FORCE)
+				c->flags[0] |= DY_SC_FLAG_OUTPUT_FORCE;
+			if(con1->flags & Px1DConstraintFlag::eOUTPUT_FORCE)
+				c->flags[1] |= DY_SC_FLAG_OUTPUT_FORCE;
+			if(con2->flags & Px1DConstraintFlag::eOUTPUT_FORCE)
+				c->flags[2] |= DY_SC_FLAG_OUTPUT_FORCE;
+			if(con3->flags & Px1DConstraintFlag::eOUTPUT_FORCE)
+				c->flags[3] |= DY_SC_FLAG_OUTPUT_FORCE;
+		}
+		*(reinterpret_cast<PxU32*>(currPtr)) = 0;	// zero the first 8 of the 16 extra bytes reserved after the last row
+		*(reinterpret_cast<PxU32*>(currPtr + 4)) = 0;
+	}
+	
+	//OK, we're ready to allocate and solve prep these constraints now :-)
+	return SolverConstraintPrepState::eSUCCESS;
+}
+
+}
+
+}
diff --git a/PhysX_3.4/Source/LowLevelDynamics/src/DyContactPrep.cpp b/PhysX_3.4/Source/LowLevelDynamics/src/DyContactPrep.cpp
new file mode 100644
index 00000000..1e21f1e3
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelDynamics/src/DyContactPrep.cpp
@@ -0,0 +1,725 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+     
+#include "foundation/PxPreprocessor.h"
+#include "PxSceneDesc.h"
+#include "PsVecMath.h"
+#include "PsMathUtils.h"
+#include "DySolverContact.h"
+#include "DySolverContact4.h"
+#include "DySolverConstraintTypes.h"
+#include "PxcNpWorkUnit.h"
+#include "DyThreadContext.h"
+#include "DyContactPrep.h"
+#include "PxcNpContactPrepShared.h"
+#include "PxvDynamics.h"
+#include "DyCorrelationBuffer.h"
+#include "DyDynamics.h"
+#include "DyArticulationContactPrep.h"
+#include "PxsContactManager.h"
+#include "PsFoundation.h"
+
+using namespace physx;
+using namespace Gu;
+
+
+#include "PsVecMath.h"
+#include "PxContactModifyCallback.h"
+#include "PxsMaterialManager.h"
+#include "PxsMaterialCombiner.h"
+#include "DyContactPrepShared.h"
+
+using namespace Ps::aos;
+
+namespace physx
+{
+namespace Dy
+{
+
+PxcCreateFinalizeSolverContactMethod createFinalizeMethods[3] =	// Table of the three contact-prep entry points.
+{
+	createFinalizeSolverContacts,				// [0] an overload with this name is defined later in this file
+	createFinalizeSolverContactsCoulomb1D,		// [1] not defined in this file
+	createFinalizeSolverContactsCoulomb2D		// [2] not defined in this file
+};	// NOTE(review): presumably indexed by the scene's friction type -- confirm against the callers.
+
+
+
+static void setupFinalizeSolverConstraints(Sc::ShapeInteraction* shapeInteraction,
+						    const ContactPoint* buffer,
+							const CorrelationBuffer& c,
+							const PxTransform& bodyFrame0,
+							const PxTransform& bodyFrame1,
+							PxU8* workspace,
+							const PxSolverBodyData& data0,
+							const PxSolverBodyData& data1,
+							const PxReal invDtF32,
+							PxReal bounceThresholdF32,
+							PxReal invMassScale0, PxReal invInertiaScale0, 
+							PxReal invMassScale1, PxReal invInertiaScale1, 
+							bool hasForceThreshold, bool staticOrKinematicBody,
+							const PxReal restDist, PxU8* frictionDataPtr,
+							const PxReal maxCCDSeparation)	
+{
+	// Writes the solver constraint stream for one contact pair into 'workspace': for every friction patch
+	// that has contacts, a SolverContactHeader, its SolverContactPoints, a zeroed force buffer and (when
+	// friction is enabled) two SolverContactFriction rows per anchor. Patches without contacts are skipped,
+	// so the writeback slots addressed through 'frictionDataPtr' are dense while 'c.frictionPatches' is sparse.
+	const FloatV ccdMaxSeparation = FLoad(maxCCDSeparation);
+
+	PxU8 flags = PxU8(hasForceThreshold ? SolverContactHeader::eHAS_FORCE_THRESHOLDS : 0);
+
+	PxU8* PX_RESTRICT ptr = workspace;
+
+	PxU8 type = Ps::to8(staticOrKinematicBody ? DY_SC_TYPE_STATIC_CONTACT
+									                 : DY_SC_TYPE_RB_CONTACT);
+
+	const FloatV zero=FZero();
+
+	const FloatV d0 = FLoad(invMassScale0);
+	const FloatV d1 = FLoad(invMassScale1);
+	const FloatV angD0 = FLoad(invInertiaScale0);
+	const FloatV angD1 = FLoad(invInertiaScale1);
+	
+	const FloatV nDom1fV = FNeg(d1);
+
+	const FloatV invMass0 = FLoad(data0.invMass);
+	const FloatV invMass1 = FLoad(data1.invMass);
+
+	const FloatV invMass0_dom0fV = FMul(d0, invMass0);
+	const FloatV invMass1_dom1fV = FMul(nDom1fV, invMass1);	// note: negated scaled inverse mass of body 1
+
+
+	Vec4V staticFrictionX_dynamicFrictionY_dominance0Z_dominance1W = V4Zero();
+	staticFrictionX_dynamicFrictionY_dominance0Z_dominance1W=V4SetZ(staticFrictionX_dynamicFrictionY_dominance0Z_dominance1W, invMass0_dom0fV);
+	staticFrictionX_dynamicFrictionY_dominance0Z_dominance1W=V4SetW(staticFrictionX_dynamicFrictionY_dominance0Z_dominance1W, invMass1_dom1fV);
+
+	const FloatV restDistance = FLoad(restDist); 
+
+	const FloatV maxPenBias = FMax(FLoad(data0.penBiasClamp), FLoad(data1.penBiasClamp));
+
+	const QuatV bodyFrame0q = QuatVLoadU(&bodyFrame0.q.x);
+	const Vec3V bodyFrame0p = V3LoadU(bodyFrame0.p);
+	
+	const QuatV bodyFrame1q = QuatVLoadU(&bodyFrame1.q.x);
+	const Vec3V bodyFrame1p = V3LoadU(bodyFrame1.p);
+
+	PxU32 frictionPatchWritebackAddrIndex = 0;
+	PxU32 contactWritebackCount = 0;
+
+	Ps::prefetchLine(c.contactID);
+	Ps::prefetchLine(c.contactID, 128);
+
+	const Vec3V linVel0 = V3LoadU_SafeReadW(data0.linearVelocity);	// PT: safe because 'invMass' follows 'linearVelocity' in PxSolverBodyData
+	const Vec3V linVel1 = V3LoadU_SafeReadW(data1.linearVelocity);	// PT: safe because 'invMass' follows 'linearVelocity' in PxSolverBodyData
+	const Vec3V angVel0 = V3LoadU_SafeReadW(data0.angularVelocity);	// PT: safe because 'reportThreshold' follows 'angularVelocity' in PxSolverBodyData
+	const Vec3V angVel1 = V3LoadU_SafeReadW(data1.angularVelocity);	// PT: safe because 'reportThreshold' follows 'angularVelocity' in PxSolverBodyData
+
+	PX_ALIGN(16, const Mat33V invSqrtInertia0)
+	(
+		V3LoadU_SafeReadW(data0.sqrtInvInertia.column0),	// PT: safe because 'column1' follows 'column0' in PxMat33
+		V3LoadU_SafeReadW(data0.sqrtInvInertia.column1),	// PT: safe because 'column2' follows 'column1' in PxMat33
+		V3LoadU(data0.sqrtInvInertia.column2)
+	);
+	
+	PX_ALIGN(16, const Mat33V invSqrtInertia1)
+	(
+		V3LoadU_SafeReadW(data1.sqrtInvInertia.column0),	// PT: safe because 'column1' follows 'column0' in PxMat33
+		V3LoadU_SafeReadW(data1.sqrtInvInertia.column1),	// PT: safe because 'column2' follows 'column1' in PxMat33
+		V3LoadU(data1.sqrtInvInertia.column2)
+	);
+
+	const FloatV invDt = FLoad(invDtF32);
+	const FloatV p8 = FLoad(0.8f);
+	const FloatV bounceThreshold = FLoad(bounceThresholdF32);
+
+	const FloatV invDtp8 = FMul(invDt, p8);
+
+
+	for(PxU32 i=0;i<c.frictionPatchCount;i++)
+	{
+		PxU32 contactCount = c.frictionPatchContactCounts[i];
+		if(contactCount == 0)
+			continue;
+
+		const FrictionPatch& frictionPatch = c.frictionPatches[i];
+		PX_ASSERT(frictionPatch.anchorCount <= 2);
+		// The first contact of the patch's first contact patch supplies the patch-wide restitution, friction and normal.
+		PxU32 firstPatch = c.correlationListHeads[i];
+		const Gu::ContactPoint* contactBase0 = buffer + c.contactPatches[firstPatch].start;
+
+		const PxReal combinedRestitution = contactBase0->restitution;
+		
+		SolverContactHeader* PX_RESTRICT header = reinterpret_cast<SolverContactHeader*>(ptr);
+		ptr += sizeof(SolverContactHeader);		
+
+
+		Ps::prefetchLine(ptr, 128);
+		Ps::prefetchLine(ptr, 256);
+
+		header->shapeInteraction = shapeInteraction;
+		header->flags = flags;
+		FStore(invMass0_dom0fV, &header->invMass0);
+		FStore(FNeg(invMass1_dom1fV), &header->invMass1);	// negate again so the header holds the positive scaled value
+		const FloatV restitution = FLoad(combinedRestitution);
+	
+		PxU32 pointStride = sizeof(SolverContactPoint);
+		PxU32 frictionStride = sizeof(SolverContactFriction);
+
+		const Vec3V normal = V3LoadA(buffer[c.contactPatches[c.correlationListHeads[i]].start].normal);
+		const FloatV normalLenSq = V3LengthSq(normal);
+		const VecCrossV norCross = V3PrepareCross(normal);
+		const FloatV norVel = V3SumElems(V3NegMulSub(normal, linVel1, V3Mul(normal, linVel0)));
+
+		const FloatV invMassNorLenSq0 = FMul(invMass0_dom0fV, normalLenSq);
+		const FloatV invMassNorLenSq1 = FMul(invMass1_dom1fV, normalLenSq);
+
+		header->normal = normal;
+		// Walk the linked list of contact patches correlated with this friction patch, emitting one solver point per contact.
+		for(PxU32 patch=c.correlationListHeads[i]; 
+			patch!=CorrelationBuffer::LIST_END; 
+			patch = c.contactPatches[patch].next)
+		{
+			const PxU32 count = c.contactPatches[patch].count;
+			const Gu::ContactPoint* contactBase = buffer + c.contactPatches[patch].start;
+				
+			PxU8* p = ptr;
+			
+			for(PxU32 j=0;j<count;j++)
+			{
+				Ps::prefetchLine(p, 256);
+				const Gu::ContactPoint& contact = contactBase[j];
+
+				SolverContactPoint* PX_RESTRICT solverContact = reinterpret_cast<SolverContactPoint*>(p);
+				p += pointStride;
+
+				constructContactConstraint(invSqrtInertia0, invSqrtInertia1, invMassNorLenSq0, 
+					invMassNorLenSq1, angD0, angD1, bodyFrame0p, bodyFrame1p,
+					normal, norVel, norCross, angVel0, angVel1,
+					invDt, invDtp8, restDistance, maxPenBias,  restitution,
+					bounceThreshold, contact, *solverContact,
+					ccdMaxSeparation);
+			}
+
+			ptr = p;
+		}
+		contactWritebackCount += contactCount;
+		// Clear one float per contact for the force buffer; the stream then advances by a multiple of four floats.
+		PxF32* forceBuffers = reinterpret_cast<PxF32*>(ptr);
+		PxMemZero(forceBuffers, sizeof(PxF32) * contactCount);
+		ptr += ((contactCount + 3) & (~3)) * sizeof(PxF32); // jump to next 16-byte boundary
+
+		const PxReal staticFriction = contactBase0->staticFriction;
+		const PxReal dynamicFriction = contactBase0->dynamicFriction;
+		const bool disableStrongFriction = !!(contactBase0->materialFlags & PxMaterialFlag::eDISABLE_FRICTION);
+		staticFrictionX_dynamicFrictionY_dominance0Z_dominance1W=V4SetX(staticFrictionX_dynamicFrictionY_dominance0Z_dominance1W, FLoad(staticFriction));
+		staticFrictionX_dynamicFrictionY_dominance0Z_dominance1W=V4SetY(staticFrictionX_dynamicFrictionY_dominance0Z_dominance1W, FLoad(dynamicFriction));
+		// Friction rows are only written when the material does not disable friction and the patch has anchors.
+		const bool haveFriction = (disableStrongFriction == 0 && frictionPatch.anchorCount != 0) ;//PX_IR(n.staticFriction) > 0 || PX_IR(n.dynamicFriction) > 0;
+		header->numNormalConstr		= Ps::to8(contactCount);
+		header->numFrictionConstr	= Ps::to8(haveFriction ? frictionPatch.anchorCount*2 : 0);
+	
+		header->type				= type;
+
+		header->staticFrictionX_dynamicFrictionY_dominance0Z_dominance1W = staticFrictionX_dynamicFrictionY_dominance0Z_dominance1W;
+		FStore(angD0, &header->angDom0);
+		FStore(angD1, &header->angDom1);
+
+		header->broken = 0;
+
+		if(haveFriction)
+		{
+			const Vec3V linVrel = V3Sub(linVel0, linVel1);
+			//const Vec3V normal = Vec3V_From_PxVec3_Aligned(buffer.contacts[c.contactPatches[c.correlationListHeads[i]].start].normal);
+
+			const FloatV orthoThreshold = FLoad(0.70710678f);
+			const FloatV p1 = FLoad(0.1f);
+			// fallback: (1,0,0).cross(normal) or (0,0,1).cross(normal), chosen by comparing |normal.x| with orthoThreshold
+			const FloatV normalX = V3GetX(normal);
+			const FloatV normalY = V3GetY(normal);
+			const FloatV normalZ = V3GetZ(normal);
+			
+			Vec3V t0Fallback1 = V3Merge(zero, FNeg(normalZ), normalY);
+			Vec3V t0Fallback2 = V3Merge(FNeg(normalY), normalX, zero) ;
+			Vec3V t0Fallback = V3Sel(FIsGrtr(orthoThreshold, FAbs(normalX)), t0Fallback1, t0Fallback2);
+			// Primary tangent: the relative linear velocity with its normal component removed; the fallback is used when its squared length is not above 0.1.
+			Vec3V t0 = V3Sub(linVrel, V3Scale(normal, V3Dot(normal, linVrel)));
+			t0 = V3Sel(FIsGrtr(V3LengthSq(t0), p1), t0, t0Fallback);
+			t0 = V3Normalize(t0);
+
+			const VecCrossV t0Cross = V3PrepareCross(t0);
+
+			const Vec3V t1 = V3Cross(norCross, t0Cross);
+			const VecCrossV t1Cross = V3PrepareCross(t1);
+
+			
+			// For every anchor two friction rows are written, one along t0 and one along t1. Each row stores
+			// the tangent (with a zero applied-force W), the anchor arms crossed with the tangent and
+			// multiplied by the bodies' sqrt inverse inertia, the velocity multiplier, a bias equal to the
+			// anchor separation along the tangent times invDt, and the target velocity minus the current
+			// relative velocity along the tangent.
+
+			//The writeback pointer addresses this patch's slot in the friction patch array at 'frictionDataPtr'.
+			//The slot index counts only patches that have contacts, matching the dense order in which the
+			//caller copies friction patches into that array.
+			//NOTE(review): the original comment describes this as pointing at the patch's 'broken' flag and
+			//mentions SPU local-store vs main-memory addresses; the slot's first byte being that flag depends
+			//on the FrictionPatch layout, which is not visible here -- confirm.
+			PxU8* PX_RESTRICT writeback = frictionDataPtr + frictionPatchWritebackAddrIndex*sizeof(FrictionPatch);
+
+			header->frictionBrokenWritebackByte = writeback;
+
+			for(PxU32 j = 0; j < frictionPatch.anchorCount; j++)
+			{
+				Ps::prefetchLine(ptr, 256);
+				Ps::prefetchLine(ptr, 384);
+				SolverContactFriction* PX_RESTRICT f0 = reinterpret_cast<SolverContactFriction*>(ptr);
+				ptr += frictionStride;
+				SolverContactFriction* PX_RESTRICT f1 = reinterpret_cast<SolverContactFriction*>(ptr);
+				ptr += frictionStride;
+
+				Vec3V body0Anchor = V3LoadU(frictionPatch.body0Anchors[j]);
+				Vec3V body1Anchor = V3LoadU(frictionPatch.body1Anchors[j]);
+
+				Vec3V ra = QuatRotate(bodyFrame0q, body0Anchor);
+				Vec3V rb = QuatRotate(bodyFrame1q, body1Anchor);
+				Vec3V error =V3Sub(V3Add(ra, bodyFrame0p), V3Add(rb, bodyFrame1p));
+
+				const PxU32 index = c.contactPatches[c.correlationListHeads[i]].start;
+				const Vec3V tvel = V3LoadA(buffer[index].targetVel);
+				
+				{	// first friction row, along t0
+					const Vec3V raXn = V3Cross(ra, t0Cross);
+					const Vec3V rbXn = V3Cross(rb, t0Cross);
+
+					const Vec3V raXnSqrtInertia = M33MulV3(invSqrtInertia0, raXn);
+					const Vec3V rbXnSqrtInertia = M33MulV3(invSqrtInertia1, rbXn);	
+
+
+					const FloatV resp0 = FAdd(invMass0_dom0fV, FMul(angD0, V3Dot(raXnSqrtInertia, raXnSqrtInertia)));
+					const FloatV resp1 = FSub(FMul(angD1, V3Dot(rbXnSqrtInertia, rbXnSqrtInertia)), invMass1_dom1fV);
+					const FloatV resp = FAdd(resp0, resp1);
+
+					const FloatV velMultiplier = FSel(FIsGrtr(resp, zero), FDiv(p8, resp), zero);
+
+					FloatV targetVel = V3Dot(tvel, t0);
+
+					const FloatV vrel1 = FAdd(V3Dot(t0, linVel0), V3Dot(raXn, angVel0));
+					const FloatV vrel2 = FAdd(V3Dot(t0, linVel1), V3Dot(rbXn, angVel1));
+					const FloatV vrel = FSub(vrel1, vrel2);
+
+					targetVel = FSub(targetVel, vrel);
+
+					f0->normalXYZ_appliedForceW = V4SetW(t0, zero);
+					f0->raXnXYZ_velMultiplierW = V4SetW(raXnSqrtInertia, velMultiplier);
+					f0->rbXnXYZ_biasW = V4SetW(rbXnSqrtInertia, FMul(V3Dot(t0, error), invDt));
+					FStore(targetVel, &f0->targetVel);
+				}
+
+				{	// second friction row, along t1
+
+					const Vec3V raXn = V3Cross(ra, t1Cross);
+					const Vec3V rbXn = V3Cross(rb, t1Cross);
+
+					const Vec3V raXnSqrtInertia = M33MulV3(invSqrtInertia0, raXn);
+					const Vec3V rbXnSqrtInertia = M33MulV3(invSqrtInertia1, rbXn);	
+
+					const FloatV resp0 = FAdd(invMass0_dom0fV, FMul(angD0, V3Dot(raXnSqrtInertia, raXnSqrtInertia)));
+					const FloatV resp1 = FSub(FMul(angD1, V3Dot(rbXnSqrtInertia, rbXnSqrtInertia)), invMass1_dom1fV);
+					const FloatV resp = FAdd(resp0, resp1);
+
+					const FloatV velMultiplier = FSel(FIsGrtr(resp, zero), FDiv(p8, resp), zero);
+
+					FloatV targetVel = V3Dot(tvel, t1);
+
+					const FloatV vrel1 = FAdd(V3Dot(t1, linVel0), V3Dot(raXn, angVel0));
+					const FloatV vrel2 = FAdd(V3Dot(t1, linVel1), V3Dot(rbXn, angVel1));
+					const FloatV vrel = FSub(vrel1, vrel2);
+
+					targetVel = FSub(targetVel, vrel);
+
+					f1->normalXYZ_appliedForceW = V4SetW(t1, zero);
+					f1->raXnXYZ_velMultiplierW = V4SetW(raXnSqrtInertia, velMultiplier);
+					f1->rbXnXYZ_biasW = V4SetW(rbXnSqrtInertia, FMul(V3Dot(t1, error), invDt));
+					FStore(targetVel, &f1->targetVel);
+				}
+			}
+		}
+		// The dense writeback slot advances for every patch that had contacts, with or without friction rows.
+		frictionPatchWritebackAddrIndex++;
+	}
+}
+
+
+PX_FORCE_INLINE void computeBlockStreamByteSizes(const bool useExtContacts, const CorrelationBuffer& c,
+								PxU32& _solverConstraintByteSize, PxU32& _frictionPatchByteSize, PxU32& _numFrictionPatches,
+								PxU32& _axisConstraintCount)
+{
+	PX_ASSERT(0 == _solverConstraintByteSize);
+	PX_ASSERT(0 == _frictionPatchByteSize);
+	PX_ASSERT(0 == _numFrictionPatches);
+	PX_ASSERT(0 == _axisConstraintCount);
+	// Computes the buffer sizes and counts needed for 'c'; all results are returned through the four reference parameters, which must be 0 on entry.
+	// PT: use local vars to remove LHS
+	PxU32 solverConstraintByteSize = 0;
+	PxU32 numFrictionPatches = 0;
+	PxU32 axisConstraintCount = 0;
+
+	// 'useExtContacts' selects the Ext variants of the point and friction structs when sizing.
+	for(PxU32 i = 0; i < c.frictionPatchCount; i++)
+	{
+		//Friction patches.
+		if(c.correlationListHeads[i] != CorrelationBuffer::LIST_END)
+			numFrictionPatches++;
+
+		const FrictionPatch& frictionPatch = c.frictionPatches[i];
+
+		const bool haveFriction = (frictionPatch.materialFlags & PxMaterialFlag::eDISABLE_FRICTION) == 0;	// anchorCount may still be 0, which adds nothing below
+		// Header, contact points and force buffer are only counted for patches that have contacts.
+		//Solver constraint data.
+		if(c.frictionPatchContactCounts[i]!=0)
+		{
+			solverConstraintByteSize += sizeof(SolverContactHeader);
+			solverConstraintByteSize += useExtContacts ? c.frictionPatchContactCounts[i] * sizeof(SolverContactPointExt) 
+				: c.frictionPatchContactCounts[i] * sizeof(SolverContactPoint);
+			solverConstraintByteSize += sizeof(PxF32) * ((c.frictionPatchContactCounts[i] + 3)&(~3)); //Add on space for applied impulses
+
+			axisConstraintCount += c.frictionPatchContactCounts[i];
+
+			if(haveFriction)
+			{
+				solverConstraintByteSize += useExtContacts ? c.frictionPatches[i].anchorCount * 2 * sizeof(SolverContactFrictionExt)
+					: c.frictionPatches[i].anchorCount * 2 * sizeof(SolverContactFriction);
+				axisConstraintCount += c.frictionPatches[i].anchorCount * 2;	// two friction rows per anchor
+
+			}
+		}
+	}
+	PxU32 frictionPatchByteSize = numFrictionPatches*sizeof(FrictionPatch);
+	// Publish the counts as-is; both byte sizes are rounded up to a multiple of 16 below.
+	_numFrictionPatches = numFrictionPatches;
+	_axisConstraintCount = axisConstraintCount;
+
+	//16-byte alignment.
+	_frictionPatchByteSize = ((frictionPatchByteSize + 0x0f) & ~0x0f);
+	_solverConstraintByteSize =  ((solverConstraintByteSize + 0x0f) & ~0x0f);
+	PX_ASSERT(0 == (_solverConstraintByteSize & 0x0f));
+	PX_ASSERT(0 == (_frictionPatchByteSize & 0x0f));
+}
+
+static bool reserveBlockStreams(const bool useExtContacts, Dy::CorrelationBuffer& cBuffer,
+						PxU8*& solverConstraint,
+						FrictionPatch*& _frictionPatches,
+						PxU32& numFrictionPatches, PxU32& solverConstraintByteSize,
+						PxU32& axisConstraintCount, PxConstraintAllocator& constraintAllocator)
+{
+	PX_ASSERT(NULL == solverConstraint);
+	PX_ASSERT(NULL == _frictionPatches);
+	PX_ASSERT(0 == numFrictionPatches);
+	PX_ASSERT(0 == solverConstraintByteSize);
+	PX_ASSERT(0 == axisConstraintCount);
+	// Sizes the buffers for 'cBuffer', then reserves the constraint block and the friction patch buffer from 'constraintAllocator'; returns false if a needed reservation failed.
+	//From frictionPatchStream we just need to reserve a single buffer.
+	PxU32 frictionPatchByteSize = 0;
+	//Compute the sizes of all the buffers.
+	computeBlockStreamByteSizes(
+		useExtContacts, cBuffer,
+		solverConstraintByteSize, frictionPatchByteSize, numFrictionPatches,
+		axisConstraintCount);
+
+	//Reserve the buffers.
+
+	//First reserve the accumulated buffer size for the constraint block.
+	PxU8* constraintBlock = NULL;
+	const PxU32 constraintBlockByteSize = solverConstraintByteSize;
+	if(constraintBlockByteSize > 0)
+	{
+		constraintBlock = constraintAllocator.reserveConstraintData(constraintBlockByteSize + 16u);	// 16 spare bytes: createFinalizeSolverContacts writes a PxU32 just past the constraint data
+		// A NULL result and an all-ones pointer are both failures; the two warnings below describe what each one means.
+		if(0==constraintBlock || (reinterpret_cast<PxU8*>(-1))==constraintBlock)
+		{
+			if(0==constraintBlock)
+			{
+				PX_WARN_ONCE(
+					"Reached limit set by PxSceneDesc::maxNbContactDataBlocks - ran out of buffer space for constraint prep. "
+					"Either accept dropped contacts or increase buffer size allocated for narrow phase by increasing PxSceneDesc::maxNbContactDataBlocks.");
+			}
+			else
+			{
+				PX_WARN_ONCE(
+					"Attempting to allocate more than 16K of contact data for a single contact pair in constraint prep. "
+					"Either accept dropped contacts or simplify collision geometry.");
+				constraintBlock=NULL;	// normalise the sentinel so later checks only need to test for NULL
+			}
+		}
+	}
+
+	FrictionPatch* frictionPatches = NULL;
+	//If the constraint block reservation didn't fail then reserve the friction buffer too.
+	if(frictionPatchByteSize >0 && (0==constraintBlockByteSize || constraintBlock))
+	{
+		frictionPatches = reinterpret_cast<FrictionPatch*>(constraintAllocator.reserveFrictionData(frictionPatchByteSize));
+		// Same failure convention as the constraint block reservation above.
+		if(0==frictionPatches || (reinterpret_cast<FrictionPatch*>(-1))==frictionPatches)
+		{
+			if(0==frictionPatches)
+			{
+				PX_WARN_ONCE(
+					"Reached limit set by PxSceneDesc::maxNbContactDataBlocks - ran out of buffer space for constraint prep. "
+					"Either accept dropped contacts or increase buffer size allocated for narrow phase by increasing PxSceneDesc::maxNbContactDataBlocks.");
+			}
+			else
+			{
+				PX_WARN_ONCE(
+					"Attempting to allocate more than 16K of friction data for a single contact pair in constraint prep. "
+					"Either accept dropped contacts or simplify collision geometry.");
+				frictionPatches=NULL;
+			}
+		}
+	}
+
+	_frictionPatches = frictionPatches;
+
+	//Patch up the individual ptrs to the buffer returned by the constraint block reservation (assuming the reservation didn't fail).
+	if(0==constraintBlockByteSize || constraintBlock)
+	{
+		if(solverConstraintByteSize)
+		{
+			solverConstraint = constraintBlock;
+			PX_ASSERT(0==(uintptr_t(solverConstraint) & 0x0f));
+		}
+	}
+
+	//Return true if neither of the two block reservations failed.
+	return ((0==constraintBlockByteSize || constraintBlock) && (0==frictionPatchByteSize || frictionPatches));
+}
+
+
+bool createFinalizeSolverContacts(
+	PxSolverContactDesc& contactDesc,
+	CorrelationBuffer& c,
+	const PxReal invDtF32,
+	PxReal bounceThresholdF32,
+	PxReal frictionOffsetThreshold,
+	PxReal correlationDistance,
+	PxConstraintAllocator& constraintAllocator)
+{
+	Ps::prefetchLine(contactDesc.body0);
+	Ps::prefetchLine(contactDesc.body1);
+	Ps::prefetchLine(contactDesc.data0);
+	Ps::prefetchLine(contactDesc.data1);
+	// Builds the solver constraint and friction patch buffers for the contacts already stored in 'contactDesc'; returns false only if reserving those buffers failed.
+	c.frictionPatchCount = 0;
+	c.contactPatchCount = 0;
+
+	const bool hasForceThreshold = contactDesc.hasForceThresholds;
+	const bool staticOrKinematicBody = contactDesc.bodyState1 == PxSolverContactDesc::eKINEMATIC_BODY || contactDesc.bodyState1 == PxSolverContactDesc::eSTATIC_BODY;
+
+	const bool disableStrongFriction = contactDesc.disableStrongFriction;
+	const bool useExtContacts = ((contactDesc.bodyState0 | contactDesc.bodyState1) & PxSolverContactDesc::eARTICULATION) != 0;
+
+	PxSolverConstraintDesc& desc = *contactDesc.desc;
+
+	desc.constraintLengthOver16 = 0;
+
+	// No contacts: clear the outputs and report success without reserving anything.
+	if (contactDesc.numContacts == 0)
+	{
+		contactDesc.frictionPtr = NULL;
+		contactDesc.frictionCount = 0;
+		desc.constraint = NULL;
+		return true;
+	}
+	// NOTE(review): presumably loads the friction patches stored at contactDesc.frictionPtr into 'c' -- confirm in getFrictionPatches.
+	if (!disableStrongFriction)
+	{
+		getFrictionPatches(c, contactDesc.frictionPtr, contactDesc.frictionCount, contactDesc.bodyFrame0, contactDesc.bodyFrame1, correlationDistance);
+	}
+	// Build contact patches and correlate them with friction patches; either step can report overflow.
+	bool overflow = !createContactPatches(c, contactDesc.contacts, contactDesc.numContacts, PXC_SAME_NORMAL);
+	overflow = correlatePatches(c, contactDesc.contacts, contactDesc.bodyFrame0, contactDesc.bodyFrame1, PXC_SAME_NORMAL, 0, 0) || overflow;
+	PX_UNUSED(overflow);	// only read in PX_CHECKED builds
+
+#if PX_CHECKED
+	if (overflow)
+	{
+		Ps::getFoundation().error(physx::PxErrorCode::eDEBUG_WARNING, __FILE__, __LINE__,
+			"Dropping contacts in solver because we exceeded limit of 32 friction patches.");
+	}
+#endif
+
+	growPatches(c, contactDesc.contacts, contactDesc.bodyFrame0, contactDesc.bodyFrame1, correlationDistance, 0, frictionOffsetThreshold + contactDesc.restDistance);
+
+	//PX_ASSERT(patchCount == c.frictionPatchCount);
+	// Outputs of reserveBlockStreams; it asserts that they are NULL/0 on entry.
+	FrictionPatch* frictionPatches = NULL;
+	PxU8* solverConstraint = NULL;
+	PxU32 numFrictionPatches = 0;
+	PxU32 solverConstraintByteSize = 0;
+	PxU32 axisConstraintCount = 0;
+
+	const bool successfulReserve = reserveBlockStreams(
+		useExtContacts, c,
+		solverConstraint, frictionPatches,
+		numFrictionPatches,
+		solverConstraintByteSize,
+		axisConstraintCount,
+		constraintAllocator);
+	// initialise the work unit's ptrs to the various buffers.
+
+	contactDesc.frictionPtr = NULL;
+	contactDesc.frictionCount = 0;
+	desc.constraint = NULL;
+	desc.constraintLengthOver16 = 0;
+	// patch up the work unit with the reserved buffers and set the reserved buffer data as appropriate.
+
+	if (successfulReserve)
+	{
+		PxU8* frictionDataPtr = reinterpret_cast<PxU8*>(frictionPatches);
+		contactDesc.frictionPtr = frictionDataPtr;
+		desc.constraint = solverConstraint;
+		//output.nbContacts = Ps::to8(numContacts);
+		contactDesc.frictionCount = Ps::to8(numFrictionPatches);
+		desc.constraintLengthOver16 = Ps::to16(solverConstraintByteSize / 16);
+		desc.writeBack = contactDesc.contactForces;
+		desc.writeBackLengthOver4 = PxU16(contactDesc.contactForces ? contactDesc.numContacts : 0);
+
+		//Initialise friction buffer.
+		if (frictionPatches)
+		{
+			// PT: TODO: revisit this... not very satisfying
+			//const PxU32 maxSize = numFrictionPatches*sizeof(FrictionPatch);
+			Ps::prefetchLine(frictionPatches);
+			Ps::prefetchLine(frictionPatches, 128);
+			Ps::prefetchLine(frictionPatches, 256);
+
+			for (PxU32 i = 0; i<c.frictionPatchCount; i++)
+			{
+				//if(c.correlationListHeads[i]!=CorrelationBuffer::LIST_END)
+				if (c.frictionPatchContactCounts[i])
+				{
+					*frictionPatches++ = c.frictionPatches[i];	// compacting copy: only patches with contacts are stored
+					Ps::prefetchLine(frictionPatches, 256);
+				}
+			}
+		}
+
+		//Initialise solverConstraint buffer.
+		if (solverConstraint)
+		{
+			if (useExtContacts)	// at least one body is an articulation link
+			{
+				const PxSolverBodyData& data0 = *contactDesc.data0;
+				const PxSolverBodyData& data1 = *contactDesc.data1;
+
+				const SolverExtBody b0(reinterpret_cast<const void*>(contactDesc.body0), reinterpret_cast<const void*>(&data0), desc.linkIndexA);
+				const SolverExtBody b1(reinterpret_cast<const void*>(contactDesc.body1), reinterpret_cast<const void*>(&data1), desc.linkIndexB);
+
+				setupFinalizeExtSolverContacts(contactDesc.contacts, c, contactDesc.bodyFrame0, contactDesc.bodyFrame1, solverConstraint,
+					b0, b1, invDtF32, bounceThresholdF32,
+					contactDesc.mInvMassScales.linear0, contactDesc.mInvMassScales.angular0, contactDesc.mInvMassScales.linear1, contactDesc.mInvMassScales.angular1, 
+					contactDesc.restDistance, frictionDataPtr, contactDesc.maxCCDSeparation);
+			}
+			else
+			{
+				const PxSolverBodyData& data0 = *contactDesc.data0;
+				const PxSolverBodyData& data1 = *contactDesc.data1;
+				setupFinalizeSolverConstraints(contactDesc.shapeInteraction, contactDesc.contacts, c, contactDesc.bodyFrame0, contactDesc.bodyFrame1, solverConstraint,
+					data0, data1, invDtF32, bounceThresholdF32,
+					contactDesc.mInvMassScales.linear0, contactDesc.mInvMassScales.angular0, contactDesc.mInvMassScales.linear1, contactDesc.mInvMassScales.angular1, 
+					hasForceThreshold, staticOrKinematicBody, contactDesc.restDistance, frictionDataPtr, contactDesc.maxCCDSeparation);
+			}
+			//KS - set to 0 so we have a counter for the number of times we solved the constraint
+			//only going to be used on SPU but might as well set on all platforms because this code is shared
+			*(reinterpret_cast<PxU32*>(solverConstraint + solverConstraintByteSize)) = 0;	// lands in the 16 spare bytes reserved by reserveBlockStreams
+		}
+	}
+
+	return successfulReserve;
+}
+
+
+
+bool createFinalizeSolverContacts(PxSolverContactDesc& contactDesc,
+								  PxsContactManagerOutput& output,
+								 ThreadContext& threadContext,
+								 const PxReal invDtF32,
+								 PxReal bounceThresholdF32,
+								 PxReal frictionOffsetThreshold,
+								 PxReal correlationDistance,
+								 PxConstraintAllocator& constraintAllocator)
+{
+	ContactBuffer& buffer = threadContext.mContactBuffer;
+	// Extracts the pair's contacts from 'output' into the thread's contact buffer, records them on
+	// 'contactDesc', then forwards to the descriptor-based overload using the thread's correlation buffer.
+
+	buffer.count = 0;
+
+	// We pull the friction patches out of the cache to remove the dependency on how
+	// the cache is organized. Remember original addrs so we can write them back 
+	// efficiently.
+
+	PxU32 numContacts = 0;
+	{
+		PxReal invMassScale0 = 1.f;
+		PxReal invMassScale1 = 1.f;
+		PxReal invInertiaScale0 = 1.f;
+		PxReal invInertiaScale1 = 1.f;
+
+		bool hasMaxImpulse = false, hasTargetVelocity = false;
+		// NOTE(review): extractContacts presumably fills 'buffer' and sets the flags and scales passed by reference -- confirm in its definition.
+		numContacts = extractContacts(buffer, output, hasMaxImpulse, hasTargetVelocity, invMassScale0, invMassScale1,
+			invInertiaScale0, invInertiaScale1, PxMin(contactDesc.data0->maxContactImpulse, contactDesc.data1->maxContactImpulse));
+		// Publish the extracted contacts on the descriptor and fold the extracted scales into its mass scales.
+		contactDesc.contacts = buffer.contacts;
+		contactDesc.numContacts = numContacts;
+		contactDesc.disableStrongFriction = contactDesc.disableStrongFriction || hasTargetVelocity;	// a target velocity also switches strong friction off
+		contactDesc.hasMaxImpulse = hasMaxImpulse;
+		contactDesc.mInvMassScales.linear0 *= invMassScale0;
+		contactDesc.mInvMassScales.linear1 *= invMassScale1;
+		contactDesc.mInvMassScales.angular0 *= invInertiaScale0;
+		contactDesc.mInvMassScales.angular1 *= invInertiaScale1;
+	}
+	
+	CorrelationBuffer& c = threadContext.mCorrelationBuffer;
+
+	return createFinalizeSolverContacts(contactDesc, c,	invDtF32, bounceThresholdF32, frictionOffsetThreshold, correlationDistance, constraintAllocator);
+}
+  
+PxU32 getContactManagerConstraintDesc(const PxsContactManagerOutput& cmOutput, const PxsContactManager& /*cm*/, PxSolverConstraintDesc& desc)
+{	// Copies the contact count and the contact-force pointer from 'cmOutput' into 'desc' and returns the contact count; 'cm' is unused.
+	desc.writeBackLengthOver4 = cmOutput.nbContacts;	// one writeback entry per contact
+	desc.writeBack = cmOutput.contactForces;
+	return cmOutput.nbContacts;// cm.getWorkUnit().axisConstraintCount;
+}
+
+}
+
+}
+
diff --git a/PhysX_3.4/Source/LowLevelDynamics/src/DyContactPrep.h b/PhysX_3.4/Source/LowLevelDynamics/src/DyContactPrep.h
new file mode 100644
index 00000000..2e4a7ba2
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelDynamics/src/DyContactPrep.h
@@ -0,0 +1,168 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+#ifndef DY_CONTACTPREP_H
+#define DY_CONTACTPREP_H
+
+#include "DySolverConstraintDesc.h"
+#include "PxSceneDesc.h"
+#include "DySolverContact4.h"
+
+
+namespace physx
+{
+
+struct PxcNpWorkUnit;
+class PxsConstraintBlockManager;
+struct PxsContactManagerOutput;
+struct PxSolverBody;
+struct PxSolverBodyData;
+struct PxSolverConstraintDesc;
+
+namespace Dy
+{
+	class ThreadContext;
+	struct CorrelationBuffer;
+// Parameter list of PxcCreateFinalizeSolverContactMethod (typedef below); the single-pair createFinalize* declarations in this header spell out the same eight parameters.
+#define CREATE_FINALIZE_SOLVER_CONTACT_METHOD_ARGS			\
+	PxSolverContactDesc& contactDesc,						\
+	PxsContactManagerOutput& output,						\
+	ThreadContext& threadContext,							\
+	const PxReal invDtF32,									\
+	PxReal bounceThresholdF32,								\
+	PxReal frictionOffsetThreshold,							\
+	PxReal	correlationDistance,							\
+	PxConstraintAllocator& constraintAllocator				
+// Parameter list of PxcCreateFinalizeSolverContactMethod4 (typedef below). "SOVLER" is a misspelling of "SOLVER"; the name is left unchanged because the typedef below expands it.
+#define CREATE_FINALIZE_SOVLER_CONTACT_METHOD_ARGS_4									\
+								 PxsContactManagerOutput** outputs,						\
+								 ThreadContext& threadContext,							\
+								 PxSolverContactDesc* blockDescs,						\
+								 const PxReal invDtF32,									\
+								 PxReal bounceThresholdF32,								\
+								 PxReal	frictionThresholdF32,							\
+								 PxReal	correlationDistanceF32,							\
+								 PxConstraintAllocator& constraintAllocator				
+
+	
+/*!
+Function-pointer types for the "create and finalize solver contacts" routines, and the tables that hold them.
+*/
+// Single-pair routine: takes CREATE_FINALIZE_SOLVER_CONTACT_METHOD_ARGS and returns bool.
+typedef	bool (*PxcCreateFinalizeSolverContactMethod)(CREATE_FINALIZE_SOLVER_CONTACT_METHOD_ARGS);
+// Table of three single-pair routines; its definition is not in this header.
+extern PxcCreateFinalizeSolverContactMethod createFinalizeMethods[3];
+// Batched routine: takes the outputs/blockDescs argument list and returns a SolverConstraintPrepState::Enum.
+typedef	SolverConstraintPrepState::Enum (*PxcCreateFinalizeSolverContactMethod4)(CREATE_FINALIZE_SOVLER_CONTACT_METHOD_ARGS_4);
+// Table of three batched routines; defined in DyContactPrep4.cpp.
+extern PxcCreateFinalizeSolverContactMethod4 createFinalizeMethods4[3];
+
+// Single-pair contact prep driven by the narrow-phase output. NOTE(review): presumably gathers contacts from 'output' via threadContext and forwards to the CorrelationBuffer overload below - confirm in the .cpp.
+bool createFinalizeSolverContacts(	PxSolverContactDesc& contactDesc,
+									PxsContactManagerOutput& output,
+									ThreadContext& threadContext,
+									const PxReal invDtF32,
+									PxReal bounceThresholdF32,
+									PxReal frictionOffsetThreshold,
+									PxReal correlationDistance,
+									PxConstraintAllocator& constraintAllocator);
+// Single-pair contact prep using a caller-supplied correlation buffer 'c'; takes no PxsContactManagerOutput or ThreadContext.
+bool createFinalizeSolverContacts(	PxSolverContactDesc& contactDesc,
+									CorrelationBuffer& c,
+									const PxReal invDtF32,
+									PxReal bounceThresholdF32,
+									PxReal frictionOffsetThreshold,
+									PxReal correlationDistance,
+									PxConstraintAllocator& constraintAllocator);
+// Batched contact prep over 'outputs' and 'blockDescs'; matches PxcCreateFinalizeSolverContactMethod4 and is entry [0] of createFinalizeMethods4.
+SolverConstraintPrepState::Enum createFinalizeSolverContacts4(	PxsContactManagerOutput** outputs,
+																 ThreadContext& threadContext,
+																 PxSolverContactDesc* blockDescs,
+																 const PxReal invDtF32,
+																 PxReal bounceThresholdF32,
+																 PxReal frictionOffsetThreshold,
+																 PxReal correlationDistance,
+																 PxConstraintAllocator& constraintAllocator);
+// Batched contact prep using a caller-supplied correlation buffer 'c'; takes no outputs array or ThreadContext.
+SolverConstraintPrepState::Enum createFinalizeSolverContacts4(	Dy::CorrelationBuffer& c,
+																PxSolverContactDesc* blockDescs,
+																const PxReal invDtF32,
+																PxReal bounceThresholdF32,
+																PxReal	frictionOffsetThreshold,
+																PxReal correlationDistance,
+																PxConstraintAllocator& constraintAllocator);
+
+
+// Single-pair "Coulomb1D" contact prep; same parameters as CREATE_FINALIZE_SOLVER_CONTACT_METHOD_ARGS. NOTE(review): presumably an entry of createFinalizeMethods - confirm.
+bool createFinalizeSolverContactsCoulomb1D(PxSolverContactDesc& contactDesc,
+											 PxsContactManagerOutput& output,
+											 ThreadContext& threadContext,
+											 const PxReal invDtF32,
+											 PxReal bounceThresholdF32,
+											 PxReal frictionOffsetThreshold,
+											 PxReal correlationDistance,
+											 PxConstraintAllocator& constraintAllocator);
+// Single-pair "Coulomb2D" contact prep; same parameters as CREATE_FINALIZE_SOLVER_CONTACT_METHOD_ARGS. NOTE(review): presumably an entry of createFinalizeMethods - confirm.
+bool createFinalizeSolverContactsCoulomb2D(PxSolverContactDesc& contactDesc,
+											PxsContactManagerOutput& output,
+											ThreadContext& threadContext,
+											const PxReal invDtF32,
+											PxReal bounceThresholdF32,
+											PxReal frictionOffsetThreshold,
+											PxReal correlationDistance,
+											PxConstraintAllocator& constraintAllocator);
+
+// Batched "Coulomb1D" contact prep; matches PxcCreateFinalizeSolverContactMethod4 and is entry [1] of createFinalizeMethods4.
+SolverConstraintPrepState::Enum createFinalizeSolverContacts4Coulomb1D(	PxsContactManagerOutput** outputs,
+																		ThreadContext& threadContext,
+																		 PxSolverContactDesc* blockDescs,
+																		 const PxReal invDtF32,
+																		 PxReal bounceThresholdF32,
+																		 PxReal frictionOffsetThreshold,
+																		 PxReal correlationDistance,
+																		 PxConstraintAllocator& constraintAllocator);
+// Batched "Coulomb2D" contact prep; matches PxcCreateFinalizeSolverContactMethod4 and is entry [2] of createFinalizeMethods4.
+SolverConstraintPrepState::Enum createFinalizeSolverContacts4Coulomb2D(PxsContactManagerOutput** outputs,
+																		ThreadContext& threadContext,
+																		PxSolverContactDesc* blockDescs,
+																		const PxReal invDtF32,
+																		PxReal bounceThresholdF32,
+																		PxReal frictionOffsetThreshold,
+																		PxReal correlationDistance,
+																		PxConstraintAllocator& constraintAllocator);
+// Fills desc's write-back fields from cmOutput and returns cmOutput.nbContacts; the definition leaves 'cm' unused.
+// NOTE(review): PxsContactManager has no forward declaration in this header - it presumably arrives via the includes above or include order; confirm.
+PxU32 getContactManagerConstraintDesc(const PxsContactManagerOutput& cmOutput, const PxsContactManager& cm, PxSolverConstraintDesc& desc);
+
+}
+
+}
+
+#endif //DY_CONTACTPREP_H
diff --git a/PhysX_3.4/Source/LowLevelDynamics/src/DyContactPrep4.cpp b/PhysX_3.4/Source/LowLevelDynamics/src/DyContactPrep4.cpp
new file mode 100644
index 00000000..5bbf9637
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelDynamics/src/DyContactPrep4.cpp
@@ -0,0 +1,1478 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+     
+#include "foundation/PxPreprocessor.h"
+#include "PxSceneDesc.h"
+#include "PsVecMath.h"
+#include "PsMathUtils.h"
+#include "DySolverContact.h"
+#include "DySolverContact4.h"
+#include "DySolverConstraintTypes.h"
+#include "PxcNpWorkUnit.h"
+#include "DyThreadContext.h"
+#include "DyContactPrep.h"
+#include "PxcNpContactPrepShared.h"
+#include "PxvDynamics.h"
+#include "DyCorrelationBuffer.h"
+#include "DyDynamics.h"
+#include "DyArticulationContactPrep.h"
+#include "PxsContactManager.h"
+
+#include "PsFoundation.h"
+
+using namespace physx;
+using namespace Gu;
+
+
+#include "PsVecMath.h"
+#include "PxContactModifyCallback.h"
+#include "PxsMaterialManager.h"
+#include "PxsMaterialCombiner.h"
+#include "DyContactPrepShared.h"
+
+using namespace Ps::aos;
+
+namespace physx
+{
+namespace Dy
+{
+// Table of the batched contact-prep routines declared in DyContactPrep.h. NOTE(review): presumably indexed by the scene's friction model - confirm at the call site.
+PxcCreateFinalizeSolverContactMethod4 createFinalizeMethods4[3] = 
+{
+	createFinalizeSolverContacts4,	// [0]
+	createFinalizeSolverContacts4Coulomb1D,	// [1]
+	createFinalizeSolverContacts4Coulomb2D	// [2]
+};
+// Sanity-check helper for the PX_ASSERT calls in this file: copies v into a PxVec4 and returns that vector's isFinite() result.
+inline bool ValidateVec4(const Vec4V v)
+{
+	PX_ALIGN(16, PxVec4 lanes);	// 16-byte aligned scratch copy for the V4StoreA store below
+	Ps::aos::V4StoreA(v, &lanes.x);
+	return lanes.isFinite();
+}
+
+static void setupFinalizeSolverConstraints4(PxSolverContactDesc* PX_RESTRICT descs, CorrelationBuffer& c, PxU8* PX_RESTRICT workspace,
+											const PxReal invDtF32, PxReal bounceThresholdF32,
+											const Ps::aos::Vec4VArg invMassScale0, const Ps::aos::Vec4VArg invInertiaScale0, 
+											const Ps::aos::Vec4VArg invMassScale1, const Ps::aos::Vec4VArg invInertiaScale1)
+{
+
+	//OK, we have a workspace of pre-allocated space to store all 4 descs in. We now need to create the constraints in it
+
+	const Vec4V ccdMaxSeparation = Ps::aos::V4LoadXYZW(descs[0].maxCCDSeparation, descs[1].maxCCDSeparation, descs[2].maxCCDSeparation, descs[3].maxCCDSeparation);
+
+	const Vec4V zero = V4Zero();
+	const BoolV bFalse = BFFFF();
+	const FloatV fZero = FZero();
+
+	PxU8 flags[4] = {	PxU8(descs[0].hasForceThresholds ? SolverContactHeader::eHAS_FORCE_THRESHOLDS : 0),
+						PxU8(descs[1].hasForceThresholds ? SolverContactHeader::eHAS_FORCE_THRESHOLDS : 0),
+						PxU8(descs[2].hasForceThresholds ? SolverContactHeader::eHAS_FORCE_THRESHOLDS : 0),
+						PxU8(descs[3].hasForceThresholds ? SolverContactHeader::eHAS_FORCE_THRESHOLDS : 0) };
+
+	bool hasMaxImpulse = descs[0].hasMaxImpulse || descs[1].hasMaxImpulse || descs[2].hasMaxImpulse || descs[3].hasMaxImpulse;
+
+	//The block is dynamic if **any** of the constraints have a non-static body B. This allows us to batch static and non-static constraints but we only get a memory/perf
+	//saving if all 4 are static. This simplifies the constraint partitioning such that it only needs to care about separating contacts and 1D constraints (which it already does)
+	bool isDynamic = false;
+	bool hasKinematic = false;
+	for(PxU32 a = 0; a < 4; ++a)
+	{
+		isDynamic = isDynamic || (descs[a].bodyState1 == PxSolverContactDesc::eDYNAMIC_BODY);
+		hasKinematic = hasKinematic || descs[a].bodyState1 == PxSolverContactDesc::eKINEMATIC_BODY;
+	}
+	
+	const PxU32 constraintSize = isDynamic ? sizeof(SolverContactBatchPointDynamic4) : sizeof(SolverContactBatchPointBase4);
+	const PxU32 frictionSize = isDynamic ? sizeof(SolverContactFrictionDynamic4) : sizeof(SolverContactFrictionBase4);
+
+	PxU8* PX_RESTRICT ptr = workspace;
+
+	const Vec4V dom0 = invMassScale0;
+	const Vec4V dom1 = invMassScale1;
+	const Vec4V angDom0 = invInertiaScale0;
+	const Vec4V angDom1 = invInertiaScale1;
+
+	const Vec4V maxPenBias = V4Max(V4LoadXYZW(descs[0].data0->penBiasClamp, descs[1].data0->penBiasClamp, 
+		descs[2].data0->penBiasClamp, descs[3].data0->penBiasClamp), 
+		V4LoadXYZW(descs[0].data1->penBiasClamp, descs[1].data1->penBiasClamp, 
+		descs[2].data1->penBiasClamp, descs[3].data1->penBiasClamp));
+
+	const Vec4V restDistance = V4LoadXYZW(descs[0].restDistance, descs[1].restDistance, descs[2].restDistance,
+		descs[3].restDistance); 
+
+
+	//load up velocities
+	Vec4V linVel00 = V4LoadA(&descs[0].data0->linearVelocity.x);
+	Vec4V linVel10 = V4LoadA(&descs[1].data0->linearVelocity.x);
+	Vec4V linVel20 = V4LoadA(&descs[2].data0->linearVelocity.x);
+	Vec4V linVel30 = V4LoadA(&descs[3].data0->linearVelocity.x);
+
+	Vec4V linVel01 = V4LoadA(&descs[0].data1->linearVelocity.x);
+	Vec4V linVel11 = V4LoadA(&descs[1].data1->linearVelocity.x);
+	Vec4V linVel21 = V4LoadA(&descs[2].data1->linearVelocity.x);
+	Vec4V linVel31 = V4LoadA(&descs[3].data1->linearVelocity.x);
+
+	Vec4V angVel00 = V4LoadA(&descs[0].data0->angularVelocity.x);
+	Vec4V angVel10 = V4LoadA(&descs[1].data0->angularVelocity.x);
+	Vec4V angVel20 = V4LoadA(&descs[2].data0->angularVelocity.x);
+	Vec4V angVel30 = V4LoadA(&descs[3].data0->angularVelocity.x);
+
+	Vec4V angVel01 = V4LoadA(&descs[0].data1->angularVelocity.x);
+	Vec4V angVel11 = V4LoadA(&descs[1].data1->angularVelocity.x);
+	Vec4V angVel21 = V4LoadA(&descs[2].data1->angularVelocity.x);
+	Vec4V angVel31 = V4LoadA(&descs[3].data1->angularVelocity.x);
+
+	Vec4V linVelT00, linVelT10, linVelT20;
+	Vec4V linVelT01, linVelT11, linVelT21;
+	Vec4V angVelT00, angVelT10, angVelT20;
+	Vec4V angVelT01, angVelT11, angVelT21;
+
+	PX_TRANSPOSE_44_34(linVel00, linVel10, linVel20, linVel30, linVelT00, linVelT10, linVelT20);
+	PX_TRANSPOSE_44_34(linVel01, linVel11, linVel21, linVel31, linVelT01, linVelT11, linVelT21);
+	PX_TRANSPOSE_44_34(angVel00, angVel10, angVel20, angVel30, angVelT00, angVelT10, angVelT20);
+	PX_TRANSPOSE_44_34(angVel01, angVel11, angVel21, angVel31, angVelT01, angVelT11, angVelT21);
+
+	const Vec4V vrelX = V4Sub(linVelT00, linVelT01);
+	const Vec4V vrelY = V4Sub(linVelT10, linVelT11);
+	const Vec4V vrelZ = V4Sub(linVelT20, linVelT21);
+
+	//Load up masses and invInertia
+
+	/*const Vec4V sqrtInvMass0 = V4Merge(FLoad(descs[0].data0->sqrtInvMass), FLoad(descs[1].data0->sqrtInvMass), FLoad(descs[2].data0->sqrtInvMass),
+		FLoad(descs[3].data0->sqrtInvMass));
+
+	const Vec4V sqrtInvMass1 = V4Merge(FLoad(descs[0].data1->sqrtInvMass), FLoad(descs[1].data1->sqrtInvMass), FLoad(descs[2].data1->sqrtInvMass),
+		FLoad(descs[3].data1->sqrtInvMass));*/
+
+	const Vec4V invMass0 = V4LoadXYZW(descs[0].data0->invMass, descs[1].data0->invMass, descs[2].data0->invMass, descs[3].data0->invMass);
+	const Vec4V invMass1 = V4LoadXYZW(descs[0].data1->invMass, descs[1].data1->invMass, descs[2].data1->invMass, descs[3].data1->invMass);
+
+	const Vec4V invMass0D0 = V4Mul(dom0, invMass0);
+	const Vec4V invMass1D1 = V4Mul(dom1, invMass1);
+
+	Vec4V invInertia00X = Vec4V_From_Vec3V(V3LoadU_SafeReadW(descs[0].data0->sqrtInvInertia.column0));	// PT: safe because 'column1' follows 'column0' in PxMat33
+	Vec4V invInertia00Y = Vec4V_From_Vec3V(V3LoadU_SafeReadW(descs[0].data0->sqrtInvInertia.column1));	// PT: safe because 'column2' follows 'column1' in PxMat33
+	Vec4V invInertia00Z = Vec4V_From_Vec3V(V3LoadU(descs[0].data0->sqrtInvInertia.column2));
+
+	Vec4V invInertia10X = Vec4V_From_Vec3V(V3LoadU_SafeReadW(descs[1].data0->sqrtInvInertia.column0));	// PT: safe because 'column1' follows 'column0' in PxMat33
+	Vec4V invInertia10Y = Vec4V_From_Vec3V(V3LoadU_SafeReadW(descs[1].data0->sqrtInvInertia.column1));	// PT: safe because 'column2' follows 'column1' in PxMat33
+	Vec4V invInertia10Z = Vec4V_From_Vec3V(V3LoadU(descs[1].data0->sqrtInvInertia.column2));
+
+	Vec4V invInertia20X = Vec4V_From_Vec3V(V3LoadU_SafeReadW(descs[2].data0->sqrtInvInertia.column0));	// PT: safe because 'column1' follows 'column0' in PxMat33
+	Vec4V invInertia20Y = Vec4V_From_Vec3V(V3LoadU_SafeReadW(descs[2].data0->sqrtInvInertia.column1));	// PT: safe because 'column2' follows 'column1' in PxMat33
+	Vec4V invInertia20Z = Vec4V_From_Vec3V(V3LoadU(descs[2].data0->sqrtInvInertia.column2));
+
+	Vec4V invInertia30X = Vec4V_From_Vec3V(V3LoadU_SafeReadW(descs[3].data0->sqrtInvInertia.column0));	// PT: safe because 'column1' follows 'column0' in PxMat33
+	Vec4V invInertia30Y = Vec4V_From_Vec3V(V3LoadU_SafeReadW(descs[3].data0->sqrtInvInertia.column1));	// PT: safe because 'column2' follows 'column1' in PxMat33
+	Vec4V invInertia30Z = Vec4V_From_Vec3V(V3LoadU(descs[3].data0->sqrtInvInertia.column2));
+
+	Vec4V invInertia01X = Vec4V_From_Vec3V(V3LoadU_SafeReadW(descs[0].data1->sqrtInvInertia.column0));	// PT: safe because 'column1' follows 'column0' in PxMat33
+	Vec4V invInertia01Y = Vec4V_From_Vec3V(V3LoadU_SafeReadW(descs[0].data1->sqrtInvInertia.column1));	// PT: safe because 'column2' follows 'column1' in PxMat33
+	Vec4V invInertia01Z = Vec4V_From_Vec3V(V3LoadU(descs[0].data1->sqrtInvInertia.column2));
+
+	Vec4V invInertia11X = Vec4V_From_Vec3V(V3LoadU_SafeReadW(descs[1].data1->sqrtInvInertia.column0));	// PT: safe because 'column1' follows 'column0' in PxMat33
+	Vec4V invInertia11Y = Vec4V_From_Vec3V(V3LoadU_SafeReadW(descs[1].data1->sqrtInvInertia.column1));	// PT: safe because 'column2' follows 'column1' in PxMat33
+	Vec4V invInertia11Z = Vec4V_From_Vec3V(V3LoadU(descs[1].data1->sqrtInvInertia.column2));
+
+	Vec4V invInertia21X = Vec4V_From_Vec3V(V3LoadU_SafeReadW(descs[2].data1->sqrtInvInertia.column0));	// PT: safe because 'column1' follows 'column0' in PxMat33
+	Vec4V invInertia21Y = Vec4V_From_Vec3V(V3LoadU_SafeReadW(descs[2].data1->sqrtInvInertia.column1));	// PT: safe because 'column2' follows 'column1' in PxMat33
+	Vec4V invInertia21Z = Vec4V_From_Vec3V(V3LoadU(descs[2].data1->sqrtInvInertia.column2));
+
+	Vec4V invInertia31X = Vec4V_From_Vec3V(V3LoadU_SafeReadW(descs[3].data1->sqrtInvInertia.column0));	// PT: safe because 'column1' follows 'column0' in PxMat33
+	Vec4V invInertia31Y = Vec4V_From_Vec3V(V3LoadU_SafeReadW(descs[3].data1->sqrtInvInertia.column1));	// PT: safe because 'column2' follows 'column1' in PxMat33
+	Vec4V invInertia31Z = Vec4V_From_Vec3V(V3LoadU(descs[3].data1->sqrtInvInertia.column2));
+
+	Vec4V invInertia0X0, invInertia0X1, invInertia0X2;
+	Vec4V invInertia0Y0, invInertia0Y1, invInertia0Y2;
+	Vec4V invInertia0Z0, invInertia0Z1, invInertia0Z2;
+
+	Vec4V invInertia1X0, invInertia1X1, invInertia1X2;
+	Vec4V invInertia1Y0, invInertia1Y1, invInertia1Y2;
+	Vec4V invInertia1Z0, invInertia1Z1, invInertia1Z2;
+
+	PX_TRANSPOSE_44_34(invInertia00X, invInertia10X, invInertia20X, invInertia30X, invInertia0X0, invInertia0Y0, invInertia0Z0);
+	PX_TRANSPOSE_44_34(invInertia00Y, invInertia10Y, invInertia20Y, invInertia30Y, invInertia0X1, invInertia0Y1, invInertia0Z1);
+	PX_TRANSPOSE_44_34(invInertia00Z, invInertia10Z, invInertia20Z, invInertia30Z, invInertia0X2, invInertia0Y2, invInertia0Z2);
+
+	PX_TRANSPOSE_44_34(invInertia01X, invInertia11X, invInertia21X, invInertia31X, invInertia1X0, invInertia1Y0, invInertia1Z0);
+	PX_TRANSPOSE_44_34(invInertia01Y, invInertia11Y, invInertia21Y, invInertia31Y, invInertia1X1, invInertia1Y1, invInertia1Z1);
+	PX_TRANSPOSE_44_34(invInertia01Z, invInertia11Z, invInertia21Z, invInertia31Z, invInertia1X2, invInertia1Y2, invInertia1Z2);
+
+
+	const FloatV invDt = FLoad(invDtF32);
+	const FloatV p8 = FLoad(0.8f);
+	const Vec4V p84 = V4Splat(p8);
+	const Vec4V bounceThreshold = V4Splat(FLoad(bounceThresholdF32));
+
+	const FloatV invDtp8 = FMul(invDt, p8);
+
+	const Vec3V bodyFrame00p = V3LoadU(descs[0].bodyFrame0.p);
+	const Vec3V bodyFrame01p = V3LoadU(descs[1].bodyFrame0.p);
+	const Vec3V bodyFrame02p = V3LoadU(descs[2].bodyFrame0.p);
+	const Vec3V bodyFrame03p = V3LoadU(descs[3].bodyFrame0.p);
+
+	Vec4V bodyFrame00p4 = Vec4V_From_Vec3V(bodyFrame00p);
+	Vec4V bodyFrame01p4 = Vec4V_From_Vec3V(bodyFrame01p);
+	Vec4V bodyFrame02p4 = Vec4V_From_Vec3V(bodyFrame02p);
+	Vec4V bodyFrame03p4 = Vec4V_From_Vec3V(bodyFrame03p);
+
+	Vec4V bodyFrame0pX, bodyFrame0pY, bodyFrame0pZ;
+	PX_TRANSPOSE_44_34(bodyFrame00p4, bodyFrame01p4, bodyFrame02p4, bodyFrame03p4, bodyFrame0pX, bodyFrame0pY, bodyFrame0pZ);
+
+	
+	const Vec3V bodyFrame10p = V3LoadU(descs[0].bodyFrame1.p);
+	const Vec3V bodyFrame11p = V3LoadU(descs[1].bodyFrame1.p);
+	const Vec3V bodyFrame12p = V3LoadU(descs[2].bodyFrame1.p);
+	const Vec3V bodyFrame13p = V3LoadU(descs[3].bodyFrame1.p);
+
+	Vec4V bodyFrame10p4 = Vec4V_From_Vec3V(bodyFrame10p);
+	Vec4V bodyFrame11p4 = Vec4V_From_Vec3V(bodyFrame11p);
+	Vec4V bodyFrame12p4 = Vec4V_From_Vec3V(bodyFrame12p);
+	Vec4V bodyFrame13p4 = Vec4V_From_Vec3V(bodyFrame13p);
+
+	Vec4V bodyFrame1pX, bodyFrame1pY, bodyFrame1pZ;
+	PX_TRANSPOSE_44_34(bodyFrame10p4, bodyFrame11p4, bodyFrame12p4, bodyFrame13p4, bodyFrame1pX, bodyFrame1pY, bodyFrame1pZ);
+
+
+	const QuatV bodyFrame00q = QuatVLoadU(&descs[0].bodyFrame0.q.x);	
+	const QuatV bodyFrame01q = QuatVLoadU(&descs[1].bodyFrame0.q.x);
+	const QuatV bodyFrame02q = QuatVLoadU(&descs[2].bodyFrame0.q.x);
+	const QuatV bodyFrame03q = QuatVLoadU(&descs[3].bodyFrame0.q.x);
+
+	const QuatV bodyFrame10q = QuatVLoadU(&descs[0].bodyFrame1.q.x);	
+	const QuatV bodyFrame11q = QuatVLoadU(&descs[1].bodyFrame1.q.x);
+	const QuatV bodyFrame12q = QuatVLoadU(&descs[2].bodyFrame1.q.x);	
+	const QuatV bodyFrame13q = QuatVLoadU(&descs[3].bodyFrame1.q.x);
+
+	PxU32 frictionPatchWritebackAddrIndex0 = 0;
+	PxU32 frictionPatchWritebackAddrIndex1 = 0;
+	PxU32 frictionPatchWritebackAddrIndex2 = 0;
+	PxU32 frictionPatchWritebackAddrIndex3 = 0;
+
+	Ps::prefetchLine(c.contactID);
+	Ps::prefetchLine(c.contactID, 128);
+
+	PxU32 frictionIndex0 = 0, frictionIndex1 = 0, frictionIndex2 = 0, frictionIndex3 = 0;
+	//PxU32 contactIndex0 = 0, contactIndex1 = 0, contactIndex2 = 0, contactIndex3 = 0;
+
+
+	//OK, we iterate through all friction patch counts in the constraint patch, building up the constraint list etc.
+
+	PxU32 maxPatches = PxMax(descs[0].numFrictionPatches, PxMax(descs[1].numFrictionPatches, PxMax(descs[2].numFrictionPatches, descs[3].numFrictionPatches)));
+
+	const Vec4V p1 = V4Splat(FLoad(0.1f));
+	const Vec4V orthoThreshold = V4Splat(FLoad(0.70710678f));
+
+	
+	PxU32 contact0 = 0, contact1 = 0, contact2 = 0, contact3 = 0;
+	PxU32 patch0 = 0, patch1 = 0, patch2 = 0, patch3 = 0;
+
+	PxU8 flag = 0;
+	if(hasMaxImpulse)
+		flag |= SolverContactHeader4::eHAS_MAX_IMPULSE;
+
+	for(PxU32 i=0;i<maxPatches;i++)
+	{
+		const bool hasFinished0 = i >= descs[0].numFrictionPatches;
+		const bool hasFinished1 = i >= descs[1].numFrictionPatches;
+		const bool hasFinished2 = i >= descs[2].numFrictionPatches;
+		const bool hasFinished3 = i >= descs[3].numFrictionPatches;
+
+
+		frictionIndex0 = hasFinished0 ? frictionIndex0 : descs[0].startFrictionPatchIndex + i;
+		frictionIndex1 = hasFinished1 ? frictionIndex1 : descs[1].startFrictionPatchIndex + i;
+		frictionIndex2 = hasFinished2 ? frictionIndex2 : descs[2].startFrictionPatchIndex + i;
+		frictionIndex3 = hasFinished3 ? frictionIndex3 : descs[3].startFrictionPatchIndex + i;
+
+		PxU32 clampedContacts0 = hasFinished0 ? 0 : c.frictionPatchContactCounts[frictionIndex0];
+		PxU32 clampedContacts1 = hasFinished1 ? 0 : c.frictionPatchContactCounts[frictionIndex1];
+		PxU32 clampedContacts2 = hasFinished2 ? 0 : c.frictionPatchContactCounts[frictionIndex2];
+		PxU32 clampedContacts3 = hasFinished3 ? 0 : c.frictionPatchContactCounts[frictionIndex3];
+
+		PxU32 firstPatch0 = c.correlationListHeads[frictionIndex0];
+		PxU32 firstPatch1 = c.correlationListHeads[frictionIndex1];
+		PxU32 firstPatch2 = c.correlationListHeads[frictionIndex2];
+		PxU32 firstPatch3 = c.correlationListHeads[frictionIndex3];
+
+		const Gu::ContactPoint* contactBase0 = descs[0].contacts + c.contactPatches[firstPatch0].start;
+		const Gu::ContactPoint* contactBase1 = descs[1].contacts + c.contactPatches[firstPatch1].start;
+		const Gu::ContactPoint* contactBase2 = descs[2].contacts + c.contactPatches[firstPatch2].start;
+		const Gu::ContactPoint* contactBase3 = descs[3].contacts + c.contactPatches[firstPatch3].start;
+
+		const Vec4V restitution = V4Neg(V4LoadXYZW(contactBase0->restitution, contactBase1->restitution, contactBase2->restitution,
+			contactBase3->restitution));
+
+		SolverContactHeader4* PX_RESTRICT header = reinterpret_cast<SolverContactHeader4*>(ptr);
+		ptr += sizeof(SolverContactHeader4);	
+
+		
+		header->flags[0] = flags[0];
+		header->flags[1] = flags[1];
+		header->flags[2] = flags[2];
+		header->flags[3] = flags[3];
+
+		header->flag = flag;
+
+		PxU32 totalContacts = PxMax(clampedContacts0, PxMax(clampedContacts1, PxMax(clampedContacts2, clampedContacts3)));
+
+		Vec4V* PX_RESTRICT appliedNormalForces = reinterpret_cast<Vec4V*>(ptr);
+		ptr += sizeof(Vec4V)*totalContacts;
+
+		PxMemZero(appliedNormalForces, sizeof(Vec4V) * totalContacts);
+
+		header->numNormalConstr		= Ps::to8(totalContacts);
+		header->numNormalConstr0 = Ps::to8(clampedContacts0);
+		header->numNormalConstr1 = Ps::to8(clampedContacts1);
+		header->numNormalConstr2 = Ps::to8(clampedContacts2);
+		header->numNormalConstr3 = Ps::to8(clampedContacts3);
+		//header->sqrtInvMassA = sqrtInvMass0;
+		//header->sqrtInvMassB = sqrtInvMass1;
+		header->invMass0D0 = invMass0D0;
+		header->invMass1D1 = invMass1D1;
+		header->angDom0 = angDom0;
+		header->angDom1 = angDom1;
+		header->shapeInteraction[0] = descs[0].shapeInteraction; header->shapeInteraction[1] = descs[1].shapeInteraction; 
+		header->shapeInteraction[2] = descs[2].shapeInteraction; header->shapeInteraction[3] = descs[3].shapeInteraction;
+
+		Vec4V* maxImpulse = reinterpret_cast<Vec4V*>(ptr + constraintSize * totalContacts);
+
+		header->restitution = restitution;
+
+		Vec4V normal0 = V4LoadA(&contactBase0->normal.x);
+		Vec4V normal1 = V4LoadA(&contactBase1->normal.x);
+		Vec4V normal2 = V4LoadA(&contactBase2->normal.x);
+		Vec4V normal3 = V4LoadA(&contactBase3->normal.x);
+
+		Vec4V normalX, normalY, normalZ;
+		PX_TRANSPOSE_44_34(normal0, normal1, normal2, normal3, normalX, normalY, normalZ);
+
+		PX_ASSERT(ValidateVec4(normalX));
+		PX_ASSERT(ValidateVec4(normalY));
+		PX_ASSERT(ValidateVec4(normalZ));
+
+		header->normalX = normalX;
+		header->normalY = normalY;
+		header->normalZ = normalZ;
+
+		const Vec4V norVel0 = V4MulAdd(normalZ, linVelT20, V4MulAdd(normalY, linVelT10, V4Mul(normalX, linVelT00)));
+		const Vec4V norVel1 = V4MulAdd(normalZ, linVelT21, V4MulAdd(normalY, linVelT11, V4Mul(normalX, linVelT01)));
+		const Vec4V relNorVel  = V4Sub(norVel0, norVel1);
+
+		//For all correlation heads - need to pull this out I think
+
+		//OK, we have a counter for all our patches...
+		PxU32 finished = (PxU32(hasFinished0)) | 
+						 ((PxU32(hasFinished1)) << 1) | 
+						 ((PxU32(hasFinished2)) << 2) | 
+						 ((PxU32(hasFinished3)) << 3);
+
+		CorrelationListIterator iter0(c, firstPatch0);
+		CorrelationListIterator iter1(c, firstPatch1);
+		CorrelationListIterator iter2(c, firstPatch2);
+		CorrelationListIterator iter3(c, firstPatch3);
+
+		//PxU32 contact0, contact1, contact2, contact3;
+		//PxU32 patch0, patch1, patch2, patch3;
+
+		if(!hasFinished0)
+			iter0.nextContact(patch0, contact0);
+		if(!hasFinished1)
+			iter1.nextContact(patch1, contact1);
+		if(!hasFinished2)
+			iter2.nextContact(patch2, contact2);
+		if(!hasFinished3)
+			iter3.nextContact(patch3, contact3);
+
+		PxU8* p = ptr;
+
+		PxU32 contactCount = 0;
+		PxU32 newFinished = 
+			(PxU32(hasFinished0 || !iter0.hasNextContact()))		| 
+			((PxU32(hasFinished1 || !iter1.hasNextContact())) << 1) | 
+			((PxU32(hasFinished2 || !iter2.hasNextContact())) << 2) | 
+			((PxU32(hasFinished3 || !iter3.hasNextContact())) << 3);
+
+		while(finished != 0xf)
+		{
+			finished = newFinished;
+			++contactCount;
+			Ps::prefetchLine(p, 384);
+			Ps::prefetchLine(p, 512);
+			Ps::prefetchLine(p, 640);	
+
+			SolverContactBatchPointBase4* PX_RESTRICT solverContact = reinterpret_cast<SolverContactBatchPointBase4*>(p);
+			p += constraintSize;
+
+			const Gu::ContactPoint& con0 = descs[0].contacts[c.contactPatches[patch0].start + contact0];
+			const Gu::ContactPoint& con1 = descs[1].contacts[c.contactPatches[patch1].start + contact1];
+			const Gu::ContactPoint& con2 = descs[2].contacts[c.contactPatches[patch2].start + contact2];
+			const Gu::ContactPoint& con3 = descs[3].contacts[c.contactPatches[patch3].start + contact3];
+
+			//Now we need to splice these 4 contacts into a single structure
+
+			{
+				Vec4V point0 = V4LoadA(&con0.point.x);
+				Vec4V point1 = V4LoadA(&con1.point.x);
+				Vec4V point2 = V4LoadA(&con2.point.x);
+				Vec4V point3 = V4LoadA(&con3.point.x);
+
+				Vec4V pointX, pointY, pointZ;
+				PX_TRANSPOSE_44_34(point0, point1, point2, point3, pointX, pointY, pointZ);
+
+				PX_ASSERT(ValidateVec4(pointX));
+				PX_ASSERT(ValidateVec4(pointY));
+				PX_ASSERT(ValidateVec4(pointZ));
+
+				Vec4V cTargetVel0 = V4LoadA(&con0.targetVel.x);
+				Vec4V cTargetVel1 = V4LoadA(&con1.targetVel.x);
+				Vec4V cTargetVel2 = V4LoadA(&con2.targetVel.x);
+				Vec4V cTargetVel3 = V4LoadA(&con3.targetVel.x);
+
+				Vec4V cTargetVelX, cTargetVelY, cTargetVelZ;
+				PX_TRANSPOSE_44_34(cTargetVel0, cTargetVel1, cTargetVel2, cTargetVel3, cTargetVelX, cTargetVelY, cTargetVelZ);
+
+				const Vec4V separation = V4LoadXYZW(con0.separation, con1.separation, con2.separation, con3.separation);
+
+				const Vec4V cTargetNorVel = V4MulAdd(cTargetVelX, normalX, V4MulAdd(cTargetVelY, normalY, V4Mul(cTargetVelZ, normalZ)));
+
+				const Vec4V raX = V4Sub(pointX, bodyFrame0pX);
+				const Vec4V raY = V4Sub(pointY, bodyFrame0pY);
+				const Vec4V raZ = V4Sub(pointZ, bodyFrame0pZ);
+
+				const Vec4V rbX = V4Sub(pointX, bodyFrame1pX);
+				const Vec4V rbY = V4Sub(pointY, bodyFrame1pY);
+				const Vec4V rbZ = V4Sub(pointZ, bodyFrame1pZ);
+
+				PX_ASSERT(ValidateVec4(raX));
+				PX_ASSERT(ValidateVec4(raY));
+				PX_ASSERT(ValidateVec4(raZ));
+
+				PX_ASSERT(ValidateVec4(rbX));
+				PX_ASSERT(ValidateVec4(rbY));
+				PX_ASSERT(ValidateVec4(rbZ));
+
+
+				//raXn = cross(ra, normal) which = Vec3V( a.y*b.z-a.z*b.y, a.z*b.x-a.x*b.z, a.x*b.y-a.y*b.x);
+
+				const Vec4V raXnX = V4NegMulSub(raZ, normalY, V4Mul(raY, normalZ));
+				const Vec4V raXnY = V4NegMulSub(raX, normalZ, V4Mul(raZ, normalX));
+				const Vec4V raXnZ = V4NegMulSub(raY, normalX, V4Mul(raX, normalY));
+
+				Vec4V delAngVel0X = V4Mul(invInertia0X0, raXnX);
+				Vec4V delAngVel0Y = V4Mul(invInertia0X1, raXnX);
+				Vec4V delAngVel0Z = V4Mul(invInertia0X2, raXnX);
+
+				delAngVel0X = V4MulAdd(invInertia0Y0, raXnY, delAngVel0X);
+				delAngVel0Y = V4MulAdd(invInertia0Y1, raXnY, delAngVel0Y);
+				delAngVel0Z = V4MulAdd(invInertia0Y2, raXnY, delAngVel0Z);
+
+				delAngVel0X = V4MulAdd(invInertia0Z0, raXnZ, delAngVel0X);
+				delAngVel0Y = V4MulAdd(invInertia0Z1, raXnZ, delAngVel0Y);
+				delAngVel0Z = V4MulAdd(invInertia0Z2, raXnZ, delAngVel0Z);
+
+
+				PX_ASSERT(ValidateVec4(delAngVel0X));
+				PX_ASSERT(ValidateVec4(delAngVel0Y));
+				PX_ASSERT(ValidateVec4(delAngVel0Z));
+
+				const Vec4V dotDelAngVel0 = V4MulAdd(delAngVel0X, delAngVel0X, V4MulAdd(delAngVel0Y, delAngVel0Y, V4Mul(delAngVel0Z, delAngVel0Z)));
+				const Vec4V dotRaXnAngVel0 = V4MulAdd(raXnZ, angVelT20, V4MulAdd(raXnY, angVelT10, V4Mul(raXnX, angVelT00)));
+
+				Vec4V unitResponse = V4MulAdd(invMass0D0, angDom0, dotDelAngVel0);
+				Vec4V vrel = V4Add(relNorVel, dotRaXnAngVel0);
+
+
+				//The dynamic-only parts - need to if-statement these up. A branch here shouldn't cost us too much
+				if(isDynamic)
+				{
+					SolverContactBatchPointDynamic4* PX_RESTRICT dynamicContact = static_cast<SolverContactBatchPointDynamic4*>(solverContact);
+					const Vec4V rbXnX = V4NegMulSub(rbZ, normalY, V4Mul(rbY, normalZ));
+					const Vec4V rbXnY = V4NegMulSub(rbX, normalZ, V4Mul(rbZ, normalX));
+					const Vec4V rbXnZ = V4NegMulSub(rbY, normalX, V4Mul(rbX, normalY));
+
+					Vec4V delAngVel1X = V4Mul(invInertia1X0, rbXnX);
+					Vec4V delAngVel1Y = V4Mul(invInertia1X1, rbXnX);
+					Vec4V delAngVel1Z = V4Mul(invInertia1X2, rbXnX);
+
+					delAngVel1X = V4MulAdd(invInertia1Y0, rbXnY, delAngVel1X);
+					delAngVel1Y = V4MulAdd(invInertia1Y1, rbXnY, delAngVel1Y);
+					delAngVel1Z = V4MulAdd(invInertia1Y2, rbXnY, delAngVel1Z);
+
+					delAngVel1X = V4MulAdd(invInertia1Z0, rbXnZ, delAngVel1X);
+					delAngVel1Y = V4MulAdd(invInertia1Z1, rbXnZ, delAngVel1Y);
+					delAngVel1Z = V4MulAdd(invInertia1Z2, rbXnZ, delAngVel1Z);
+
+					PX_ASSERT(ValidateVec4(delAngVel1X));
+					PX_ASSERT(ValidateVec4(delAngVel1Y));
+					PX_ASSERT(ValidateVec4(delAngVel1Z));
+
+					const Vec4V dotDelAngVel1 = V4MulAdd(delAngVel1X, delAngVel1X, V4MulAdd(delAngVel1Y, delAngVel1Y, V4Mul(delAngVel1Z, delAngVel1Z)));
+					const Vec4V dotRbXnAngVel1 = V4MulAdd(rbXnZ, angVelT21, V4MulAdd(rbXnY, angVelT11, V4Mul(rbXnX, angVelT01)));
+					
+					const Vec4V resp1 = V4MulAdd(dotDelAngVel1, angDom1, invMass1D1);
+
+					unitResponse = V4Add(unitResponse, resp1);
+
+					vrel = V4Sub(vrel, dotRbXnAngVel1);
+
+					//These are for dynamic-only contacts.
+					dynamicContact->rbXnX = delAngVel1X;
+					dynamicContact->rbXnY = delAngVel1Y;
+					dynamicContact->rbXnZ = delAngVel1Z;
+
+				}
+				else if(hasKinematic)
+				{
+					const Vec4V rbXnX = V4NegMulSub(rbZ, normalY, V4Mul(rbY, normalZ));
+					const Vec4V rbXnY = V4NegMulSub(rbX, normalZ, V4Mul(rbZ, normalX));
+					const Vec4V rbXnZ = V4NegMulSub(rbY, normalX, V4Mul(rbX, normalY));
+
+					const Vec4V dotRbXnAngVel1 = V4MulAdd(rbXnZ, angVelT21, V4MulAdd(rbXnY, angVelT11, V4Mul(rbXnX, angVelT01)));
+
+					vrel = V4Sub(vrel, dotRbXnAngVel1);
+				}
+
+				const Vec4V velMultiplier = V4Sel(V4IsGrtr(unitResponse, zero), V4Recip(unitResponse), zero);
+
+				const Vec4V penetration = V4Sub(separation, restDistance);
+				const Vec4V penInvDtPt8 = V4Max(maxPenBias, V4Scale(penetration, invDtp8));
+				Vec4V scaledBias = V4Mul(penInvDtPt8, velMultiplier);
+
+				const Vec4V penetrationInvDt = V4Scale(penetration, invDt);
+
+				const BoolV isGreater2 = BAnd(BAnd(V4IsGrtr(zero, restitution), V4IsGrtr(bounceThreshold, vrel)),
+					V4IsGrtr(V4Neg(vrel), penetrationInvDt));
+
+				const BoolV ccdSeparationCondition = V4IsGrtrOrEq(ccdMaxSeparation, penetration);
+
+				scaledBias = V4Sel(BAnd(ccdSeparationCondition, isGreater2), zero, V4Neg(scaledBias));
+
+				const Vec4V targetVelocity = V4Sel(isGreater2, V4Mul(velMultiplier, V4Mul(vrel, restitution)), zero);
+
+				//Vec4V biasedErr = V4Sel(isGreater2, targetVelocity, scaledBias);
+				Vec4V biasedErr = V4Add(targetVelocity, scaledBias);
+
+				biasedErr = V4NegMulSub(V4Sub(vrel, cTargetNorVel), velMultiplier, biasedErr);
+
+				//These values are present for static and dynamic contacts			
+				solverContact->raXnX = delAngVel0X;
+				solverContact->raXnY = delAngVel0Y;
+				solverContact->raXnZ = delAngVel0Z;
+				solverContact->velMultiplier = velMultiplier;
+				solverContact->biasedErr = biasedErr;
+
+				//solverContact->scaledBias = V4Max(zero, scaledBias);
+				solverContact->scaledBias = V4Sel(isGreater2, scaledBias, V4Max(zero, scaledBias));
+
+				if(hasMaxImpulse)
+				{
+					maxImpulse[contactCount-1] = V4Merge(FLoad(con0.maxImpulse), FLoad(con1.maxImpulse), FLoad(con2.maxImpulse),
+						FLoad(con3.maxImpulse));					
+				}
+			}
+			if(!(finished & 0x1))
+			{
+				iter0.nextContact(patch0, contact0);
+				newFinished |= PxU32(!iter0.hasNextContact());
+			}
+
+			if(!(finished & 0x2))
+			{
+				iter1.nextContact(patch1, contact1);
+				newFinished |= (PxU32(!iter1.hasNextContact()) << 1);
+			}
+
+			if(!(finished & 0x4))
+			{
+				iter2.nextContact(patch2, contact2);
+				newFinished |= (PxU32(!iter2.hasNextContact()) << 2);
+			}
+
+			if(!(finished & 0x8))
+			{
+				iter3.nextContact(patch3, contact3);
+				newFinished |= (PxU32(!iter3.hasNextContact()) << 3);
+			}
+		}
+		ptr = p;
+		if(hasMaxImpulse)
+		{
+			ptr += sizeof(Vec4V) * totalContacts;
+		}
+
+		//OK...friction time :-)
+
+		Vec4V maxImpulseScale = V4One();
+		{
+			const Vec4V staticFriction = V4LoadXYZW(contactBase0->staticFriction, contactBase1->staticFriction,
+				contactBase2->staticFriction, contactBase3->staticFriction);
+
+			const Vec4V dynamicFriction = V4LoadXYZW(contactBase0->dynamicFriction, contactBase1->dynamicFriction,
+				contactBase2->dynamicFriction, contactBase3->dynamicFriction);			
+
+			PX_ASSERT(totalContacts == contactCount);
+			header->dynamicFriction = dynamicFriction;
+			header->staticFriction = staticFriction;
+
+			const FrictionPatch& frictionPatch0 = c.frictionPatches[frictionIndex0];
+			const FrictionPatch& frictionPatch1 = c.frictionPatches[frictionIndex1];
+			const FrictionPatch& frictionPatch2 = c.frictionPatches[frictionIndex2];
+			const FrictionPatch& frictionPatch3 = c.frictionPatches[frictionIndex3];
+
+			PxU32 anchorCount0 = frictionPatch0.anchorCount;
+			PxU32 anchorCount1 = frictionPatch1.anchorCount;
+			PxU32 anchorCount2 = frictionPatch2.anchorCount;
+			PxU32 anchorCount3 = frictionPatch3.anchorCount;
+
+			PxU32 clampedAnchorCount0 = hasFinished0 || (contactBase0->materialFlags & PxMaterialFlag::eDISABLE_FRICTION) ? 0 : anchorCount0;
+			PxU32 clampedAnchorCount1 = hasFinished1 || (contactBase1->materialFlags & PxMaterialFlag::eDISABLE_FRICTION) ? 0 : anchorCount1;
+			PxU32 clampedAnchorCount2 = hasFinished2 || (contactBase2->materialFlags & PxMaterialFlag::eDISABLE_FRICTION) ? 0 : anchorCount2;
+			PxU32 clampedAnchorCount3 = hasFinished3 || (contactBase3->materialFlags & PxMaterialFlag::eDISABLE_FRICTION) ? 0 : anchorCount3;
+			
+			const PxU32 maxAnchorCount = PxMax(clampedAnchorCount0, PxMax(clampedAnchorCount1, PxMax(clampedAnchorCount2, clampedAnchorCount3)));
+
+			//if(clampedAnchorCount0 != clampedAnchorCount1 || clampedAnchorCount0 != clampedAnchorCount2 || clampedAnchorCount0 != clampedAnchorCount3)
+			//	Ps::debugBreak();
+
+
+			//const bool haveFriction = maxAnchorCount != 0;
+			header->numFrictionConstr	= Ps::to8(maxAnchorCount*2);
+			header->numFrictionConstr0 = Ps::to8(clampedAnchorCount0*2);
+			header->numFrictionConstr1 = Ps::to8(clampedAnchorCount1*2);
+			header->numFrictionConstr2 = Ps::to8(clampedAnchorCount2*2);
+			header->numFrictionConstr3 = Ps::to8(clampedAnchorCount3*2);
+		
+			//KS - TODO - extend this if needed
+			header->type = Ps::to8(isDynamic ? DY_SC_TYPE_BLOCK_RB_CONTACT : DY_SC_TYPE_BLOCK_STATIC_RB_CONTACT);
+
+			if(maxAnchorCount)
+			{
+
+				//Allocate the shared friction data...
+
+				SolverFrictionSharedData4* PX_RESTRICT fd = reinterpret_cast<SolverFrictionSharedData4*>(ptr);
+				ptr += sizeof(SolverFrictionSharedData4);
+				PX_UNUSED(fd);
+
+				const BoolV cond =V4IsGrtr(orthoThreshold, V4Abs(normalX));
+
+				const Vec4V t0FallbackX = V4Sel(cond, zero, V4Neg(normalY));
+				const Vec4V t0FallbackY = V4Sel(cond, V4Neg(normalZ), normalX);
+				const Vec4V t0FallbackZ = V4Sel(cond, normalY, zero);
+
+				//const Vec4V dotNormalVrel = V4MulAdd(normalZ, vrelZ, V4MulAdd(normalY, vrelY, V4Mul(normalX, vrelX)));
+				const Vec4V vrelSubNorVelX = V4NegMulSub(normalX, relNorVel, vrelX);
+				const Vec4V vrelSubNorVelY = V4NegMulSub(normalY, relNorVel, vrelY);
+				const Vec4V vrelSubNorVelZ = V4NegMulSub(normalZ, relNorVel, vrelZ);
+
+				const Vec4V lenSqvrelSubNorVelZ = V4MulAdd(vrelSubNorVelX, vrelSubNorVelX, V4MulAdd(vrelSubNorVelY, vrelSubNorVelY, V4Mul(vrelSubNorVelZ, vrelSubNorVelZ)));
+
+				const BoolV bcon2 = V4IsGrtr(lenSqvrelSubNorVelZ, p1);
+
+				Vec4V t0X = V4Sel(bcon2, vrelSubNorVelX, t0FallbackX);
+				Vec4V t0Y = V4Sel(bcon2, vrelSubNorVelY, t0FallbackY);
+				Vec4V t0Z = V4Sel(bcon2, vrelSubNorVelZ, t0FallbackZ);
+
+
+				//Now normalize this...
+				const Vec4V recipLen = V4Rsqrt(V4MulAdd(t0Z, t0Z, V4MulAdd(t0Y, t0Y, V4Mul(t0X, t0X))));
+
+				t0X = V4Mul(t0X, recipLen);
+				t0Y = V4Mul(t0Y, recipLen);
+				t0Z = V4Mul(t0Z, recipLen);
+
+				Vec4V t1X = V4NegMulSub(normalZ, t0Y, V4Mul(normalY, t0Z));
+				Vec4V t1Y = V4NegMulSub(normalX, t0Z, V4Mul(normalZ, t0X));
+				Vec4V t1Z = V4NegMulSub(normalY, t0X, V4Mul(normalX, t0Y));
+
+				PX_ASSERT((uintptr_t(descs[0].frictionPtr) & 0xF) == 0);
+				PX_ASSERT((uintptr_t(descs[1].frictionPtr) & 0xF) == 0);
+				PX_ASSERT((uintptr_t(descs[2].frictionPtr) & 0xF) == 0);
+				PX_ASSERT((uintptr_t(descs[3].frictionPtr) & 0xF) == 0);
+
+
+				PxU8* PX_RESTRICT writeback0 = descs[0].frictionPtr + frictionPatchWritebackAddrIndex0*sizeof(FrictionPatch);
+				PxU8* PX_RESTRICT writeback1 = descs[1].frictionPtr + frictionPatchWritebackAddrIndex1*sizeof(FrictionPatch);
+				PxU8* PX_RESTRICT writeback2 = descs[2].frictionPtr + frictionPatchWritebackAddrIndex2*sizeof(FrictionPatch);
+				PxU8* PX_RESTRICT writeback3 = descs[3].frictionPtr + frictionPatchWritebackAddrIndex3*sizeof(FrictionPatch);
+
+				PxU32 index0 = 0, index1 = 0, index2 = 0, index3 = 0;
+
+				fd->broken = bFalse;
+				fd->frictionBrokenWritebackByte[0] = writeback0;
+				fd->frictionBrokenWritebackByte[1] = writeback1;
+				fd->frictionBrokenWritebackByte[2] = writeback2;
+				fd->frictionBrokenWritebackByte[3] = writeback3;
+
+
+				fd->normalX[0] = t0X;
+				fd->normalY[0] = t0Y;
+				fd->normalZ[0] = t0Z;
+
+				fd->normalX[1] = t1X;
+				fd->normalY[1] = t1Y;
+				fd->normalZ[1] = t1Z;
+
+				Vec4V* PX_RESTRICT appliedForces = reinterpret_cast<Vec4V*>(ptr);
+				ptr += sizeof(Vec4V)*header->numFrictionConstr;
+
+				PxMemZero(appliedForces, sizeof(Vec4V) * header->numFrictionConstr);
+
+				for(PxU32 j = 0; j < maxAnchorCount; j++)
+				{
+					Ps::prefetchLine(ptr, 384);
+					Ps::prefetchLine(ptr, 512);
+					Ps::prefetchLine(ptr, 640);
+					SolverContactFrictionBase4* PX_RESTRICT f0 = reinterpret_cast<SolverContactFrictionBase4*>(ptr);
+					ptr += frictionSize;
+					SolverContactFrictionBase4* PX_RESTRICT f1 = reinterpret_cast<SolverContactFrictionBase4*>(ptr);
+					ptr += frictionSize;
+
+					index0 = j < clampedAnchorCount0 ? j : index0;
+					index1 = j < clampedAnchorCount1 ? j : index1;
+					index2 = j < clampedAnchorCount2 ? j : index2;
+					index3 = j < clampedAnchorCount3 ? j : index3;
+
+					if(j >= clampedAnchorCount0)
+						maxImpulseScale = V4SetX(maxImpulseScale, fZero);
+					if(j >= clampedAnchorCount1)
+						maxImpulseScale = V4SetY(maxImpulseScale, fZero);
+					if(j >= clampedAnchorCount2)
+						maxImpulseScale = V4SetZ(maxImpulseScale, fZero);
+					if(j >= clampedAnchorCount3)
+						maxImpulseScale = V4SetW(maxImpulseScale, fZero);
+
+					t0X = V4Mul(maxImpulseScale, t0X);
+					t0Y = V4Mul(maxImpulseScale, t0Y);
+					t0Z = V4Mul(maxImpulseScale, t0Z);
+
+					t1X = V4Mul(maxImpulseScale, t1X);
+					t1Y = V4Mul(maxImpulseScale, t1Y);
+					t1Z = V4Mul(maxImpulseScale, t1Z);
+
+
+					Vec3V body0Anchor0 = V3LoadU(frictionPatch0.body0Anchors[index0]);
+					Vec3V body0Anchor1 = V3LoadU(frictionPatch1.body0Anchors[index1]);
+					Vec3V body0Anchor2 = V3LoadU(frictionPatch2.body0Anchors[index2]);
+					Vec3V body0Anchor3 = V3LoadU(frictionPatch3.body0Anchors[index3]);
+
+					Vec4V ra0 = Vec4V_From_Vec3V(QuatRotate(bodyFrame00q, body0Anchor0));
+					Vec4V ra1 = Vec4V_From_Vec3V(QuatRotate(bodyFrame01q, body0Anchor1));
+					Vec4V ra2 = Vec4V_From_Vec3V(QuatRotate(bodyFrame02q, body0Anchor2));
+					Vec4V ra3 = Vec4V_From_Vec3V(QuatRotate(bodyFrame03q, body0Anchor3));
+
+					Vec4V raX, raY, raZ;
+					PX_TRANSPOSE_44_34(ra0, ra1, ra2, ra3, raX, raY, raZ);
+
+					const Vec4V raWorldX = V4Add(raX, bodyFrame0pX);
+					const Vec4V raWorldY = V4Add(raY, bodyFrame0pY);
+					const Vec4V raWorldZ = V4Add(raZ, bodyFrame0pZ);
+
+					Vec3V body1Anchor0 = V3LoadU(frictionPatch0.body1Anchors[index0]);	
+					Vec3V body1Anchor1 = V3LoadU(frictionPatch1.body1Anchors[index1]);
+					Vec3V body1Anchor2 = V3LoadU(frictionPatch2.body1Anchors[index2]);
+					Vec3V body1Anchor3 = V3LoadU(frictionPatch3.body1Anchors[index3]);
+				
+					Vec4V rb0 = Vec4V_From_Vec3V(QuatRotate(bodyFrame10q, body1Anchor0));
+					Vec4V rb1 = Vec4V_From_Vec3V(QuatRotate(bodyFrame11q, body1Anchor1));
+					Vec4V rb2 = Vec4V_From_Vec3V(QuatRotate(bodyFrame12q, body1Anchor2));
+					Vec4V rb3 = Vec4V_From_Vec3V(QuatRotate(bodyFrame13q, body1Anchor3));
+
+					Vec4V rbX, rbY, rbZ;
+					PX_TRANSPOSE_44_34(rb0, rb1, rb2, rb3, rbX, rbY, rbZ);
+
+					const Vec4V rbWorldX = V4Add(rbX, bodyFrame1pX);
+					const Vec4V rbWorldY = V4Add(rbY, bodyFrame1pY);
+					const Vec4V rbWorldZ = V4Add(rbZ, bodyFrame1pZ);
+
+					const Vec4V errorX = V4Sub(raWorldX, rbWorldX);
+					const Vec4V errorY = V4Sub(raWorldY, rbWorldY);
+					const Vec4V errorZ = V4Sub(raWorldZ, rbWorldZ);
+
+					//KS - todo - get this working with per-point friction
+						//PxU32 index0 = /*perPointFriction ? c.contactID[i][j] : */c.contactPatches[c.correlationListHeads[i]].start;
+
+					Vec4V targetVel0 = V4LoadA(&contactBase0->targetVel.x);
+					Vec4V targetVel1 = V4LoadA(&contactBase1->targetVel.x);
+					Vec4V targetVel2 = V4LoadA(&contactBase2->targetVel.x);
+					Vec4V targetVel3 = V4LoadA(&contactBase3->targetVel.x);
+
+					Vec4V targetVelX, targetVelY, targetVelZ;
+					PX_TRANSPOSE_44_34(targetVel0, targetVel1, targetVel2, targetVel3, targetVelX, targetVelY, targetVelZ);
+
+					
+					{
+						const Vec4V raXnX = V4NegMulSub(raZ, t0Y, V4Mul(raY, t0Z));
+						const Vec4V raXnY = V4NegMulSub(raX, t0Z, V4Mul(raZ, t0X));
+						const Vec4V raXnZ = V4NegMulSub(raY, t0X, V4Mul(raX, t0Y));
+
+						Vec4V delAngVel0X = V4Mul(invInertia0X0, raXnX);
+						Vec4V delAngVel0Y = V4Mul(invInertia0X1, raXnX);
+						Vec4V delAngVel0Z = V4Mul(invInertia0X2, raXnX);
+
+						delAngVel0X = V4MulAdd(invInertia0Y0, raXnY, delAngVel0X);
+						delAngVel0Y = V4MulAdd(invInertia0Y1, raXnY, delAngVel0Y);
+						delAngVel0Z = V4MulAdd(invInertia0Y2, raXnY, delAngVel0Z);
+
+						delAngVel0X = V4MulAdd(invInertia0Z0, raXnZ, delAngVel0X);
+						delAngVel0Y = V4MulAdd(invInertia0Z1, raXnZ, delAngVel0Y);
+						delAngVel0Z = V4MulAdd(invInertia0Z2, raXnZ, delAngVel0Z);
+
+						const Vec4V dotDelAngVel0 = V4MulAdd(delAngVel0Z, delAngVel0Z, V4MulAdd(delAngVel0Y, delAngVel0Y, V4Mul(delAngVel0X, delAngVel0X)));
+					
+						Vec4V resp = V4MulAdd(dotDelAngVel0, angDom0, invMass0D0);
+
+						const Vec4V tVel0 = V4MulAdd(t0Z, linVelT20, V4MulAdd(t0Y, linVelT10, V4Mul(t0X, linVelT00)));
+						Vec4V vrel = V4MulAdd(raXnZ, angVelT20, V4MulAdd(raXnY, angVelT10, V4MulAdd(raXnX, angVelT00, tVel0)));
+
+						if(isDynamic)
+						{
+							SolverContactFrictionDynamic4* PX_RESTRICT dynamicF0 = static_cast<SolverContactFrictionDynamic4*>(f0);
+
+							const Vec4V rbXnX = V4NegMulSub(rbZ, t0Y, V4Mul(rbY, t0Z));
+							const Vec4V rbXnY = V4NegMulSub(rbX, t0Z, V4Mul(rbZ, t0X));
+							const Vec4V rbXnZ = V4NegMulSub(rbY, t0X, V4Mul(rbX, t0Y));
+
+							Vec4V delAngVel1X = V4Mul(invInertia1X0, rbXnX);
+							Vec4V delAngVel1Y = V4Mul(invInertia1X1, rbXnX);
+							Vec4V delAngVel1Z = V4Mul(invInertia1X2, rbXnX);
+
+							delAngVel1X = V4MulAdd(invInertia1Y0, rbXnY, delAngVel1X);
+							delAngVel1Y = V4MulAdd(invInertia1Y1, rbXnY, delAngVel1Y);
+							delAngVel1Z = V4MulAdd(invInertia1Y2, rbXnY, delAngVel1Z);
+
+							delAngVel1X = V4MulAdd(invInertia1Z0, rbXnZ, delAngVel1X);
+							delAngVel1Y = V4MulAdd(invInertia1Z1, rbXnZ, delAngVel1Y);
+							delAngVel1Z = V4MulAdd(invInertia1Z2, rbXnZ, delAngVel1Z);					
+						
+							const Vec4V dotDelAngVel1 = V4MulAdd(delAngVel1Z, delAngVel1Z, V4MulAdd(delAngVel1Y, delAngVel1Y, V4Mul(delAngVel1X, delAngVel1X)));
+							
+							const Vec4V resp1 = V4MulAdd(dotDelAngVel1, angDom1, invMass1D1);
+
+							resp = V4Add(resp, resp1);
+							
+							dynamicF0->rbXnX = delAngVel1X;
+							dynamicF0->rbXnY = delAngVel1Y;
+							dynamicF0->rbXnZ = delAngVel1Z;
+
+							const Vec4V tVel1 = V4MulAdd(t0Z, linVelT21, V4MulAdd(t0Y, linVelT11, V4Mul(t0X, linVelT01)));
+							const Vec4V vel1 = V4MulAdd(rbXnZ, angVelT21, V4MulAdd(rbXnY, angVelT11, V4MulAdd(rbXnX, angVelT01, tVel1)));
+
+							vrel = V4Sub(vrel, vel1);
+						}
+						else if(hasKinematic)
+						{
+							const Vec4V rbXnX = V4NegMulSub(rbZ, t0Y, V4Mul(rbY, t0Z));
+							const Vec4V rbXnY = V4NegMulSub(rbX, t0Z, V4Mul(rbZ, t0X));
+							const Vec4V rbXnZ = V4NegMulSub(rbY, t0X, V4Mul(rbX, t0Y));
+
+							const Vec4V dotRbXnAngVel1 = V4MulAdd(rbXnZ, angVelT21, V4MulAdd(rbXnY, angVelT11, V4Mul(rbXnX, angVelT01)));
+
+							vrel = V4Sub(vrel, dotRbXnAngVel1);
+						}
+
+
+						const Vec4V velMultiplier = V4Mul(maxImpulseScale, V4Sel(V4IsGrtr(resp, zero), V4Div(p84, resp), zero));
+
+						Vec4V bias = V4Scale(V4MulAdd(t0Z, errorZ, V4MulAdd(t0Y, errorY, V4Mul(t0X, errorX))), invDt);
+
+						Vec4V targetVel = V4MulAdd(t0Z, targetVelZ,V4MulAdd(t0Y, targetVelY, V4Mul(t0X, targetVelX)));
+						targetVel = V4Sub(targetVel, vrel);
+						f0->targetVelocity = V4Neg(V4Mul(targetVel, velMultiplier));
+						bias = V4Sub(bias, targetVel);
+
+						f0->raXnX = delAngVel0X;
+						f0->raXnY = delAngVel0Y;
+						f0->raXnZ = delAngVel0Z;
+						f0->scaledBias = V4Mul(bias, velMultiplier);
+						f0->velMultiplier = velMultiplier;								
+					}
+
+					{
+						const Vec4V raXnX = V4NegMulSub(raZ, t1Y, V4Mul(raY, t1Z));
+						const Vec4V raXnY = V4NegMulSub(raX, t1Z, V4Mul(raZ, t1X));
+						const Vec4V raXnZ = V4NegMulSub(raY, t1X, V4Mul(raX, t1Y));
+
+						Vec4V delAngVel0X = V4Mul(invInertia0X0, raXnX);
+						Vec4V delAngVel0Y = V4Mul(invInertia0X1, raXnX);
+						Vec4V delAngVel0Z = V4Mul(invInertia0X2, raXnX);
+
+						delAngVel0X = V4MulAdd(invInertia0Y0, raXnY, delAngVel0X);
+						delAngVel0Y = V4MulAdd(invInertia0Y1, raXnY, delAngVel0Y);
+						delAngVel0Z = V4MulAdd(invInertia0Y2, raXnY, delAngVel0Z);
+
+						delAngVel0X = V4MulAdd(invInertia0Z0, raXnZ, delAngVel0X);
+						delAngVel0Y = V4MulAdd(invInertia0Z1, raXnZ, delAngVel0Y);
+						delAngVel0Z = V4MulAdd(invInertia0Z2, raXnZ, delAngVel0Z);
+
+						const Vec4V dotDelAngVel0 = V4MulAdd(delAngVel0Z, delAngVel0Z, V4MulAdd(delAngVel0Y, delAngVel0Y, V4Mul(delAngVel0X, delAngVel0X)));
+					
+						Vec4V resp = V4MulAdd(dotDelAngVel0, angDom0, invMass0D0);
+
+						const Vec4V tVel0 = V4MulAdd(t1Z, linVelT20, V4MulAdd(t1Y, linVelT10, V4Mul(t1X, linVelT00)));
+						Vec4V vrel = V4MulAdd(raXnZ, angVelT20, V4MulAdd(raXnY, angVelT10, V4MulAdd(raXnX, angVelT00, tVel0)));
+
+						if(isDynamic)
+						{
+							SolverContactFrictionDynamic4* PX_RESTRICT dynamicF1 = static_cast<SolverContactFrictionDynamic4*>(f1);
+
+							const Vec4V rbXnX = V4NegMulSub(rbZ, t1Y, V4Mul(rbY, t1Z));
+							const Vec4V rbXnY = V4NegMulSub(rbX, t1Z, V4Mul(rbZ, t1X));
+							const Vec4V rbXnZ = V4NegMulSub(rbY, t1X, V4Mul(rbX, t1Y));
+
+							Vec4V delAngVel1X = V4Mul(invInertia1X0, rbXnX);
+							Vec4V delAngVel1Y = V4Mul(invInertia1X1, rbXnX);
+							Vec4V delAngVel1Z = V4Mul(invInertia1X2, rbXnX);
+
+							delAngVel1X = V4MulAdd(invInertia1Y0, rbXnY, delAngVel1X);
+							delAngVel1Y = V4MulAdd(invInertia1Y1, rbXnY, delAngVel1Y);
+							delAngVel1Z = V4MulAdd(invInertia1Y2, rbXnY, delAngVel1Z);
+
+							delAngVel1X = V4MulAdd(invInertia1Z0, rbXnZ, delAngVel1X);
+							delAngVel1Y = V4MulAdd(invInertia1Z1, rbXnZ, delAngVel1Y);
+							delAngVel1Z = V4MulAdd(invInertia1Z2, rbXnZ, delAngVel1Z);					
+						
+							const Vec4V dotDelAngVel1 = V4MulAdd(delAngVel1Z, delAngVel1Z, V4MulAdd(delAngVel1Y, delAngVel1Y, V4Mul(delAngVel1X, delAngVel1X)));
+							
+							const Vec4V resp1 = V4MulAdd(dotDelAngVel1, angDom1, invMass1D1);
+
+							resp = V4Add(resp, resp1);
+							
+							dynamicF1->rbXnX = delAngVel1X;
+							dynamicF1->rbXnY = delAngVel1Y;
+							dynamicF1->rbXnZ = delAngVel1Z;
+
+							const Vec4V tVel1 = V4MulAdd(t1Z, linVelT21, V4MulAdd(t1Y, linVelT11, V4Mul(t1X, linVelT01)));
+							const Vec4V vel1 = V4MulAdd(rbXnZ, angVelT21, V4MulAdd(rbXnY, angVelT11, V4MulAdd(rbXnX, angVelT01, tVel1)));
+
+							vrel = V4Sub(vrel, vel1);
+
+						}
+						else if(hasKinematic)
+						{
+							const Vec4V rbXnX = V4NegMulSub(rbZ, t1Y, V4Mul(rbY, t1Z));
+							const Vec4V rbXnY = V4NegMulSub(rbX, t1Z, V4Mul(rbZ, t1X));
+							const Vec4V rbXnZ = V4NegMulSub(rbY, t1X, V4Mul(rbX, t1Y));
+
+							const Vec4V dotRbXnAngVel1 = V4MulAdd(rbXnZ, angVelT21, V4MulAdd(rbXnY, angVelT11, V4Mul(rbXnX, angVelT01)));
+
+							vrel = V4Sub(vrel, dotRbXnAngVel1);
+						}
+
+
+						const Vec4V velMultiplier = V4Mul(maxImpulseScale, V4Sel(V4IsGrtr(resp, zero), V4Div(p84, resp), zero));
+
+						Vec4V bias = V4Scale(V4MulAdd(t1Z, errorZ, V4MulAdd(t1Y, errorY, V4Mul(t1X, errorX))), invDt);
+
+						Vec4V targetVel = V4MulAdd(t1Z, targetVelZ,V4MulAdd(t1Y, targetVelY, V4Mul(t1X, targetVelX)));
+						targetVel = V4Sub(targetVel, vrel);
+						f1->targetVelocity = V4Neg(V4Mul(targetVel, velMultiplier));
+						bias = V4Sub(bias, targetVel);
+						f1->raXnX = delAngVel0X;
+						f1->raXnY = delAngVel0Y;
+						f1->raXnZ = delAngVel0Z;
+						f1->scaledBias = V4Mul(bias, velMultiplier);
+						f1->velMultiplier = velMultiplier;
+					}				
+				}
+
+				frictionPatchWritebackAddrIndex0++;
+				frictionPatchWritebackAddrIndex1++;
+				frictionPatchWritebackAddrIndex2++;
+				frictionPatchWritebackAddrIndex3++;
+			}
+		}
+	}
+}
+
+
+
+//Counts the friction patches in [frictionPatchStartIndex, frictionPatchEndIndex) whose correlation list is
+//not empty, and reports both that count and the 16-byte-rounded number of bytes needed to store that many
+//FrictionPatch objects.
+PX_FORCE_INLINE void computeBlockStreamFrictionByteSizes(const CorrelationBuffer& c,
+														 PxU32& _frictionPatchByteSize, PxU32& _numFrictionPatches,
+														 PxU32 frictionPatchStartIndex, PxU32 frictionPatchEndIndex)
+{
+	//Accumulate in a local and write the reference out-params once at the end.
+	PxU32 usedPatchCount = 0;
+
+	PxU32 patchIdx = frictionPatchStartIndex;
+	while(patchIdx < frictionPatchEndIndex)
+	{
+		//A patch only counts if its correlation list has a head.
+		const bool inUse = (c.correlationListHeads[patchIdx] != CorrelationBuffer::LIST_END);
+		usedPatchCount += inUse ? 1u : 0u;
+		++patchIdx;
+	}
+
+	PxU32 rawByteSize = usedPatchCount*sizeof(FrictionPatch);
+
+	_numFrictionPatches = usedPatchCount;
+
+	//Round up to the next multiple of 16 bytes.
+	_frictionPatchByteSize = ((rawByteSize + 0x0f) & ~0x0f);
+	PX_ASSERT(0 == (_frictionPatchByteSize & 0x0f));
+}
+
+//Reserves, from constraintAllocator, a single buffer large enough for the in-use friction patches in
+//[frictionPatchStartIndex, frictionPatchEndIndex).
+//_frictionPatches receives the reserved buffer (NULL if nothing was needed or the reservation failed) and
+//numFrictionPatches receives the number of in-use patches.
+//Returns true when no memory was required or the reservation succeeded, false otherwise.
+static bool reserveFrictionBlockStreams(const CorrelationBuffer& c, PxConstraintAllocator& constraintAllocator, PxU32 frictionPatchStartIndex, PxU32 frictionPatchEndIndex,
+						FrictionPatch*& _frictionPatches,
+						PxU32& numFrictionPatches)
+{
+	//Work out how many bytes the friction patch buffer needs.
+	PxU32 requiredBytes = 0;
+	computeBlockStreamFrictionByteSizes(c, requiredBytes, numFrictionPatches, frictionPatchStartIndex, frictionPatchEndIndex);
+
+	//Nothing to store: no reservation is attempted and this counts as success.
+	if(requiredBytes == 0)
+	{
+		_frictionPatches = NULL;
+		return true;
+	}
+
+	FrictionPatch* reserved = reinterpret_cast<FrictionPatch*>(constraintAllocator.reserveFrictionData(requiredBytes));
+
+	if(NULL == reserved)
+	{
+		PX_WARN_ONCE(
+			"Reached limit set by PxSceneDesc::maxNbContactDataBlocks - ran out of buffer space for constraint prep. "
+			"Either accept dropped contacts or increase buffer size allocated for narrow phase by increasing PxSceneDesc::maxNbContactDataBlocks.");
+	}
+	else if((reinterpret_cast<FrictionPatch*>(-1)) == reserved)
+	{
+		//The allocator handed back the -1 sentinel: report it and treat it as a plain failure.
+		PX_WARN_ONCE(
+			"Attempting to allocate more than 16K of friction data for a single contact pair in constraint prep. "
+			"Either accept dropped contacts or simplify collision geometry.");
+		reserved = NULL;
+	}
+
+	_frictionPatches = reserved;
+
+	//At this point bytes were required, so success means we hold a usable buffer.
+	return reserved != NULL;
+}
+
+//The persistent friction patch correlation/allocation will already have happened as this is per-pair.
+//This function only computes the size of the combined solver data for a batch of four descs:
+//  _solverConstraintByteSize receives the 16-byte-rounded total byte size of the block,
+//  _axisConstraintCount[a]   receives the number of contact + friction rows belonging to descs[a].
+void computeBlockStreamByteSizes4(PxSolverContactDesc* descs,
+								PxU32& _solverConstraintByteSize, PxU32* _axisConstraintCount,
+								const CorrelationBuffer& c)
+{
+	PX_ASSERT(0 == _solverConstraintByteSize);
+
+	//For each patch slot, the largest contact count / friction row count seen across the four descs.
+	PxU32 slotContacts[CorrelationBuffer::MAX_FRICTION_PATCHES];
+	PxU32 slotFriction[CorrelationBuffer::MAX_FRICTION_PATCHES];
+	PxMemZero(slotContacts, sizeof(slotContacts));
+	PxMemZero(slotFriction, sizeof(slotFriction));
+
+	PxU32 patchSlots = 0;
+	bool anyMaxImpulse = false;
+	//The dynamic layouts are used as soon as one desc reports eDYNAMIC_BODY for bodyState1.
+	bool anyDynamicBody1 = false;
+
+	for(PxU32 d = 0; d < 4; ++d)
+	{
+		if(descs[d].hasMaxImpulse)
+			anyMaxImpulse = true;
+
+		if(descs[d].bodyState1 == PxSolverContactDesc::eDYNAMIC_BODY)
+			anyDynamicBody1 = true;
+
+		PxU32 rowsForDesc = 0;
+		for(PxU32 slot = 0; slot < descs[d].numFrictionPatches; ++slot)
+		{
+			const PxU32 patchIndex = slot + descs[d].startFrictionPatchIndex;
+
+			//Patches without contacts contribute nothing, friction included.
+			if(c.frictionPatchContactCounts[patchIndex] == 0)
+				continue;
+
+			slotContacts[slot] = PxMax(c.frictionPatchContactCounts[patchIndex], slotContacts[slot]);
+			rowsForDesc += c.frictionPatchContactCounts[patchIndex];
+
+			const FrictionPatch& patch = c.frictionPatches[patchIndex];
+			const bool frictionEnabled = (patch.materialFlags & PxMaterialFlag::eDISABLE_FRICTION) == 0;
+			if(frictionEnabled && patch.anchorCount != 0)
+			{
+				//Two friction rows per anchor.
+				const PxU32 frictionRows = PxU32(patch.anchorCount) * 2;
+				slotFriction[slot] = PxMax(frictionRows, slotFriction[slot]);
+				rowsForDesc += frictionRows;
+			}
+		}
+
+		patchSlots = PxMax(descs[d].numFrictionPatches, patchSlots);
+		_axisConstraintCount[d] = rowsForDesc;
+	}
+
+	//Sum the per-slot maxima and count the slots that carry any friction at all.
+	PxU32 contactTotal = 0;
+	PxU32 frictionTotal = 0;
+	PxU32 frictionSlots = 0;
+	for(PxU32 slot = 0; slot < patchSlots; ++slot)
+	{
+		contactTotal += slotContacts[slot];
+		frictionTotal += slotFriction[slot];
+		if(slotFriction[slot] > 0)
+			++frictionSlots;
+	}
+
+	//One contact header per patch slot, one shared friction record per slot that has friction.
+	const PxU32 headerBytes = sizeof(SolverContactHeader4) * patchSlots + sizeof(SolverFrictionSharedData4) * frictionSlots;
+
+	PxU32 payloadBytes;
+	if(anyDynamicBody1)
+		payloadBytes = (sizeof(SolverContactBatchPointDynamic4) * contactTotal) + (sizeof(SolverContactFrictionDynamic4) * frictionTotal);
+	else
+		payloadBytes = (sizeof(SolverContactBatchPointBase4) * contactTotal) + (sizeof(SolverContactFrictionBase4) * frictionTotal);
+
+	//Applied-force storage: one Vec4V per contact row and per friction row.
+	payloadBytes += sizeof(Vec4V)*(contactTotal+frictionTotal);
+
+	//Optional max-impulse storage: one Vec4V per contact row.
+	if(anyMaxImpulse)
+		payloadBytes += sizeof(Ps::aos::Vec4V) * contactTotal;
+
+	//Round the grand total up to a multiple of 16 bytes.
+	_solverConstraintByteSize =  ((payloadBytes + headerBytes + 0x0f) & ~0x0f);
+	PX_ASSERT(0 == (_solverConstraintByteSize & 0x0f));
+}
+
+//Sizes and reserves the solver constraint block for a batch of four contact descs.
+//On entry solverConstraint must be NULL and solverConstraintByteSize must be 0.
+//solverConstraintByteSize and axisConstraintCount are filled in by computeBlockStreamByteSizes4.
+//solverConstraint is only written when a non-empty block was reserved successfully.
+//Returns eSUCCESS (also when no bytes are needed), eUNBATCHABLE when the block plus 16 bytes would exceed
+//16384 bytes, or eOUT_OF_MEMORY when the allocator could not provide the block.
+static SolverConstraintPrepState::Enum reserveBlockStreams4(PxSolverContactDesc* descs, Dy::CorrelationBuffer& c,
+						PxU8*& solverConstraint, PxU32* axisConstraintCount,
+						PxU32& solverConstraintByteSize, 
+						PxConstraintAllocator& constraintAllocator)
+{
+	PX_ASSERT(NULL == solverConstraint);
+	PX_ASSERT(0 == solverConstraintByteSize);
+
+	//Work out how much memory the whole batch needs.
+	computeBlockStreamByteSizes4(descs, solverConstraintByteSize, axisConstraintCount, c);
+
+	const PxU32 requiredBytes = solverConstraintByteSize;
+
+	//An empty batch needs no reservation; the output pointer is left as it was.
+	if(requiredBytes == 0)
+		return SolverConstraintPrepState::eSUCCESS;
+
+	//The reservation is made 16 bytes larger than the computed size and must fit in 16384 bytes.
+	const PxU32 paddedBytes = requiredBytes + 16u;
+	if(paddedBytes > 16384)
+		return SolverConstraintPrepState::eUNBATCHABLE;
+
+	PxU8* reserved = constraintAllocator.reserveConstraintData(paddedBytes);
+
+	if(NULL == reserved)
+	{
+		PX_WARN_ONCE(
+			"Reached limit set by PxSceneDesc::maxNbContactDataBlocks - ran out of buffer space for constraint prep. "
+			"Either accept dropped contacts or increase buffer size allocated for narrow phase by increasing PxSceneDesc::maxNbContactDataBlocks.");
+		return SolverConstraintPrepState::eOUT_OF_MEMORY;
+	}
+
+	if((reinterpret_cast<PxU8*>(-1)) == reserved)
+	{
+		//The allocator handed back the -1 sentinel: report it and fail like any other reservation failure.
+		PX_WARN_ONCE(
+			"Attempting to allocate more than 16K of contact data for a single contact pair in constraint prep. "
+			"Either accept dropped contacts or simplify collision geometry.");
+		return SolverConstraintPrepState::eOUT_OF_MEMORY;
+	}
+
+	//Reservation succeeded: hand the block back to the caller.
+	solverConstraint = reserved;
+	PX_ASSERT(0==(uintptr_t(solverConstraint) & 0x0f));
+
+	return SolverConstraintPrepState::eSUCCESS;
+}
+
+SolverConstraintPrepState::Enum createFinalizeSolverContacts4(
+	Dy::CorrelationBuffer& c,
+	PxSolverContactDesc* blockDescs,
+	const PxReal invDtF32,
+	PxReal bounceThresholdF32,
+	PxReal	frictionOffsetThreshold,
+	PxReal correlationDistance,
+	PxConstraintAllocator& constraintAllocator)
+{
+
+	PX_ALIGN(16, PxReal invMassScale0[4]);
+	PX_ALIGN(16, PxReal invMassScale1[4]);
+	PX_ALIGN(16, PxReal invInertiaScale0[4]);
+	PX_ALIGN(16, PxReal invInertiaScale1[4]);
+
+	c.frictionPatchCount = 0;
+	c.contactPatchCount = 0;
+
+	for (PxU32 a = 0; a < 4; ++a)
+	{
+		PxSolverContactDesc& blockDesc = blockDescs[a];
+
+		invMassScale0[a] = blockDesc.mInvMassScales.linear0;
+		invMassScale1[a] = blockDesc.mInvMassScales.linear1;
+		invInertiaScale0[a] = blockDesc.mInvMassScales.angular0;
+		invInertiaScale1[a] = blockDesc.mInvMassScales.angular1;
+
+		blockDesc.startFrictionPatchIndex = c.frictionPatchCount;
+		if (!(blockDesc.disableStrongFriction))
+		{
+			bool valid = getFrictionPatches(c, blockDesc.frictionPtr, blockDesc.frictionCount,
+				blockDesc.bodyFrame0, blockDesc.bodyFrame1, correlationDistance);
+			if (!valid)
+				return SolverConstraintPrepState::eUNBATCHABLE;
+		}
+		//Create the contact patches
+		blockDesc.startContactPatchIndex = c.contactPatchCount;
+		if (!createContactPatches(c, blockDesc.contacts, blockDesc.numContacts, PXC_SAME_NORMAL))
+			return SolverConstraintPrepState::eUNBATCHABLE;
+		blockDesc.numContactPatches = PxU16(c.contactPatchCount - blockDesc.startContactPatchIndex);
+
+		bool overflow = correlatePatches(c, blockDesc.contacts, blockDesc.bodyFrame0, blockDesc.bodyFrame1, PXC_SAME_NORMAL,
+			blockDesc.startContactPatchIndex, blockDesc.startFrictionPatchIndex);
+
+		if (overflow)
+			return SolverConstraintPrepState::eUNBATCHABLE;
+
+		growPatches(c, blockDesc.contacts, blockDesc.bodyFrame0, blockDesc.bodyFrame1, correlationDistance, blockDesc.startFrictionPatchIndex,
+			frictionOffsetThreshold + blockDescs[a].restDistance);
+
+		//Remove the empty friction patches - do we actually need to do this?
+		for (PxU32 p = c.frictionPatchCount; p > blockDesc.startFrictionPatchIndex; --p)
+		{
+			if (c.correlationListHeads[p - 1] == 0xffff)
+			{
+				//We have an empty patch...need to bin this one...
+				for (PxU32 p2 = p; p2 < c.frictionPatchCount; ++p2)
+				{
+					c.correlationListHeads[p2 - 1] = c.correlationListHeads[p2];
+					c.frictionPatchContactCounts[p2 - 1] = c.frictionPatchContactCounts[p2];
+				}
+				c.frictionPatchCount--;
+			}
+		}
+
+		PxU32 numFricPatches = c.frictionPatchCount - blockDesc.startFrictionPatchIndex;
+		blockDesc.numFrictionPatches = numFricPatches;
+	}
+
+	FrictionPatch* frictionPatchArray[4];
+	PxU32 frictionPatchCounts[4];
+
+	for (PxU32 a = 0; a < 4; ++a)
+	{
+		PxSolverContactDesc& blockDesc = blockDescs[a];
+
+		const bool successfulReserve = reserveFrictionBlockStreams(c, constraintAllocator, blockDesc.startFrictionPatchIndex, blockDesc.numFrictionPatches + blockDesc.startFrictionPatchIndex,
+			frictionPatchArray[a],
+			frictionPatchCounts[a]);
+
+		//KS - TODO - how can we recover if we failed to allocate this memory?
+		if (!successfulReserve)
+		{
+			return SolverConstraintPrepState::eOUT_OF_MEMORY;
+		}
+	}
+	//At this point, all the friction data has been calculated, the correlation has been done. Provided this was all successful, 
+	//we are ready to create the batched constraints
+
+	PxU8* solverConstraint = NULL;
+	PxU32 solverConstraintByteSize = 0;
+
+
+
+	{
+		PxU32 axisConstraintCount[4];
+		SolverConstraintPrepState::Enum state = reserveBlockStreams4(blockDescs, c,
+			solverConstraint, axisConstraintCount,
+			solverConstraintByteSize,
+			constraintAllocator);
+
+		if (state != SolverConstraintPrepState::eSUCCESS)
+			return state;
+
+
+		for (PxU32 a = 0; a < 4; ++a)
+		{
+
+			FrictionPatch* frictionPatches = frictionPatchArray[a];
+
+			PxSolverContactDesc& blockDesc = blockDescs[a];
+			PxSolverConstraintDesc& desc = *blockDesc.desc;
+			blockDesc.frictionPtr = reinterpret_cast<PxU8*>(frictionPatches);
+			blockDesc.frictionCount = Ps::to8(frictionPatchCounts[a]);
+
+			//Initialise friction buffer.
+			if (frictionPatches)
+			{
+				// PT: TODO: revisit this... not very satisfying
+				//const PxU32 maxSize = numFrictionPatches*sizeof(FrictionPatch);
+				Ps::prefetchLine(frictionPatches);
+				Ps::prefetchLine(frictionPatches, 128);
+				Ps::prefetchLine(frictionPatches, 256);
+
+				for (PxU32 i = 0; i<blockDesc.numFrictionPatches; i++)
+				{
+					if (c.correlationListHeads[blockDesc.startFrictionPatchIndex + i] != CorrelationBuffer::LIST_END)
+					{
+						//*frictionPatches++ = c.frictionPatches[blockDesc.startFrictionPatchIndex + i];
+						PxMemCopy(frictionPatches++, &c.frictionPatches[blockDesc.startFrictionPatchIndex + i], sizeof(FrictionPatch));
+						//Ps::prefetchLine(frictionPatches, 256);
+					}
+				}
+			}
+
+
+			blockDesc.axisConstraintCount += Ps::to16(axisConstraintCount[a]);
+
+			desc.constraint = solverConstraint;
+			desc.constraintLengthOver16 = Ps::to16(solverConstraintByteSize / 16);
+			desc.writeBackLengthOver4 = PxU16(blockDesc.numContacts);
+			desc.writeBack = blockDesc.contactForces;
+		}
+
+		const Vec4V iMassScale0 = V4LoadA(invMassScale0);
+		const Vec4V iInertiaScale0 = V4LoadA(invInertiaScale0);
+		const Vec4V iMassScale1 = V4LoadA(invMassScale1);
+		const Vec4V iInertiaScale1 = V4LoadA(invInertiaScale1);
+
+		setupFinalizeSolverConstraints4(blockDescs, c, solverConstraint, invDtF32, bounceThresholdF32,
+			iMassScale0, iInertiaScale0, iMassScale1, iInertiaScale1);
+
+		PX_ASSERT((*solverConstraint == DY_SC_TYPE_BLOCK_RB_CONTACT) || (*solverConstraint == DY_SC_TYPE_BLOCK_STATIC_RB_CONTACT));
+
+		*(reinterpret_cast<PxU32*>(solverConstraint + solverConstraintByteSize)) = 0;
+	}
+	return SolverConstraintPrepState::eSUCCESS;
+}
+
+
+//Extracts the contacts of 4 contact manager outputs into the thread's contact buffer, then hands over to the batched (4-wide) constraint setup.
+//Returns 1 of 3 states: success, unbatchable or out-of-memory. If unbatchable, the caller must fall back on 4 separate constraint prep calls.
+SolverConstraintPrepState::Enum createFinalizeSolverContacts4(
+	PxsContactManagerOutput** cmOutputs,
+	ThreadContext& threadContext,
+	PxSolverContactDesc* blockDescs,
+	const PxReal invDtF32,
+	PxReal bounceThresholdF32,
+	PxReal	frictionOffsetThreshold,
+	PxReal correlationDistance,
+	PxConstraintAllocator& constraintAllocator)
+{
+	//cmOutputs and blockDescs are both indexed 0..3. Reset the constraint lengths up front: they stay 0 on any early return from this function.
+	for (PxU32 a = 0; a < 4; ++a)
+	{
+		blockDescs[a].desc->constraintLengthOver16 = 0;
+	}
+
+	PX_ASSERT(cmOutputs[0]->nbContacts && cmOutputs[1]->nbContacts && cmOutputs[2]->nbContacts && cmOutputs[3]->nbContacts);
+
+	//All 4 outputs are extracted into the thread's contact buffer; each descriptor points at its own slice of it.
+	Gu::ContactBuffer& buffer = threadContext.mContactBuffer;
+
+	buffer.count = 0;
+
+	//Correlation buffer, passed on to the finalize stage at the bottom.
+	CorrelationBuffer& c = threadContext.mCorrelationBuffer;
+
+	for (PxU32 a = 0; a < 4; ++a)
+	{
+		PxSolverContactDesc& blockDesc = blockDescs[a];
+		PxSolverConstraintDesc& desc = *blockDesc.desc;
+
+		//This descriptor's contact slice starts at the buffer's current count.
+		blockDesc.contacts = buffer.contacts + buffer.count;
+
+		Ps::prefetchLine(desc.bodyA);
+		Ps::prefetchLine(desc.bodyB);
+
+		//Give up on batching once the running total would exceed 64 contacts.
+		if ((buffer.count + cmOutputs[a]->nbContacts) > 64)
+		{
+			return SolverConstraintPrepState::eUNBATCHABLE;
+		}
+
+		bool hasMaxImpulse = false;
+		bool hasTargetVelocity = false;
+
+		//frictionPtr is read by the friction patch lookup in the finalize stage.
+		Ps::prefetchLine(blockDesc.frictionPtr);
+		Ps::prefetchLine(blockDesc.frictionPtr, 64);
+		Ps::prefetchLine(blockDesc.frictionPtr, 128);
+
+		//Prefetch the NEXT output's contact streams (hence a < 3) so they are warm by the time the next iteration extracts them.
+		if (a < 3)
+		{
+			Ps::prefetchLine(cmOutputs[a + 1]->contactPatches);
+			Ps::prefetchLine(cmOutputs[a + 1]->contactPoints);
+		}
+
+		PxReal invMassScale0, invMassScale1, invInertiaScale0, invInertiaScale1;
+
+		const PxReal defaultMaxImpulse = PxMin(blockDesc.data0->maxContactImpulse, blockDesc.data1->maxContactImpulse);
+
+		PxU32 contactCount = extractContacts(buffer, *cmOutputs[a], hasMaxImpulse, hasTargetVelocity, invMassScale0, invMassScale1,
+			invInertiaScale0, invInertiaScale1, defaultMaxImpulse);
+
+		if (contactCount == 0)
+			return SolverConstraintPrepState::eUNBATCHABLE;
+
+		blockDesc.numContacts = contactCount;
+		blockDesc.hasMaxImpulse = hasMaxImpulse;
+		blockDesc.disableStrongFriction = blockDesc.disableStrongFriction || hasTargetVelocity;
+
+		//Fold the scales reported by extractContacts into the descriptor's own.
+		blockDesc.mInvMassScales.linear0 *= invMassScale0;
+		blockDesc.mInvMassScales.linear1 *= invMassScale1;
+		blockDesc.mInvMassScales.angular0 *= invInertiaScale0;
+		blockDesc.mInvMassScales.angular1 *= invInertiaScale1;
+
+		//NOTE(review): these scales are already applied if a later output or the finalize stage reports eUNBATCHABLE;
+		//confirm that the per-pair fallback does not multiply them in a second time.
+	}
+	return createFinalizeSolverContacts4(c, blockDescs,
+		invDtF32, bounceThresholdF32,	frictionOffsetThreshold,
+		correlationDistance, constraintAllocator);
+}
+
+
+
+
+}
+
+}
+
+
diff --git a/PhysX_3.4/Source/LowLevelDynamics/src/DyContactPrep4PF.cpp b/PhysX_3.4/Source/LowLevelDynamics/src/DyContactPrep4PF.cpp
new file mode 100644
index 00000000..4442b433
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelDynamics/src/DyContactPrep4PF.cpp
@@ -0,0 +1,1017 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+     
+
+#include "foundation/PxPreprocessor.h"
+#include "PsVecMath.h"
+#include "PsMathUtils.h"
+#include "DySolverContact.h"
+#include "DySolverContactPF.h"
+#include "DySolverConstraintTypes.h"
+#include "PxcNpWorkUnit.h"
+#include "DyThreadContext.h"
+#include "DyContactPrep.h"
+#include "PxcNpContactPrepShared.h"
+//#include "PxvGeometry.h"
+#include "PxvDynamics.h"
+#include "DyCorrelationBuffer.h"
+#include "DySolverConstraintDesc.h"
+#include "DySolverBody.h"
+#include "DySolverContact4.h"
+#include "DySolverContactPF4.h"
+
+
+#include "PsVecMath.h"
+#include "PxContactModifyCallback.h"
+#include "PxsMaterialManager.h"
+#include "PxsMaterialCombiner.h"
+#include "DySolverExt.h"
+#include "DyArticulationContactPrep.h"
+#include "DyContactPrepShared.h"
+#include "PsFoundation.h"
+
+using namespace physx::Gu;
+using namespace physx::shdfnd::aos;
+
+namespace physx
+{
+namespace Dy
+{
+
+SolverConstraintPrepState::Enum createFinalizeSolverContacts4Coulomb(	// Forward declaration only; returns a SolverConstraintPrepState value.
+		PxsContactManagerOutput** outputs,	// presumably 4 entries, one per batched contact manager - TODO confirm against the definition
+		ThreadContext& threadContext,
+		PxSolverContactDesc* blockDescs,	// presumably 4 descriptors, matching outputs - TODO confirm against the definition
+		const PxReal invDtF32,
+		PxReal bounceThresholdF32,
+		PxReal frictionOffsetThreshold,
+		PxReal correlationDistance,
+		PxConstraintAllocator& constraintAllocator,
+		PxFrictionType::Enum frictionType);	// friction model selector; how it is interpreted is up to the definition
+
+static bool setupFinalizeSolverConstraintsCoulomb4(PxSolverContactDesc* PX_RESTRICT descs, PxU8* PX_RESTRICT workspace, 
+											const PxReal invDtF32, PxReal bounceThresholdF32, CorrelationBuffer& c, const PxU32 numFrictionPerPoint,
+											const PxU32 numContactPoints4, const PxU32 /*solverConstraintByteSize*/,
+											const Ps::aos::Vec4VArg invMassScale0, const Ps::aos::Vec4VArg invInertiaScale0, 
+											const Ps::aos::Vec4VArg invMassScale1, const Ps::aos::Vec4VArg invInertiaScale1)
+{
+	//KS - final step. Create the constraints in the place we pre-allocated...
+
+	const Vec4V ccdMaxSeparation = Ps::aos::V4LoadXYZW(descs[0].maxCCDSeparation, descs[1].maxCCDSeparation, descs[2].maxCCDSeparation, descs[3].maxCCDSeparation);
+
+	const Vec4V zero = V4Zero();
+
+	PxU8 flags[4] = {	PxU8(descs[0].hasForceThresholds ? SolverContactHeader::eHAS_FORCE_THRESHOLDS : 0),
+						PxU8(descs[1].hasForceThresholds ? SolverContactHeader::eHAS_FORCE_THRESHOLDS : 0),
+						PxU8(descs[2].hasForceThresholds ? SolverContactHeader::eHAS_FORCE_THRESHOLDS : 0),
+						PxU8(descs[3].hasForceThresholds ? SolverContactHeader::eHAS_FORCE_THRESHOLDS : 0) };
+
+
+	//The block is dynamic if **any** of the constraints have a non-static body B. This allows us to batch static and non-static constraints but we only get a memory/perf
+	//saving if all 4 are static. This simplifies the constraint partitioning such that it only needs to care about separating contacts and 1D constraints (which it already does)
+	const bool isDynamic = ((descs[0].bodyState1 | descs[1].bodyState1 | descs[2].bodyState1 | descs[3].bodyState1) & PxSolverContactDesc::eDYNAMIC_BODY) != 0;
+
+	const PxU32 constraintSize = isDynamic ? sizeof(SolverContact4Dynamic) : sizeof(SolverContact4Base);
+	const PxU32 frictionSize = isDynamic ? sizeof(SolverFriction4Dynamic) : sizeof(SolverFriction4Base);
+
+	PxU8* PX_RESTRICT ptr = workspace;
+
+	const Vec4V dom0 = invMassScale0;
+	const Vec4V dom1 = invMassScale1;
+	const Vec4V angDom0 = invInertiaScale0;
+	const Vec4V angDom1 = invInertiaScale1;
+
+	const Vec4V maxPenBias = V4Max(V4Merge(FLoad(descs[0].data0->penBiasClamp), FLoad(descs[1].data0->penBiasClamp), 
+		FLoad(descs[2].data0->penBiasClamp), FLoad(descs[3].data0->penBiasClamp)), 
+		V4Merge(FLoad(descs[0].data1->penBiasClamp), FLoad(descs[1].data1->penBiasClamp), 
+		FLoad(descs[2].data1->penBiasClamp), FLoad(descs[3].data1->penBiasClamp)));
+
+	const Vec4V restDistance = V4Merge(FLoad(descs[0].restDistance), FLoad(descs[1].restDistance), FLoad(descs[2].restDistance),
+		FLoad(descs[3].restDistance)); 
+
+	//load up velocities
+	Vec4V linVel00 = V4LoadA(&descs[0].data0->linearVelocity.x);
+	Vec4V linVel10 = V4LoadA(&descs[1].data0->linearVelocity.x);
+	Vec4V linVel20 = V4LoadA(&descs[2].data0->linearVelocity.x);
+	Vec4V linVel30 = V4LoadA(&descs[3].data0->linearVelocity.x);
+
+	Vec4V linVel01 = V4LoadA(&descs[0].data1->linearVelocity.x);
+	Vec4V linVel11 = V4LoadA(&descs[1].data1->linearVelocity.x);
+	Vec4V linVel21 = V4LoadA(&descs[2].data1->linearVelocity.x);
+	Vec4V linVel31 = V4LoadA(&descs[3].data1->linearVelocity.x);
+
+	Vec4V angVel00 = V4LoadA(&descs[0].data0->angularVelocity.x);
+	Vec4V angVel10 = V4LoadA(&descs[1].data0->angularVelocity.x);
+	Vec4V angVel20 = V4LoadA(&descs[2].data0->angularVelocity.x);
+	Vec4V angVel30 = V4LoadA(&descs[3].data0->angularVelocity.x);
+
+	Vec4V angVel01 = V4LoadA(&descs[0].data1->angularVelocity.x);
+	Vec4V angVel11 = V4LoadA(&descs[1].data1->angularVelocity.x);
+	Vec4V angVel21 = V4LoadA(&descs[2].data1->angularVelocity.x);
+	Vec4V angVel31 = V4LoadA(&descs[3].data1->angularVelocity.x);
+
+	Vec4V linVelT00, linVelT10, linVelT20;
+	Vec4V linVelT01, linVelT11, linVelT21;
+	Vec4V angVelT00, angVelT10, angVelT20;
+	Vec4V angVelT01, angVelT11, angVelT21;
+
+	PX_TRANSPOSE_44_34(linVel00, linVel10, linVel20, linVel30, linVelT00, linVelT10, linVelT20);
+	PX_TRANSPOSE_44_34(linVel01, linVel11, linVel21, linVel31, linVelT01, linVelT11, linVelT21);
+	PX_TRANSPOSE_44_34(angVel00, angVel10, angVel20, angVel30, angVelT00, angVelT10, angVelT20);
+	PX_TRANSPOSE_44_34(angVel01, angVel11, angVel21, angVel31, angVelT01, angVelT11, angVelT21);
+
+	const Vec4V vrelX = V4Sub(linVelT00, linVelT01);
+	const Vec4V vrelY = V4Sub(linVelT10, linVelT11);
+	const Vec4V vrelZ = V4Sub(linVelT20, linVelT21);
+
+
+
+	//Load up masses and invInertia
+
+	const Vec4V invMass0 = V4Merge(FLoad(descs[0].data0->invMass), FLoad(descs[1].data0->invMass), FLoad(descs[2].data0->invMass),
+		FLoad(descs[3].data0->invMass));
+
+	const Vec4V invMass1 = V4Merge(FLoad(descs[0].data1->invMass), FLoad(descs[1].data1->invMass), FLoad(descs[2].data1->invMass),
+		FLoad(descs[3].data1->invMass));
+
+	const Vec4V invMass0_dom0fV = V4Mul(dom0, invMass0);
+	const Vec4V invMass1_dom1fV = V4Mul(dom1, invMass1);
+
+	Vec4V invInertia00X = Vec4V_From_Vec3V(V3LoadU(descs[0].data0->sqrtInvInertia.column0));
+	Vec4V invInertia00Y = Vec4V_From_Vec3V(V3LoadU(descs[0].data0->sqrtInvInertia.column1));
+	Vec4V invInertia00Z = Vec4V_From_Vec3V(V3LoadU(descs[0].data0->sqrtInvInertia.column2));
+
+	Vec4V invInertia10X = Vec4V_From_Vec3V(V3LoadU(descs[1].data0->sqrtInvInertia.column0));
+	Vec4V invInertia10Y = Vec4V_From_Vec3V(V3LoadU(descs[1].data0->sqrtInvInertia.column1));
+	Vec4V invInertia10Z = Vec4V_From_Vec3V(V3LoadU(descs[1].data0->sqrtInvInertia.column2));
+
+	Vec4V invInertia20X = Vec4V_From_Vec3V(V3LoadU(descs[2].data0->sqrtInvInertia.column0));
+	Vec4V invInertia20Y = Vec4V_From_Vec3V(V3LoadU(descs[2].data0->sqrtInvInertia.column1));
+	Vec4V invInertia20Z = Vec4V_From_Vec3V(V3LoadU(descs[2].data0->sqrtInvInertia.column2));
+
+	Vec4V invInertia30X = Vec4V_From_Vec3V(V3LoadU(descs[3].data0->sqrtInvInertia.column0));
+	Vec4V invInertia30Y = Vec4V_From_Vec3V(V3LoadU(descs[3].data0->sqrtInvInertia.column1));
+	Vec4V invInertia30Z = Vec4V_From_Vec3V(V3LoadU(descs[3].data0->sqrtInvInertia.column2));
+
+	Vec4V invInertia01X = Vec4V_From_Vec3V(V3LoadU(descs[0].data1->sqrtInvInertia.column0));
+	Vec4V invInertia01Y = Vec4V_From_Vec3V(V3LoadU(descs[0].data1->sqrtInvInertia.column1));
+	Vec4V invInertia01Z = Vec4V_From_Vec3V(V3LoadU(descs[0].data1->sqrtInvInertia.column2));
+
+	Vec4V invInertia11X = Vec4V_From_Vec3V(V3LoadU(descs[1].data1->sqrtInvInertia.column0));
+	Vec4V invInertia11Y = Vec4V_From_Vec3V(V3LoadU(descs[1].data1->sqrtInvInertia.column1));
+	Vec4V invInertia11Z = Vec4V_From_Vec3V(V3LoadU(descs[1].data1->sqrtInvInertia.column2));
+
+	Vec4V invInertia21X = Vec4V_From_Vec3V(V3LoadU(descs[2].data1->sqrtInvInertia.column0));
+	Vec4V invInertia21Y = Vec4V_From_Vec3V(V3LoadU(descs[2].data1->sqrtInvInertia.column1));
+	Vec4V invInertia21Z = Vec4V_From_Vec3V(V3LoadU(descs[2].data1->sqrtInvInertia.column2));
+
+	Vec4V invInertia31X = Vec4V_From_Vec3V(V3LoadU(descs[3].data1->sqrtInvInertia.column0));
+	Vec4V invInertia31Y = Vec4V_From_Vec3V(V3LoadU(descs[3].data1->sqrtInvInertia.column1));
+	Vec4V invInertia31Z = Vec4V_From_Vec3V(V3LoadU(descs[3].data1->sqrtInvInertia.column2));
+
+	Vec4V invInertia0X0, invInertia0X1, invInertia0X2;
+	Vec4V invInertia0Y0, invInertia0Y1, invInertia0Y2;
+	Vec4V invInertia0Z0, invInertia0Z1, invInertia0Z2;
+
+	Vec4V invInertia1X0, invInertia1X1, invInertia1X2;
+	Vec4V invInertia1Y0, invInertia1Y1, invInertia1Y2;
+	Vec4V invInertia1Z0, invInertia1Z1, invInertia1Z2;
+
+	PX_TRANSPOSE_44_34(invInertia00X, invInertia10X, invInertia20X, invInertia30X, invInertia0X0, invInertia0Y0, invInertia0Z0);
+	PX_TRANSPOSE_44_34(invInertia00Y, invInertia10Y, invInertia20Y, invInertia30Y, invInertia0X1, invInertia0Y1, invInertia0Z1);
+	PX_TRANSPOSE_44_34(invInertia00Z, invInertia10Z, invInertia20Z, invInertia30Z, invInertia0X2, invInertia0Y2, invInertia0Z2);
+
+	PX_TRANSPOSE_44_34(invInertia01X, invInertia11X, invInertia21X, invInertia31X, invInertia1X0, invInertia1Y0, invInertia1Z0);
+	PX_TRANSPOSE_44_34(invInertia01Y, invInertia11Y, invInertia21Y, invInertia31Y, invInertia1X1, invInertia1Y1, invInertia1Z1);
+	PX_TRANSPOSE_44_34(invInertia01Z, invInertia11Z, invInertia21Z, invInertia31Z, invInertia1X2, invInertia1Y2, invInertia1Z2);
+
+	const FloatV invDt = FLoad(invDtF32);
+	const FloatV p8 = FLoad(0.8f);
+	//const Vec4V p84 = V4Splat(p8);
+	const Vec4V p1 = V4Splat(FLoad(0.1f));
+	const Vec4V bounceThreshold = V4Splat(FLoad(bounceThresholdF32));
+	const Vec4V orthoThreshold = V4Splat(FLoad(0.70710678f));
+
+	const FloatV invDtp8 = FMul(invDt, p8);
+
+	const Vec3V bodyFrame00p = V3LoadU(descs[0].bodyFrame0.p);
+	const Vec3V bodyFrame01p = V3LoadU(descs[1].bodyFrame0.p);
+	const Vec3V bodyFrame02p = V3LoadU(descs[2].bodyFrame0.p);
+	const Vec3V bodyFrame03p = V3LoadU(descs[3].bodyFrame0.p);
+
+	Vec4V bodyFrame00p4 = Vec4V_From_Vec3V(bodyFrame00p);
+	Vec4V bodyFrame01p4 = Vec4V_From_Vec3V(bodyFrame01p);
+	Vec4V bodyFrame02p4 = Vec4V_From_Vec3V(bodyFrame02p);
+	Vec4V bodyFrame03p4 = Vec4V_From_Vec3V(bodyFrame03p);
+
+	Vec4V bodyFrame0pX, bodyFrame0pY, bodyFrame0pZ;
+	PX_TRANSPOSE_44_34(bodyFrame00p4, bodyFrame01p4, bodyFrame02p4, bodyFrame03p4, bodyFrame0pX, bodyFrame0pY, bodyFrame0pZ);
+
+	
+	const Vec3V bodyFrame10p = V3LoadU(descs[0].bodyFrame1.p);
+	const Vec3V bodyFrame11p = V3LoadU(descs[1].bodyFrame1.p);
+	const Vec3V bodyFrame12p = V3LoadU(descs[2].bodyFrame1.p);
+	const Vec3V bodyFrame13p = V3LoadU(descs[3].bodyFrame1.p);
+
+	Vec4V bodyFrame10p4 = Vec4V_From_Vec3V(bodyFrame10p);
+	Vec4V bodyFrame11p4 = Vec4V_From_Vec3V(bodyFrame11p);
+	Vec4V bodyFrame12p4 = Vec4V_From_Vec3V(bodyFrame12p);
+	Vec4V bodyFrame13p4 = Vec4V_From_Vec3V(bodyFrame13p);
+
+	Vec4V bodyFrame1pX, bodyFrame1pY, bodyFrame1pZ;
+	PX_TRANSPOSE_44_34(bodyFrame10p4, bodyFrame11p4, bodyFrame12p4, bodyFrame13p4, bodyFrame1pX, bodyFrame1pY, bodyFrame1pZ);
+
+	
+	Ps::prefetchLine(c.contactID);
+	Ps::prefetchLine(c.contactID, 128);
+
+	PxU32 frictionIndex0 = 0, frictionIndex1 = 0, frictionIndex2 = 0, frictionIndex3 = 0;
+
+
+	PxU32 maxPatches = PxMax(descs[0].numFrictionPatches, PxMax(descs[1].numFrictionPatches, PxMax(descs[2].numFrictionPatches, descs[3].numFrictionPatches)));
+	PxU32 maxContacts = numContactPoints4;
+
+	//This is the address at which the first friction patch exists
+	PxU8* ptr2 = ptr + ((sizeof(SolverContactCoulombHeader4) * maxPatches) + constraintSize * maxContacts);
+
+	//PxU32 contactId = 0;
+
+	for(PxU32 i=0;i<maxPatches;i++)
+	{
+		const bool hasFinished0 = i >= descs[0].numFrictionPatches;
+		const bool hasFinished1 = i >= descs[1].numFrictionPatches;
+		const bool hasFinished2 = i >= descs[2].numFrictionPatches;
+		const bool hasFinished3 = i >= descs[3].numFrictionPatches;
+
+
+		frictionIndex0 = hasFinished0 ? frictionIndex0 : descs[0].startFrictionPatchIndex + i;
+		frictionIndex1 = hasFinished1 ? frictionIndex1 : descs[1].startFrictionPatchIndex + i;
+		frictionIndex2 = hasFinished2 ? frictionIndex2 : descs[2].startFrictionPatchIndex + i;
+		frictionIndex3 = hasFinished3 ? frictionIndex3 : descs[3].startFrictionPatchIndex + i;
+
+		PxU32 clampedContacts0 = hasFinished0 ? 0 : c.frictionPatchContactCounts[frictionIndex0];
+		PxU32 clampedContacts1 = hasFinished1 ? 0 : c.frictionPatchContactCounts[frictionIndex1];
+		PxU32 clampedContacts2 = hasFinished2 ? 0 : c.frictionPatchContactCounts[frictionIndex2];
+		PxU32 clampedContacts3 = hasFinished3 ? 0 : c.frictionPatchContactCounts[frictionIndex3];
+
+		PxU32 clampedFric0 = clampedContacts0 * numFrictionPerPoint;
+		PxU32 clampedFric1 = clampedContacts1 * numFrictionPerPoint;
+		PxU32 clampedFric2 = clampedContacts2 * numFrictionPerPoint;
+		PxU32 clampedFric3 = clampedContacts3 * numFrictionPerPoint;
+
+
+		const PxU32 numContacts = PxMax(clampedContacts0, PxMax(clampedContacts1, PxMax(clampedContacts2, clampedContacts3)));
+		const PxU32 numFrictions = PxMax(clampedFric0, PxMax(clampedFric1, PxMax(clampedFric2, clampedFric3)));
+
+		PxU32 firstPatch0 = c.correlationListHeads[frictionIndex0];
+		PxU32 firstPatch1 = c.correlationListHeads[frictionIndex1];
+		PxU32 firstPatch2 = c.correlationListHeads[frictionIndex2];
+		PxU32 firstPatch3 = c.correlationListHeads[frictionIndex3];
+
+		const Gu::ContactPoint* contactBase0 = descs[0].contacts + c.contactPatches[firstPatch0].start;
+		const Gu::ContactPoint* contactBase1 = descs[1].contacts + c.contactPatches[firstPatch1].start;
+		const Gu::ContactPoint* contactBase2 = descs[2].contacts + c.contactPatches[firstPatch2].start;
+		const Gu::ContactPoint* contactBase3 = descs[3].contacts + c.contactPatches[firstPatch3].start;
+
+		const Vec4V restitution = V4Merge(FLoad(contactBase0->restitution), FLoad(contactBase1->restitution), FLoad(contactBase2->restitution),
+			FLoad(contactBase3->restitution));
+
+		const Vec4V staticFriction = V4Merge(FLoad(contactBase0->staticFriction), FLoad(contactBase1->staticFriction), FLoad(contactBase2->staticFriction),
+			FLoad(contactBase3->staticFriction));
+
+		SolverContactCoulombHeader4* PX_RESTRICT header = reinterpret_cast<SolverContactCoulombHeader4*>(ptr);
+
+		header->frictionOffset = PxU16(ptr2 - ptr);
+
+		ptr += sizeof(SolverContactCoulombHeader4);	
+
+		SolverFrictionHeader4* PX_RESTRICT fricHeader = reinterpret_cast<SolverFrictionHeader4*>(ptr2);
+		ptr2 += sizeof(SolverFrictionHeader4) + sizeof(Vec4V) * numContacts;
+
+
+		header->numNormalConstr0 = Ps::to8(clampedContacts0);
+		header->numNormalConstr1 = Ps::to8(clampedContacts1);
+		header->numNormalConstr2 = Ps::to8(clampedContacts2);
+		header->numNormalConstr3 = Ps::to8(clampedContacts3);
+		header->numNormalConstr = Ps::to8(numContacts);
+		header->invMassADom = invMass0_dom0fV;
+		header->invMassBDom = invMass1_dom1fV;
+		header->angD0 = angDom0;
+		header->angD1 = angDom1;
+		header->restitution = restitution;
+
+		header->flags[0] = flags[0]; header->flags[1] = flags[1]; header->flags[2] = flags[2]; header->flags[3] = flags[3];
+
+		header->type = Ps::to8(isDynamic ? DY_SC_TYPE_BLOCK_RB_CONTACT : DY_SC_TYPE_BLOCK_STATIC_RB_CONTACT);
+		header->shapeInteraction[0] = descs[0].shapeInteraction; header->shapeInteraction[1] = descs[1].shapeInteraction;
+		header->shapeInteraction[2] = descs[2].shapeInteraction; header->shapeInteraction[3] = descs[3].shapeInteraction;
+
+
+		fricHeader->invMassADom = invMass0_dom0fV;
+		fricHeader->invMassBDom = invMass1_dom1fV;
+		fricHeader->angD0 = angDom0;
+		fricHeader->angD1 = angDom1;
+		fricHeader->numFrictionConstr0 = Ps::to8(clampedFric0);
+		fricHeader->numFrictionConstr1 = Ps::to8(clampedFric1);
+		fricHeader->numFrictionConstr2 = Ps::to8(clampedFric2);
+		fricHeader->numFrictionConstr3 = Ps::to8(clampedFric3);
+		fricHeader->numNormalConstr = Ps::to8(numContacts);
+		fricHeader->numNormalConstr0 = Ps::to8(clampedContacts0);
+		fricHeader->numNormalConstr1 = Ps::to8(clampedContacts1);
+		fricHeader->numNormalConstr2 = Ps::to8(clampedContacts2);
+		fricHeader->numNormalConstr3 = Ps::to8(clampedContacts3);
+		fricHeader->type = Ps::to8(isDynamic ? DY_SC_TYPE_BLOCK_FRICTION : DY_SC_TYPE_BLOCK_STATIC_FRICTION);
+		fricHeader->staticFriction = staticFriction;
+		fricHeader->frictionPerContact = PxU32(numFrictionPerPoint == 2 ? 1 : 0);
+
+		fricHeader->numFrictionConstr = Ps::to8(numFrictions);
+		
+		Vec4V normal0 = V4LoadA(&contactBase0->normal.x);
+		Vec4V normal1 = V4LoadA(&contactBase1->normal.x);
+		Vec4V normal2 = V4LoadA(&contactBase2->normal.x);
+		Vec4V normal3 = V4LoadA(&contactBase3->normal.x);
+
+		Vec4V normalX, normalY, normalZ;
+		PX_TRANSPOSE_44_34(normal0, normal1, normal2, normal3, normalX, normalY, normalZ);
+		header->normalX = normalX;
+		header->normalY = normalY;
+		header->normalZ = normalZ;
+
+		const Vec4V normalLenSq = V4MulAdd(normalZ, normalZ, V4MulAdd(normalY, normalY, V4Mul(normalX, normalX)));
+
+		const Vec4V linNorVel0 = V4MulAdd(normalZ, linVelT20, V4MulAdd(normalY, linVelT10, V4Mul(normalX, linVelT00)));
+		const Vec4V linNorVel1 = V4MulAdd(normalZ, linVelT21, V4MulAdd(normalY, linVelT11, V4Mul(normalX, linVelT01)));
+
+		const Vec4V invMassNorLenSq0 = V4Mul(invMass0_dom0fV, normalLenSq);
+		const Vec4V invMassNorLenSq1 = V4Mul(invMass1_dom1fV, normalLenSq);		
+
+
+		//Calculate friction directions
+		const BoolV cond =V4IsGrtr(orthoThreshold, V4Abs(normalX));
+
+		const Vec4V t0FallbackX = V4Sel(cond, zero, V4Neg(normalY));
+		const Vec4V t0FallbackY = V4Sel(cond, V4Neg(normalZ), normalX);
+		const Vec4V t0FallbackZ = V4Sel(cond, normalY, zero);
+
+		const Vec4V dotNormalVrel = V4MulAdd(normalZ, vrelZ, V4MulAdd(normalY, vrelY, V4Mul(normalX, vrelX)));
+		const Vec4V vrelSubNorVelX = V4NegMulSub(normalX, dotNormalVrel, vrelX);
+		const Vec4V vrelSubNorVelY = V4NegMulSub(normalY, dotNormalVrel, vrelY);
+		const Vec4V vrelSubNorVelZ = V4NegMulSub(normalZ, dotNormalVrel, vrelZ);
+
+		const Vec4V lenSqvrelSubNorVelZ = V4MulAdd(vrelSubNorVelX, vrelSubNorVelX, V4MulAdd(vrelSubNorVelY, vrelSubNorVelY, V4Mul(vrelSubNorVelZ, vrelSubNorVelZ)));
+
+		const BoolV bcon2 = V4IsGrtr(lenSqvrelSubNorVelZ, p1);
+
+		Vec4V t0X = V4Sel(bcon2, vrelSubNorVelX, t0FallbackX);
+		Vec4V t0Y = V4Sel(bcon2, vrelSubNorVelY, t0FallbackY);
+		Vec4V t0Z = V4Sel(bcon2, vrelSubNorVelZ, t0FallbackZ);
+
+		//Now normalize this...
+		const Vec4V recipLen = V4Rsqrt(V4MulAdd(t0X, t0X, V4MulAdd(t0Y, t0Y, V4Mul(t0Z, t0Z))));
+
+		t0X = V4Mul(t0X, recipLen);
+		t0Y = V4Mul(t0Y, recipLen);
+		t0Z = V4Mul(t0Z, recipLen);
+
+		const Vec4V t1X = V4NegMulSub(normalZ, t0Y, V4Mul(normalY, t0Z));
+		const Vec4V t1Y = V4NegMulSub(normalX, t0Z, V4Mul(normalZ, t0X));
+		const Vec4V t1Z = V4NegMulSub(normalY, t0X, V4Mul(normalX, t0Y));
+
+		const Vec4V tFallbackX[2] = {t0X, t1X};
+		const Vec4V tFallbackY[2] = {t0Y, t1Y};
+		const Vec4V tFallbackZ[2] = {t0Z, t1Z};
+
+
+		//For all correlation heads - need to pull this out I think
+
+		//OK, we have a counter for all our patches...
+		PxU32 finished = (PxU32(hasFinished0)) | 
+						 ((PxU32(hasFinished1)) << 1) | 
+						 ((PxU32(hasFinished2)) << 2) | 
+						 ((PxU32(hasFinished3)) << 3);
+
+		CorrelationListIterator iter0(c, firstPatch0);
+		CorrelationListIterator iter1(c, firstPatch1);
+		CorrelationListIterator iter2(c, firstPatch2);
+		CorrelationListIterator iter3(c, firstPatch3);
+
+		PxU32 contact0, contact1, contact2, contact3;
+		PxU32 patch0, patch1, patch2, patch3;
+
+		iter0.nextContact(patch0, contact0);
+		iter1.nextContact(patch1, contact1);
+		iter2.nextContact(patch2, contact2);
+		iter3.nextContact(patch3, contact3);
+
+		PxU8* p = ptr;
+
+		PxU32 contactCount = 0;
+		PxU32 newFinished = 
+			(PxU32(hasFinished0 || !iter0.hasNextContact()))		| 
+			((PxU32(hasFinished1 || !iter1.hasNextContact())) << 1) | 
+			((PxU32(hasFinished2 || !iter2.hasNextContact())) << 2) | 
+			((PxU32(hasFinished3 || !iter3.hasNextContact())) << 3);
+
+		PxU32 fricIndex = 0;
+
+		while(finished != 0xf)
+		{
+			finished = newFinished;
+			++contactCount;
+			Ps::prefetchLine(p, 384);
+			Ps::prefetchLine(p, 512);
+			Ps::prefetchLine(p, 640);	
+
+			SolverContact4Base* PX_RESTRICT solverContact = reinterpret_cast<SolverContact4Base*>(p);
+			p += constraintSize;
+
+			const Gu::ContactPoint& con0 = descs[0].contacts[c.contactPatches[patch0].start + contact0];
+			const Gu::ContactPoint& con1 = descs[1].contacts[c.contactPatches[patch1].start + contact1];
+			const Gu::ContactPoint& con2 = descs[2].contacts[c.contactPatches[patch2].start + contact2];
+			const Gu::ContactPoint& con3 = descs[3].contacts[c.contactPatches[patch3].start + contact3];
+
+			//Now we need to splice these 4 contacts into a single structure
+
+			{
+				Vec4V point0 = V4LoadA(&con0.point.x);
+				Vec4V point1 = V4LoadA(&con1.point.x);
+				Vec4V point2 = V4LoadA(&con2.point.x);
+				Vec4V point3 = V4LoadA(&con3.point.x);
+
+				Vec4V pointX, pointY, pointZ;
+				PX_TRANSPOSE_44_34(point0, point1, point2, point3, pointX, pointY, pointZ);
+
+				Vec4V targetVel0 = V4LoadA(&con0.targetVel.x);
+				Vec4V targetVel1 = V4LoadA(&con1.targetVel.x);
+				Vec4V targetVel2 = V4LoadA(&con2.targetVel.x);
+				Vec4V targetVel3 = V4LoadA(&con3.targetVel.x);
+
+				Vec4V targetVelX, targetVelY, targetVelZ;
+				PX_TRANSPOSE_44_34(targetVel0, targetVel1, targetVel2, targetVel3, targetVelX, targetVelY, targetVelZ);
+
+				const Vec4V raX = V4Sub(pointX, bodyFrame0pX);
+				const Vec4V raY = V4Sub(pointY, bodyFrame0pY);
+				const Vec4V raZ = V4Sub(pointZ, bodyFrame0pZ);
+
+				const Vec4V rbX = V4Sub(pointX, bodyFrame1pX);
+				const Vec4V rbY = V4Sub(pointY, bodyFrame1pY);
+				const Vec4V rbZ = V4Sub(pointZ, bodyFrame1pZ);
+
+				{
+					const Vec4V separation = V4Merge(FLoad(con0.separation), FLoad(con1.separation), FLoad(con2.separation),
+						FLoad(con3.separation));
+					const Vec4V maxImpulse = V4Merge(FLoad(con0.maxImpulse), FLoad(con1.maxImpulse), FLoad(con2.maxImpulse),
+						FLoad(con3.maxImpulse));
+
+					const Vec4V cTargetVel = V4MulAdd(normalX, targetVelX, V4MulAdd(normalY, targetVelY, V4Mul(normalZ, targetVelZ)));
+
+					//raXn = cross(ra, normal) which = Vec3V( a.y*b.z-a.z*b.y, a.z*b.x-a.x*b.z, a.x*b.y-a.y*b.x);
+					const Vec4V raXnX = V4NegMulSub(raZ, normalY, V4Mul(raY, normalZ));
+					const Vec4V raXnY = V4NegMulSub(raX, normalZ, V4Mul(raZ, normalX));
+					const Vec4V raXnZ = V4NegMulSub(raY, normalX, V4Mul(raX, normalY));
+
+					const Vec4V v0a0 = V4Mul(invInertia0X0, raXnX);
+					const Vec4V v0a1 = V4Mul(invInertia0X1, raXnX);
+					const Vec4V v0a2 = V4Mul(invInertia0X2, raXnX);
+
+					const Vec4V v0PlusV1a0 = V4MulAdd(invInertia0Y0, raXnY, v0a0);
+					const Vec4V v0PlusV1a1 = V4MulAdd(invInertia0Y1, raXnY, v0a1);
+					const Vec4V v0PlusV1a2 = V4MulAdd(invInertia0Y2, raXnY, v0a2);
+
+					const Vec4V delAngVel0X = V4MulAdd(invInertia0Z0, raXnZ, v0PlusV1a0);
+					const Vec4V delAngVel0Y = V4MulAdd(invInertia0Z1, raXnZ, v0PlusV1a1);
+					const Vec4V delAngVel0Z = V4MulAdd(invInertia0Z2, raXnZ, v0PlusV1a2);
+
+					const Vec4V dotDelAngVel0 = V4MulAdd(delAngVel0Z, delAngVel0Z, V4MulAdd(delAngVel0Y, delAngVel0Y, V4Mul(delAngVel0X, delAngVel0X)));
+					const Vec4V dotRaXnAngVel0 = V4MulAdd(raXnZ, angVelT20, V4MulAdd(raXnY, angVelT10, V4Mul(raXnX, angVelT00)));
+
+					Vec4V unitResponse = V4Add(invMassNorLenSq0, dotDelAngVel0);
+					Vec4V vrel = V4Add(linNorVel0, dotRaXnAngVel0);
+
+
+					//The dynamic-only parts - need to if-statement these up. A branch here shouldn't cost us too much
+					if(isDynamic)
+					{
+						SolverContact4Dynamic* PX_RESTRICT dynamicContact = static_cast<SolverContact4Dynamic*>(solverContact);
+						const Vec4V rbXnX = V4NegMulSub(rbZ, normalY, V4Mul(rbY, normalZ));
+						const Vec4V rbXnY = V4NegMulSub(rbX, normalZ, V4Mul(rbZ, normalX));
+						const Vec4V rbXnZ = V4NegMulSub(rbY, normalX, V4Mul(rbX, normalY));
+
+						const Vec4V v0b0 = V4Mul(invInertia1X0, rbXnX);
+						const Vec4V v0b1 = V4Mul(invInertia1X1, rbXnX);
+						const Vec4V v0b2 = V4Mul(invInertia1X2, rbXnX);
+
+						const Vec4V v0PlusV1b0 = V4MulAdd(invInertia1Y0, rbXnY, v0b0);
+						const Vec4V v0PlusV1b1 = V4MulAdd(invInertia1Y1, rbXnY, v0b1);
+						const Vec4V v0PlusV1b2 = V4MulAdd(invInertia1Y2, rbXnY, v0b2);
+
+						const Vec4V delAngVel1X = V4MulAdd(invInertia1Z0, rbXnZ, v0PlusV1b0);
+						const Vec4V delAngVel1Y = V4MulAdd(invInertia1Z1, rbXnZ, v0PlusV1b1);
+						const Vec4V delAngVel1Z = V4MulAdd(invInertia1Z2, rbXnZ, v0PlusV1b2);
+
+
+						//V3Dot(raXn, delAngVel0)
+						
+						const Vec4V dotDelAngVel1 = V4MulAdd(delAngVel1Z, delAngVel1Z, V4MulAdd(delAngVel1Y, delAngVel1Y, V4Mul(delAngVel1X, delAngVel1X)));
+						
+						const Vec4V dotRbXnAngVel1 = V4MulAdd(rbXnZ, angVelT21, V4MulAdd(rbXnY, angVelT11, V4Mul(rbXnX, angVelT01)));
+
+						const Vec4V resp1 = V4Add(dotDelAngVel1, invMassNorLenSq1);
+
+						unitResponse = V4Add(unitResponse, resp1);
+
+						const Vec4V vrel2 = V4Add(linNorVel1, dotRbXnAngVel1);
+						vrel = V4Sub(vrel, vrel2);
+
+						//These are for dynamic-only contacts.
+						dynamicContact->rbXnX = delAngVel1X;
+						dynamicContact->rbXnY = delAngVel1Y;
+						dynamicContact->rbXnZ = delAngVel1Z;
+
+					}
+
+					const Vec4V velMultiplier = V4Sel(V4IsGrtr(unitResponse, zero), V4Recip(unitResponse), zero);
+
+					const Vec4V penetration = V4Sub(separation, restDistance);
+
+					const Vec4V penInvDtp8 = V4Max(maxPenBias, V4Scale(penetration, invDtp8));
+
+					Vec4V scaledBias = V4Mul(velMultiplier, penInvDtp8);
+
+					const Vec4V penetrationInvDt = V4Scale(penetration, invDt);
+
+					const BoolV isGreater2 = BAnd(BAnd(V4IsGrtr(restitution, zero), V4IsGrtr(bounceThreshold, vrel)), 
+						V4IsGrtr(V4Neg(vrel), penetrationInvDt));
+
+					const BoolV ccdSeparationCondition = V4IsGrtrOrEq(ccdMaxSeparation, penetration);
+
+					scaledBias = V4Sel(BAnd(ccdSeparationCondition, isGreater2), zero, scaledBias);
+
+					const Vec4V sumVRel(vrel);
+
+					const Vec4V targetVelocity = V4Sub(V4Add(V4Sel(isGreater2, V4Mul(V4Neg(sumVRel), restitution), zero), cTargetVel), vrel);
+
+					//These values are present for static and dynamic contacts			
+					solverContact->raXnX = delAngVel0X;
+					solverContact->raXnY = delAngVel0Y;
+					solverContact->raXnZ = delAngVel0Z;
+					solverContact->velMultiplier = velMultiplier;
+					solverContact->appliedForce = zero;
+					solverContact->scaledBias = scaledBias;
+					solverContact->targetVelocity = targetVelocity;
+					solverContact->maxImpulse = maxImpulse;	
+				}
+
+				//PxU32 conId = contactId++;
+
+				/*Vec4V targetVel0 = V4LoadA(&con0.targetVel.x);
+				Vec4V targetVel1 = V4LoadA(&con1.targetVel.x);
+				Vec4V targetVel2 = V4LoadA(&con2.targetVel.x);
+				Vec4V targetVel3 = V4LoadA(&con3.targetVel.x);
+
+				Vec4V targetVelX, targetVelY, targetVelZ;
+				PX_TRANSPOSE_44_34(targetVel0, targetVel1, targetVel2, targetVel3, targetVelX, targetVelY, targetVelZ);*/
+
+				for(PxU32 a = 0; a < numFrictionPerPoint; ++a)
+				{
+					SolverFriction4Base* PX_RESTRICT friction = reinterpret_cast<SolverFriction4Base*>(ptr2);
+
+					ptr2 += frictionSize;
+
+					const Vec4V tX = tFallbackX[fricIndex];
+					const Vec4V tY = tFallbackY[fricIndex];
+					const Vec4V tZ = tFallbackZ[fricIndex];
+
+					fricIndex = 1 - fricIndex;
+
+					const Vec4V raXnX = V4NegMulSub(raZ, tY, V4Mul(raY, tZ));
+					const Vec4V raXnY = V4NegMulSub(raX, tZ, V4Mul(raZ, tX));
+					const Vec4V raXnZ = V4NegMulSub(raY, tX, V4Mul(raX, tY));
+
+					const Vec4V v0a0 = V4Mul(invInertia0X0, raXnX);
+					const Vec4V v0a1 = V4Mul(invInertia0X1, raXnX);
+					const Vec4V v0a2 = V4Mul(invInertia0X2, raXnX);
+
+					const Vec4V v0PlusV1a0 = V4MulAdd(invInertia0Y0, raXnY, v0a0);
+					const Vec4V v0PlusV1a1 = V4MulAdd(invInertia0Y1, raXnY, v0a1);
+					const Vec4V v0PlusV1a2 = V4MulAdd(invInertia0Y2, raXnY, v0a2);
+
+					const Vec4V delAngVel0X = V4MulAdd(invInertia0Z0, raXnZ, v0PlusV1a0);
+					const Vec4V delAngVel0Y = V4MulAdd(invInertia0Z1, raXnZ, v0PlusV1a1);
+					const Vec4V delAngVel0Z = V4MulAdd(invInertia0Z2, raXnZ, v0PlusV1a2);
+
+					const Vec4V dotDelAngVel0 = V4MulAdd(delAngVel0Z, delAngVel0Z, V4MulAdd(delAngVel0Y, delAngVel0Y, V4Mul(delAngVel0X, delAngVel0X)));
+
+					const Vec4V norVel0 = V4MulAdd(tX, linVelT00, V4MulAdd(tY, linVelT10, V4Mul(tZ, linVelT20)));
+					const Vec4V dotRaXnAngVel0 = V4MulAdd(raXnZ, angVelT20, V4MulAdd(raXnY, angVelT10, V4Mul(raXnX, angVelT00)));
+					Vec4V vrel = V4Add(norVel0, dotRaXnAngVel0);
+					
+					Vec4V unitResponse = V4Add(invMass0_dom0fV, dotDelAngVel0);
+					
+					if(isDynamic)
+					{
+						SolverFriction4Dynamic* PX_RESTRICT dFric = static_cast<SolverFriction4Dynamic*>(friction);
+
+						const Vec4V rbXnX = V4NegMulSub(rbZ, tY, V4Mul(rbY, tZ));
+						const Vec4V rbXnY = V4NegMulSub(rbX, tZ, V4Mul(rbZ, tX));
+						const Vec4V rbXnZ = V4NegMulSub(rbY, tX, V4Mul(rbX, tY));
+						
+						const Vec4V v0b0 = V4Mul(invInertia1X0, rbXnX);
+						const Vec4V v0b1 = V4Mul(invInertia1X1, rbXnX);
+						const Vec4V v0b2 = V4Mul(invInertia1X2, rbXnX);
+
+						const Vec4V v0PlusV1b0 = V4MulAdd(invInertia1Y0, rbXnY, v0b0);
+						const Vec4V v0PlusV1b1 = V4MulAdd(invInertia1Y1, rbXnY, v0b1);
+						const Vec4V v0PlusV1b2 = V4MulAdd(invInertia1Y2, rbXnY, v0b2);
+
+						const Vec4V delAngVel1X = V4MulAdd(invInertia1Z0, rbXnZ, v0PlusV1b0);
+						const Vec4V delAngVel1Y = V4MulAdd(invInertia1Z1, rbXnZ, v0PlusV1b1);
+						const Vec4V delAngVel1Z = V4MulAdd(invInertia1Z2, rbXnZ, v0PlusV1b2);
+
+						const Vec4V dotDelAngVel1 = V4MulAdd(delAngVel1Z, delAngVel1Z, V4MulAdd(delAngVel1Y, delAngVel1Y, V4Mul(delAngVel1X, delAngVel1X)));
+						
+						const Vec4V norVel1 = V4MulAdd(tX, linVelT01, V4MulAdd(tY, linVelT11, V4Mul(tZ, linVelT21)));
+						const Vec4V dotRbXnAngVel1 = V4MulAdd(rbXnZ, angVelT21, V4MulAdd(rbXnY, angVelT11, V4Mul(rbXnX, angVelT01)));
+						vrel = V4Sub(vrel, V4Add(norVel1, dotRbXnAngVel1));
+				
+						const Vec4V resp1 = V4Add(dotDelAngVel1, invMassNorLenSq1);
+
+						unitResponse = V4Add(unitResponse, resp1);
+
+						dFric->rbXnX = delAngVel1X;
+						dFric->rbXnY = delAngVel1Y;
+						dFric->rbXnZ = delAngVel1Z;
+					}
+
+					const Vec4V velMultiplier = V4Neg(V4Sel(V4IsGrtr(unitResponse, zero), V4Recip(unitResponse), zero));
+
+					friction->appliedForce = zero;
+					friction->raXnX = delAngVel0X;
+					friction->raXnY = delAngVel0Y;
+					friction->raXnZ = delAngVel0Z;
+					friction->velMultiplier = velMultiplier;
+					friction->targetVelocity = V4Sub(V4MulAdd(targetVelZ, tZ, V4MulAdd(targetVelY, tY, V4Mul(targetVelX, tX))), vrel);
+					friction->normalX = tX;
+					friction->normalY = tY;
+					friction->normalZ = tZ;
+				}
+			}
+			if(!(finished & 0x1))
+			{
+				iter0.nextContact(patch0, contact0);
+				newFinished |= PxU32(!iter0.hasNextContact());
+			}
+
+			if(!(finished & 0x2))
+			{
+				iter1.nextContact(patch1, contact1);
+				newFinished |= (PxU32(!iter1.hasNextContact()) << 1);
+			}
+
+			if(!(finished & 0x4))
+			{
+				iter2.nextContact(patch2, contact2);
+				newFinished |= (PxU32(!iter2.hasNextContact()) << 2);
+			}
+
+			if(!(finished & 0x8))
+			{
+				iter3.nextContact(patch3, contact3);
+				newFinished |= (PxU32(!iter3.hasNextContact()) << 3);
+			}
+		}
+		ptr = p;
+	}
+	return true;
+}
+
+
+
+//Computes the byte size of the combined (4-wide) Coulomb solver data for a batch of four contact pairs.
+//The persistent friction patch correlation/allocation has already happened by this point as it is per-pair,
+//so only sizes are accumulated here; nothing is allocated.
+//
+//descs                     the four pair descriptors; numFrictionPatches/startFrictionPatchIndex select each pair's patches in c
+//threadContext             unused
+//c                         correlation buffer holding the friction patches and their contact counts
+//numFrictionPerPoint       number of friction constraints generated per contact point
+//_solverConstraintByteSize out: total size in bytes, rounded up to a multiple of 16 (asserted to be 0 on entry)
+//_axisConstraintCount      out: per pair, the number of contact plus friction constraints (array of 4)
+//_numContactPoints4        out: sum over patch slots of the largest contact count found among the four pairs
+void computeBlockStreamByteSizesCoulomb4(PxSolverContactDesc* descs,
+								  ThreadContext& threadContext, const CorrelationBuffer& c,
+								  const PxU32 numFrictionPerPoint, 
+								PxU32& _solverConstraintByteSize, PxU32* _axisConstraintCount, PxU32& _numContactPoints4)
+{
+	PX_UNUSED(threadContext);
+	PX_ASSERT(0 == _solverConstraintByteSize);
+
+	//For every patch slot, the largest contact/friction count seen in any of the four pairs.
+	//The four pairs share one stream, so each slot has to be as wide as its widest pair.
+	PxU32 widestContacts[CorrelationBuffer::MAX_FRICTION_PATCHES];
+	PxU32 widestFrictions[CorrelationBuffer::MAX_FRICTION_PATCHES];
+	PxMemZero(widestContacts, sizeof(widestContacts));
+	PxMemZero(widestFrictions, sizeof(widestFrictions));
+
+	PxU32 patchSlots = 0;
+	for(PxU32 pair = 0; pair < 4; ++pair)
+	{
+		const PxSolverContactDesc& pairDesc = descs[pair];
+		PxU32 pairAxisCount = 0;
+
+		for(PxU32 slot = 0; slot < pairDesc.numFrictionPatches; ++slot)
+		{
+			const PxU32 patchIndex = slot + pairDesc.startFrictionPatchIndex;
+			const PxU32 patchContacts = c.frictionPatchContactCounts[patchIndex];
+
+			//Patches without contacts contribute nothing.
+			if(patchContacts == 0)
+				continue;
+
+			widestContacts[slot] = PxMax(patchContacts, widestContacts[slot]);
+			pairAxisCount += patchContacts;
+
+			//Friction rows are only counted when the material does not disable friction.
+			const bool frictionDisabled = (c.frictionPatches[patchIndex].materialFlags & PxMaterialFlag::eDISABLE_FRICTION) != 0;
+			if(frictionDisabled)
+				continue;
+
+			const PxU32 patchFrictions = patchContacts * numFrictionPerPoint;
+			widestFrictions[slot] = PxMax(patchFrictions, widestFrictions[slot]);
+			pairAxisCount += patchFrictions;
+		}
+
+		patchSlots = PxMax(pairDesc.numFrictionPatches, patchSlots);
+		_axisConstraintCount[pair] = pairAxisCount;
+	}
+
+	PxU32 contactSlots = 0, frictionSlots = 0;
+	for(PxU32 slot = 0; slot < patchSlots; ++slot)
+	{
+		contactSlots += widestContacts[slot];
+		frictionSlots += widestFrictions[slot];
+	}
+
+	_numContactPoints4 = contactSlots;
+
+	//The dynamic element layouts are used as soon as any of the four pairs has a dynamic second body.
+	const bool anyDynamicBody1 = (((descs[0].bodyState1 | descs[1].bodyState1 | descs[2].bodyState1 | descs[3].bodyState1) & PxSolverContactDesc::eDYNAMIC_BODY) != 0);
+
+	//One contact header and one friction header per patch slot.
+	const PxU32 headerBytes = (sizeof(SolverContactCoulombHeader4) + sizeof(SolverFrictionHeader4)) * patchSlots;
+
+	//Each contact also carries 1 Vec4V for the applied force buffer.
+	PxU32 constraintBytes;
+	if(anyDynamicBody1)
+		constraintBytes = ((sizeof(SolverContact4Dynamic) + sizeof(Vec4V)) * contactSlots) + (sizeof(SolverFriction4Dynamic) * frictionSlots);
+	else
+		constraintBytes = ((sizeof(SolverContact4Base) + sizeof(Vec4V)) * contactSlots) + (sizeof(SolverFriction4Base) * frictionSlots);
+
+	//Round the total up to the next multiple of 16 bytes.
+	_solverConstraintByteSize =  ((constraintBytes + headerBytes + 0x0f) & ~0x0f);
+	PX_ASSERT(0 == (_solverConstraintByteSize & 0x0f));
+}
+
+
+//Computes the size of the combined solver data for a batch of four pairs and reserves it from constraintAllocator.
+//
+//solverConstraint          out: start of the reserved block (asserted to be NULL on entry; left untouched when nothing is reserved)
+//solverConstraintByteSize  out: size computed by computeBlockStreamByteSizesCoulomb4 (asserted to be 0 on entry)
+//axisConstraintCount       out: per-pair constraint counts, filled by computeBlockStreamByteSizesCoulomb4
+//numContactPoints4         out: filled by computeBlockStreamByteSizesCoulomb4
+//
+//Returns eSUCCESS when the block was reserved or no space was needed, eUNBATCHABLE when the request
+//(including 16 extra bytes) would exceed 16384 bytes, and eOUT_OF_MEMORY when the allocator fails.
+static SolverConstraintPrepState::Enum reserveBlockStreamsCoulomb4(PxSolverContactDesc* descs, ThreadContext& threadContext, const CorrelationBuffer& c,
+						PxU8*& solverConstraint, const PxU32 numFrictionPerContactPoint,
+						PxU32& solverConstraintByteSize,
+						PxU32* axisConstraintCount, PxU32& numContactPoints4, PxConstraintAllocator& constraintAllocator)
+{
+	PX_ASSERT(NULL == solverConstraint);
+	PX_ASSERT(0 == solverConstraintByteSize);
+
+	//Work out how much solver data the four pairs need in total.
+	computeBlockStreamByteSizesCoulomb4(
+		descs, threadContext, c, numFrictionPerContactPoint, solverConstraintByteSize,
+		axisConstraintCount, numContactPoints4);
+
+	const PxU32 requestedBytes = solverConstraintByteSize;
+
+	//Nothing to reserve: report success and leave solverConstraint as it was.
+	if(0 == requestedBytes)
+		return SolverConstraintPrepState::eSUCCESS;
+
+	//The reservation is made 16 bytes larger than the computed size.
+	const PxU32 paddedBytes = requestedBytes + 16u;
+
+	//Too large for a single block: the caller has to process these pairs individually.
+	if(paddedBytes > 16384)
+		return SolverConstraintPrepState::eUNBATCHABLE;
+
+	PxU8* reserved = constraintAllocator.reserveConstraintData(paddedBytes);
+
+	//A null pointer and an all-ones pointer are the allocator's two failure values; each has its own warning.
+	if(0==reserved)
+	{
+		PX_WARN_ONCE(
+			"Reached limit set by PxSceneDesc::maxNbContactDataBlocks - ran out of buffer space for constraint prep. "
+			"Either accept dropped contacts or increase buffer size allocated for narrow phase by increasing PxSceneDesc::maxNbContactDataBlocks.");
+		return SolverConstraintPrepState::eOUT_OF_MEMORY;
+	}
+
+	if((reinterpret_cast<PxU8*>(-1))==reserved)
+	{
+		PX_WARN_ONCE(
+			"Attempting to allocate more than 16K of contact data for a single contact pair in constraint prep. "
+			"Either accept dropped contacts or simplify collision geometry.");
+		return SolverConstraintPrepState::eOUT_OF_MEMORY;
+	}
+
+	//Hand the reserved block back to the caller; it is expected to be 16-byte aligned.
+	solverConstraint = reserved;
+	PX_ASSERT(0==(uintptr_t(solverConstraint) & 0x0f));
+
+	return SolverConstraintPrepState::eSUCCESS;
+}
+
+//Batch-of-four contact prep with one-directional friction.
+//Forwards every argument unchanged to createFinalizeSolverContacts4Coulomb and selects
+//PxFrictionType::eONE_DIRECTIONAL as the friction model; the result is returned as is.
+SolverConstraintPrepState::Enum createFinalizeSolverContacts4Coulomb1D(
+	PxsContactManagerOutput** outputs,
+	ThreadContext& threadContext,
+	PxSolverContactDesc* blockDescs,
+	const PxReal invDtF32,
+	PxReal bounceThresholdF32,
+	PxReal frictionOffsetThreshold,
+	PxReal correlationDistance,
+	PxConstraintAllocator& constraintAllocator)
+{
+	const PxFrictionType::Enum frictionModel = PxFrictionType::eONE_DIRECTIONAL;
+
+	return createFinalizeSolverContacts4Coulomb(
+		outputs, threadContext, blockDescs,
+		invDtF32, bounceThresholdF32, frictionOffsetThreshold, correlationDistance,
+		constraintAllocator, frictionModel);
+}
+
+//Batch-of-four contact prep with two-directional friction.
+//Forwards every argument unchanged to createFinalizeSolverContacts4Coulomb and selects
+//PxFrictionType::eTWO_DIRECTIONAL as the friction model; the result is returned as is.
+SolverConstraintPrepState::Enum createFinalizeSolverContacts4Coulomb2D(
+	PxsContactManagerOutput** outputs,
+	ThreadContext& threadContext,
+	PxSolverContactDesc* blockDescs,
+	const PxReal invDtF32,
+	PxReal bounceThresholdF32,
+	PxReal frictionOffsetThreshold,
+	PxReal correlationDistance,
+	PxConstraintAllocator& constraintAllocator)
+{
+	const PxFrictionType::Enum frictionModel = PxFrictionType::eTWO_DIRECTIONAL;
+
+	return createFinalizeSolverContacts4Coulomb(
+		outputs, threadContext, blockDescs,
+		invDtF32, bounceThresholdF32, frictionOffsetThreshold, correlationDistance,
+		constraintAllocator, frictionModel);
+}
+
+
+//Builds the combined Coulomb-friction solver data for a batch of four contact pairs.
+//
+//Steps:
+// 1. For each pair: extract its contacts into the thread's contact buffer, build contact patches and
+//    correlate them into friction patches, and combine the mass/inertia scales.
+// 2. Reserve one block of constraint memory for all four pairs (reserveBlockStreamsCoulomb4).
+// 3. Point all four constraint descriptors at that block and set up their force write-back.
+// 4. Fill the block (setupFinalizeSolverConstraintsCoulomb4) and write two PxU32 words directly after it.
+//
+//outputs                 the four contact manager outputs (asserted to have at least one contact each)
+//threadContext           supplies the contact buffer and correlation buffer used as scratch space
+//blockDescs              the four pair descriptors; updated in place
+//invDtF32                forwarded to setupFinalizeSolverConstraintsCoulomb4
+//bounceThresholdF32      forwarded to setupFinalizeSolverConstraintsCoulomb4
+//frictionOffsetThreshold unused
+//correlationDistance     unused
+//constraintAllocator     source of the constraint memory
+//frictionType            eONE_DIRECTIONAL gives 1 friction constraint per contact point, anything else gives 2
+//
+//Returns eUNBATCHABLE if the four pairs together exceed 64 contacts, if a pair yields no contacts after
+//extraction, or if patch correlation overflows; otherwise the state reported by reserveBlockStreamsCoulomb4
+//if that is not eSUCCESS; otherwise eSUCCESS.
+SolverConstraintPrepState::Enum createFinalizeSolverContacts4Coulomb(
+								PxsContactManagerOutput** outputs,
+								 ThreadContext& threadContext,
+								 PxSolverContactDesc* blockDescs,
+								 const PxReal invDtF32,
+								 PxReal bounceThresholdF32,
+								 PxReal frictionOffsetThreshold,
+								 PxReal correlationDistance,
+								 PxConstraintAllocator& constraintAllocator,
+								 PxFrictionType::Enum frictionType)
+{
+	PX_UNUSED(correlationDistance);
+	PX_UNUSED(frictionOffsetThreshold);
+
+	//Start with zero-length constraints so that every early return below leaves all four descriptors empty.
+	for(PxU32 pair = 0; pair < 4; ++pair)
+		blockDescs[pair].desc->constraintLengthOver16 = 0;
+
+	PX_ASSERT(outputs[0]->nbContacts && outputs[1]->nbContacts && outputs[2]->nbContacts && outputs[3]->nbContacts);
+
+	//Reset the scratch buffers that the four pairs are about to share.
+	CorrelationBuffer& c = threadContext.mCorrelationBuffer;
+	c.frictionPatchCount = 0;
+	c.contactPatchCount = 0;
+
+	Gu::ContactBuffer& buffer = threadContext.mContactBuffer;
+	buffer.count = 0;
+
+	PxU32 numFrictionPerPoint = PxU32(frictionType == PxFrictionType::eONE_DIRECTIONAL ? 1 : 2);
+
+	//One lane per pair; aligned so they can be loaded straight into Vec4V registers further down.
+	PX_ALIGN(16, PxReal invMassScale0[4]);
+	PX_ALIGN(16, PxReal invMassScale1[4]);
+	PX_ALIGN(16, PxReal invInertiaScale0[4]);
+	PX_ALIGN(16, PxReal invInertiaScale1[4]);
+
+	PxU32 totalContacts = 0;
+
+	for(PxU32 pair = 0; pair < 4; ++pair)
+	{
+		PxSolverContactDesc& pairDesc = blockDescs[pair];
+		PxSolverConstraintDesc& constraintDesc = *pairDesc.desc;
+
+		//This pair's contacts are appended after those of the previous pairs.
+		pairDesc.contacts = &buffer.contacts[totalContacts];
+
+		Ps::prefetchLine(constraintDesc.bodyA);
+		Ps::prefetchLine(constraintDesc.bodyB);
+
+		//The batch as a whole may not hold more than 64 contacts.
+		if((totalContacts + outputs[pair]->nbContacts) > 64)
+			return SolverConstraintPrepState::eUNBATCHABLE;
+
+		const PxReal defaultMaxImpulse = PxMin(pairDesc.data0->maxContactImpulse, pairDesc.data1->maxContactImpulse);
+
+		bool hasMaxImpulse, hasTargetVelocity;
+		PxU32 contactCount = extractContacts(buffer, *outputs[pair], hasMaxImpulse, hasTargetVelocity,
+			invMassScale0[pair], invMassScale1[pair], invInertiaScale0[pair], invInertiaScale1[pair], defaultMaxImpulse);
+
+		//A pair that ends up without contacts cannot be part of a batch.
+		if(contactCount == 0)
+			return SolverConstraintPrepState::eUNBATCHABLE;
+
+		totalContacts += contactCount;
+
+		pairDesc.numContacts = contactCount;
+		pairDesc.hasMaxImpulse = hasMaxImpulse;
+
+		//Remember where this pair's patches begin in the shared correlation buffer.
+		pairDesc.startFrictionPatchIndex = c.frictionPatchCount;
+		pairDesc.startContactPatchIndex = c.contactPatchCount;
+
+		createContactPatches(c, pairDesc.contacts, contactCount, PXC_SAME_NORMAL);
+
+		const bool overflow = correlatePatches(c, pairDesc.contacts, pairDesc.bodyFrame0, pairDesc.bodyFrame1, PXC_SAME_NORMAL,
+			pairDesc.startContactPatchIndex, pairDesc.startFrictionPatchIndex);
+		if(overflow)
+			return SolverConstraintPrepState::eUNBATCHABLE;
+
+		pairDesc.numContactPatches = PxU16(c.contactPatchCount - pairDesc.startContactPatchIndex);
+		pairDesc.numFrictionPatches = c.frictionPatchCount - pairDesc.startFrictionPatchIndex;
+
+		//Fold the descriptor's own scales into the ones extracted together with the contacts.
+		invMassScale0[pair] *= pairDesc.mInvMassScales.linear0;
+		invMassScale1[pair] *= pairDesc.mInvMassScales.linear1;
+		invInertiaScale0[pair] *= pairDesc.mInvMassScales.angular0;
+		invInertiaScale1[pair] *= pairDesc.mInvMassScales.angular1;
+	}
+
+	//All pairs are prepared: size and reserve the shared constraint block.
+
+	PxU8* solverConstraint = NULL;
+	PxU32 solverConstraintByteSize = 0;
+	PxU32 numContactPoints4 = 0;
+	PxU32 axisConstraintCount[4];
+
+	const SolverConstraintPrepState::Enum reserveState = reserveBlockStreamsCoulomb4(blockDescs, threadContext, c,
+												solverConstraint, numFrictionPerPoint,
+												solverConstraintByteSize,
+												axisConstraintCount, numContactPoints4, constraintAllocator);
+
+	if(reserveState != SolverConstraintPrepState::eSUCCESS)
+		return reserveState;
+
+	//The memory is there; every descriptor references the same block and gets its own write-back target.
+
+	for(PxU32 pair = 0; pair < 4; ++pair)
+	{
+		PxSolverConstraintDesc& constraintDesc = *blockDescs[pair].desc;
+
+		constraintDesc.constraint = solverConstraint;
+
+		//KS - TODO - add back in counters for axisConstraintCount somewhere...
+		blockDescs[pair].axisConstraintCount += Ps::to16(axisConstraintCount[pair]);
+
+		constraintDesc.constraintLengthOver16 = Ps::to16(solverConstraintByteSize/16);
+
+		//One PxReal of write-back space per reported contact.
+		const PxU32 writeBackLength = outputs[pair]->nbContacts * sizeof(PxReal);
+		constraintDesc.writeBack = outputs[pair]->contactForces;
+		setWritebackLength(constraintDesc, writeBackLength);
+	}
+
+	const Vec4V iMassScale0 = V4LoadA(invMassScale0);
+	const Vec4V iMassScale1 = V4LoadA(invMassScale1);
+	const Vec4V iInertiaScale0 = V4LoadA(invInertiaScale0);
+	const Vec4V iInertiaScale1 = V4LoadA(invInertiaScale1);
+
+	const bool hasFriction = setupFinalizeSolverConstraintsCoulomb4(blockDescs, solverConstraint,
+											invDtF32, bounceThresholdF32, c, numFrictionPerPoint, numContactPoints4, solverConstraintByteSize,
+											iMassScale0, iInertiaScale0, iMassScale1, iInertiaScale1);
+
+	//Two words follow the constraint data: a zero, then all ones if any friction was generated (zero otherwise).
+	//They live in the 16 extra bytes that reserveBlockStreamsCoulomb4 adds to the reservation.
+	PxU32* trailer = reinterpret_cast<PxU32*>(solverConstraint + solverConstraintByteSize);
+	trailer[0] = 0;
+	trailer[1] = hasFriction ? 0xFFFFFFFF : 0;
+
+	return SolverConstraintPrepState::eSUCCESS;
+}
+
+}
+
+}
+
diff --git a/PhysX_3.4/Source/LowLevelDynamics/src/DyContactPrepPF.cpp b/PhysX_3.4/Source/LowLevelDynamics/src/DyContactPrepPF.cpp
new file mode 100644
index 00000000..4651605b
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelDynamics/src/DyContactPrepPF.cpp
@@ -0,0 +1,650 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+     
+
+#include "foundation/PxPreprocessor.h"
+#include "PsVecMath.h"
+#include "PsMathUtils.h"
+#include "DySolverContact.h"
+#include "DySolverContactPF.h"
+#include "DySolverConstraintTypes.h"
+#include "PxcNpWorkUnit.h"
+#include "DyThreadContext.h"
+#include "DyContactPrep.h"
+#include "PxcNpContactPrepShared.h"
+//#include "PxvGeometry.h"
+#include "PxvDynamics.h"
+#include "DyCorrelationBuffer.h"
+#include "DySolverConstraintDesc.h"
+#include "DySolverBody.h"
+#include "DySolverContact4.h"
+#include "DySolverContactPF4.h"
+
+
+#include "PsVecMath.h"
+#include "PxContactModifyCallback.h"
+#include "PxsMaterialManager.h"
+#include "PxsMaterialCombiner.h"
+#include "DySolverExt.h"
+#include "DyArticulationContactPrep.h"
+#include "DyContactPrepShared.h"
+
+#include "PsFoundation.h"
+
+using namespace physx::Gu;
+using namespace physx::shdfnd::aos;
+
+namespace physx
+{
+namespace Dy
+{
+
+bool createFinalizeSolverContactsCoulomb(PxSolverContactDesc& contactDesc,
+		PxsContactManagerOutput& output,
+		ThreadContext& threadContext,
+		const PxReal invDtF32,
+		PxReal bounceThresholdF32,
+		PxReal frictionOffsetThreshold,
+		PxReal correlationDistance,
+		PxConstraintAllocator& constraintAllocator,
+		PxFrictionType::Enum frictionType);
+
+//Writes the Coulomb-friction solver data for a single contact pair into a pre-reserved workspace.
+//
+//The data is produced in two passes over the friction patches in c:
+// - Pass 1 writes, for every patch that has contacts, a SolverContactCoulombHeader followed by one
+//   SolverContactPoint per contact.
+// - Pass 2 appends, behind everything written by pass 1, one SolverFrictionHeader per patch, the patch's
+//   applied-force buffer and, unless the material disables friction, frictionPerPointCount
+//   SolverContactFriction rows per contact. It also stores in each contact header the byte offset
+//   from that header to its friction header.
+//A single zero byte is written behind the last element.
+//
+//shapeInteraction      stored in every contact header
+//buffer                the contact points referenced by the patches in c
+//c                     friction/contact patch correlation data for this pair
+//bodyFrame0/1          body transforms; only the positions (.p) are used
+//workspace             destination memory; the caller must have reserved enough space
+//data0/data1           solver body data (velocities, inverse mass, penBiasClamp, sqrtInvInertia)
+//invDtF32              inverse time step
+//bounceThresholdF32    forwarded to constructContactConstraint
+//frictionPerPointCount number of friction rows written per contact point
+//hasForceThresholds    sets SolverContactHeader::eHAS_FORCE_THRESHOLDS in the header flags
+//staticBody            selects the static or the rigid-body header types
+//invMassScale0/1, invInertiaScale0/1  scales applied to the bodies' inverse mass / angular response
+//restDist              forwarded to constructContactConstraint
+//maxCCDSeparation      forwarded to constructContactConstraint
+//
+//Returns true if at least one friction row was written.
+static bool setupFinalizeSolverConstraintsCoulomb(
+												  Sc::ShapeInteraction* shapeInteraction,
+						    const ContactBuffer& buffer,
+							const CorrelationBuffer& c,
+							const PxTransform& bodyFrame0,
+							const PxTransform& bodyFrame1,
+							PxU8* workspace,
+							const PxSolverBodyData& data0,
+							const PxSolverBodyData& data1,
+							const PxReal invDtF32,
+							PxReal bounceThresholdF32,
+							PxU32 frictionPerPointCount,
+							const bool hasForceThresholds,
+							const bool staticBody,
+							PxReal invMassScale0, PxReal invInertiaScale0, 
+							PxReal invMassScale1, PxReal invInertiaScale1,
+							PxReal restDist,
+							const PxReal maxCCDSeparation)
+{   
+	const FloatV ccdMaxSeparation = FLoad(maxCCDSeparation);
+	PxU8* PX_RESTRICT ptr = workspace;
+	const FloatV zero=FZero();
+
+	PxU8 flags = PxU8(hasForceThresholds ? SolverContactHeader::eHAS_FORCE_THRESHOLDS : 0);
+
+	const FloatV restDistance = FLoad(restDist);
+
+	const Vec3V bodyFrame0p = V3LoadU(bodyFrame0.p);
+	const Vec3V bodyFrame1p = V3LoadU(bodyFrame1.p);
+
+	Ps::prefetchLine(c.contactID);
+	Ps::prefetchLine(c.contactID, 128);
+	
+	const PxU32 frictionPatchCount = c.frictionPatchCount;
+
+	const PxU32 pointStride = sizeof(SolverContactPoint);
+	const PxU32 frictionStride = sizeof(SolverContactFriction);
+	const PxU8 pointHeaderType = Ps::to8(staticBody ? DY_SC_TYPE_STATIC_CONTACT : DY_SC_TYPE_RB_CONTACT);
+	const PxU8 frictionHeaderType = Ps::to8(staticBody ? DY_SC_TYPE_STATIC_FRICTION : DY_SC_TYPE_FRICTION);
+
+
+	const Vec3V linVel0 = V3LoadU(data0.linearVelocity);
+	const Vec3V linVel1 = V3LoadU(data1.linearVelocity);
+	const Vec3V angVel0 = V3LoadU(data0.angularVelocity);
+	const Vec3V angVel1 = V3LoadU(data1.angularVelocity);
+
+
+	const FloatV invMass0 = FLoad(data0.invMass);
+	const FloatV invMass1 = FLoad(data1.invMass);
+
+	const FloatV maxPenBias = FMax(FLoad(data0.penBiasClamp), FLoad(data1.penBiasClamp));
+
+	// PT: the matrix is symmetric so we can read it as a PxMat33! Gets rid of 25000+ LHS.
+	const PxMat33& invIn0 = reinterpret_cast<const PxMat33&>(data0.sqrtInvInertia);
+	PX_ALIGN(16, const Mat33V invSqrtInertia0)
+	(
+		V3LoadU(invIn0.column0),
+		V3LoadU(invIn0.column1),
+		V3LoadU(invIn0.column2)
+	);
+	const PxMat33& invIn1 = reinterpret_cast<const PxMat33&>(data1.sqrtInvInertia);
+	PX_ALIGN(16, const Mat33V invSqrtInertia1)
+	(
+		V3LoadU(invIn1.column0),
+		V3LoadU(invIn1.column1),
+		V3LoadU(invIn1.column2)
+	);
+
+	const FloatV invDt = FLoad(invDtF32);
+	const FloatV p8 = FLoad(0.8f);
+	const FloatV bounceThreshold = FLoad(bounceThresholdF32);
+	//Threshold on |normal.x| used to pick the fallback tangent (0.70710678 ~ 1/sqrt(2)).
+	const FloatV orthoThreshold = FLoad(0.70710678f);
+	//Minimum squared tangential speed for the relative velocity to define the friction direction.
+	const FloatV eps = FLoad(0.00001f);
+
+	const FloatV invDtp8 = FMul(invDt, p8);
+
+	const FloatV d0 = FLoad(invMassScale0);
+	const FloatV d1 = FLoad(invMassScale1);
+	const FloatV nDom1fV = FNeg(d1);
+	const FloatV angD0 = FLoad(invInertiaScale0);
+	const FloatV angD1 = FLoad(invInertiaScale1);
+
+	//Note the sign: the body-1 term is kept negated (-invMassScale1 * invMass1) and is negated
+	//again wherever the positive value is stored or summed below.
+	const FloatV invMass0_dom0fV = FMul(d0, invMass0);
+	const FloatV invMass1_dom1fV = FMul(nDom1fV, invMass1);
+
+
+	//Pass 1: contact headers and contact points.
+	for(PxU32 i=0;i< frictionPatchCount;i++)
+	{
+		const PxU32 contactCount = c.frictionPatchContactCounts[i];
+		if(contactCount == 0)
+			continue;
+
+		//The first contact of the patch supplies the normal and restitution for the whole patch.
+		const Gu::ContactPoint* contactBase0 = buffer.contacts + c.contactPatches[c.correlationListHeads[i]].start;
+
+		const Vec3V normal = Ps::aos::V3LoadA(contactBase0->normal);
+
+		const FloatV normalLenSq = V3LengthSq(normal);
+		const VecCrossV norCross = V3PrepareCross(normal);
+
+		const FloatV restitution = FLoad(contactBase0->restitution);
+
+		//Relative linear velocity along the normal: dot(normal, linVel0) - dot(normal, linVel1).
+		const FloatV norVel = V3SumElems(V3NegMulSub(normal, linVel1, V3Mul(normal, linVel0)));
+		/*const FloatV norVel0 = V3Dot(normal, linVel0);
+		const FloatV norVel1 = V3Dot(normal, linVel1);
+		const FloatV norVel = FSub(norVel0, norVel1);*/
+
+		const FloatV invMassNorLenSq0 = FMul(invMass0_dom0fV, normalLenSq);
+		const FloatV invMassNorLenSq1 = FMul(invMass1_dom1fV, normalLenSq);
+	
+		
+		SolverContactCoulombHeader* PX_RESTRICT header = reinterpret_cast<SolverContactCoulombHeader*>(ptr);
+		ptr += sizeof(SolverContactCoulombHeader);
+
+		Ps::prefetchLine(ptr, 128);
+		Ps::prefetchLine(ptr, 256);
+		Ps::prefetchLine(ptr, 384);
+
+
+		header->numNormalConstr		= PxU8(contactCount);
+		header->type				= pointHeaderType;
+		//header->setRestitution(n.restitution);
+		//header->setRestitution(contactBase0->restitution);
+		
+		header->setDominance0(invMass0_dom0fV);
+		header->setDominance1(FNeg(invMass1_dom1fV));
+		FStore(angD0, &header->angDom0);
+		FStore(angD1, &header->angDom1);
+		header->setNormal(normal);
+		header->flags = flags;
+		header->shapeInteraction = shapeInteraction;
+
+		
+		//Walk the linked list of contact patches that were correlated into this friction patch.
+		for(PxU32 patch=c.correlationListHeads[i]; 
+			patch!=CorrelationBuffer::LIST_END; 
+			patch = c.contactPatches[patch].next)
+		{
+			const PxU32 count = c.contactPatches[patch].count;
+			const Gu::ContactPoint* contactBase = buffer.contacts + c.contactPatches[patch].start;
+
+				
+			PxU8* p = ptr;
+			for(PxU32 j=0;j<count;j++)
+			{
+				const Gu::ContactPoint& contact = contactBase[j];
+
+				SolverContactPoint* PX_RESTRICT solverContact = reinterpret_cast<SolverContactPoint*>(p);
+				p += pointStride;
+
+				constructContactConstraint(invSqrtInertia0, invSqrtInertia1, invMassNorLenSq0, 
+					invMassNorLenSq1, angD0, angD1, bodyFrame0p, bodyFrame1p,
+					normal, norVel, norCross, angVel0, angVel1,
+					invDt, invDtp8, restDistance, maxPenBias,  restitution,
+					bounceThreshold, contact, *solverContact, ccdMaxSeparation);
+			}			
+			ptr = p;
+		}
+	}
+
+	//Pass 2: construct all the frictions.
+	//ptr keeps advancing behind the contact data written above; ptr2 re-walks the contact headers from
+	//the start of the workspace so that each one can be given the offset to its friction header.
+
+	PxU8* PX_RESTRICT ptr2 = workspace;
+
+	bool hasFriction = false;
+	for(PxU32 i=0;i< frictionPatchCount;i++)
+	{
+		const PxU32 contactCount = c.frictionPatchContactCounts[i];
+		if(contactCount == 0)
+			continue;
+
+		const Gu::ContactPoint* contactBase0 = buffer.contacts + c.contactPatches[c.correlationListHeads[i]].start;
+
+		SolverContactCoulombHeader* header = reinterpret_cast<SolverContactCoulombHeader*>(ptr2); 
+		header->frictionOffset = PxU16(ptr - ptr2);// + sizeof(SolverFrictionHeader);
+		ptr2 += sizeof(SolverContactCoulombHeader) + header->numNormalConstr * pointStride;
+
+		const PxReal staticFriction = contactBase0->staticFriction;
+		const bool disableStrongFriction = !!(contactBase0->materialFlags & PxMaterialFlag::eDISABLE_FRICTION);
+		const bool haveFriction = (disableStrongFriction == 0);
+	
+		//The friction header and the applied-force buffer are emitted even when friction is disabled.
+		//NOTE(review): type, static friction and the dominance fields of the friction header are only
+		//written further down when friction is enabled - confirm that nothing reads them otherwise.
+		SolverFrictionHeader* frictionHeader = reinterpret_cast<SolverFrictionHeader*>(ptr);
+		frictionHeader->numNormalConstr = Ps::to8(c.frictionPatchContactCounts[i]);
+		frictionHeader->numFrictionConstr = Ps::to8(haveFriction ? c.frictionPatchContactCounts[i] * frictionPerPointCount : 0);
+		ptr += sizeof(SolverFrictionHeader);
+
+		//Clear exactly the span reserved for the applied-force buffer, i.e. the bytes that ptr skips.
+		//The length used to be sizeof(PxF32)*contactCount*frictionPerPointCount, which is not tied to the
+		//reserved span and could therefore run past it (with friction disabled nothing is written
+		//behind the buffer for this patch that would absorb the excess).
+		PxU8* appliedForceBuffer = ptr;
+		ptr += frictionHeader->getAppliedForcePaddingSize(c.frictionPatchContactCounts[i]);
+		PxMemZero(appliedForceBuffer, PxU32(ptr - appliedForceBuffer));
+		Ps::prefetchLine(ptr, 128);
+		Ps::prefetchLine(ptr, 256);
+		Ps::prefetchLine(ptr, 384);
+
+		const Vec3V normal = V3LoadU(buffer.contacts[c.contactPatches[c.correlationListHeads[i]].start].normal);
+
+		const FloatV normalX = V3GetX(normal);
+		const FloatV normalY = V3GetY(normal);
+		const FloatV normalZ = V3GetZ(normal);
+		
+		//Fallback tangent: (0, -nz, ny) when |nx| is below the threshold, (-ny, nx, 0) otherwise.
+		const Vec3V t0Fallback1 = V3Merge(zero, FNeg(normalZ), normalY);
+		const Vec3V t0Fallback2 = V3Merge(FNeg(normalY), normalX, zero) ;
+
+		const BoolV con = FIsGrtr(orthoThreshold, FAbs(normalX));
+		const Vec3V tFallback1 = V3Sel(con, t0Fallback1, t0Fallback2);
+
+		//Preferred tangent: the relative linear velocity with its normal component removed,
+		//used only if its squared length exceeds eps.
+		const Vec3V linVrel = V3Sub(linVel0, linVel1);
+		const Vec3V t0_ = V3Sub(linVrel, V3Scale(normal, V3Dot(normal, linVrel)));
+		const FloatV sqDist = V3Dot(t0_,t0_);
+		const BoolV con1 = FIsGrtr(sqDist, eps);
+		const Vec3V tDir0 =V3Normalize(V3Sel(con1, t0_, tFallback1));
+		const Vec3V tDir1 = V3Cross(tDir0, normal);
+
+		//The two directions are swapped after every friction row, and the swap carries over from one
+		//contact to the next within the patch.
+		Vec3V tFallback = tDir0;
+		Vec3V tFallbackAlt = tDir1;
+
+		if(haveFriction)
+		{
+			//frictionHeader->setStaticFriction(n.staticFriction);
+			frictionHeader->setStaticFriction(staticFriction);
+			FStore(invMass0_dom0fV, &frictionHeader->invMass0D0);
+			FStore(FNeg(invMass1_dom1fV), &frictionHeader->invMass1D1);
+			FStore(angD0, &frictionHeader->angDom0);
+			FStore(angD1, &frictionHeader->angDom1);
+			frictionHeader->type			= frictionHeaderType;
+			
+			for(PxU32 patch=c.correlationListHeads[i]; 
+				patch!=CorrelationBuffer::LIST_END; 
+				patch = c.contactPatches[patch].next)
+			{
+				const PxU32 count = c.contactPatches[patch].count;
+				const PxU32 start = c.contactPatches[patch].start;
+				const Gu::ContactPoint* contactBase = buffer.contacts + start;
+					
+				PxU8* p = ptr;
+				for(PxU32 j =0; j < count; j++)
+				{
+					hasFriction = true;
+					const Gu::ContactPoint& contact = contactBase[j];
+					const Vec3V point = V3LoadU(contact.point);
+					//Contact position relative to each body's frame position.
+					const Vec3V ra = V3Sub(point, bodyFrame0p);
+					const Vec3V rb = V3Sub(point, bodyFrame1p);
+					const Vec3V targetVel = V3LoadU(contact.targetVel);
+
+					for(PxU32 k = 0; k < frictionPerPointCount; ++k)
+					{
+						//Take the current direction and swap the pair for the next row.
+						const Vec3V t0 = tFallback;
+						tFallback = tFallbackAlt;
+						tFallbackAlt = t0;
+
+						SolverContactFriction* PX_RESTRICT f0 = reinterpret_cast<SolverContactFriction*>(p);
+						p += frictionStride;
+						//f0->brokenOrContactIndex = contactId;
+
+						const Vec3V raXn = V3Cross(ra, t0);
+						const Vec3V rbXn = V3Cross(rb, t0);
+
+						const Vec3V delAngVel0 = M33MulV3(invSqrtInertia0, raXn);
+						const Vec3V delAngVel1 = M33MulV3(invSqrtInertia1, rbXn);
+
+						//Unit response along t0: linear plus scaled angular term of both bodies.
+						//invMass1_dom1fV is negative, hence the subtraction in resp1.
+						const FloatV resp0 = FAdd(invMass0_dom0fV, FMul(angD0, V3Dot(delAngVel0, delAngVel0)));
+						const FloatV resp1 = FSub(FMul(angD1, V3Dot(delAngVel1, delAngVel1)), invMass1_dom1fV);
+						const FloatV resp = FAdd(resp0, resp1);
+
+						//-1/resp, or 0 when the response is not positive.
+						const FloatV velMultiplier = FNeg(FSel(FIsGrtr(resp, zero), FRecip(resp), zero));
+
+						//Relative velocity of the two bodies at the contact along t0.
+						const FloatV vrel1 = FAdd(V3Dot(t0, linVel0), V3Dot(raXn, angVel0));
+						const FloatV vrel2 = FAdd(V3Dot(t0, linVel1), V3Dot(rbXn, angVel1));
+						const FloatV vrel = FSub(vrel1, vrel2);
+
+
+						f0->normalXYZ_appliedForceW = V4SetW(Vec4V_From_Vec3V(t0), zero);
+						f0->raXnXYZ_velMultiplierW = V4SetW(Vec4V_From_Vec3V(delAngVel0), velMultiplier);
+						//f0->rbXnXYZ_targetVelocityW = V4SetW(Vec4V_From_Vec3V(delAngVel1), FSub(V3Dot(targetVel, t0), vrel));
+						f0->rbXnXYZ_biasW = Vec4V_From_Vec3V(delAngVel1);
+						FStore(FSub(V3Dot(targetVel, t0), vrel), &f0->targetVel);
+					}
+				}
+
+				ptr = p;	
+			}
+		}
+	}
+	//Terminate with a zero byte.
+	*ptr = 0;
+	return hasFriction;
+}
+
+
+
+// Computes the size of the solver constraint stream and the number of axis constraints that the
+// Coulomb contact prep needs for the friction patches stored in 'c'.
+//   c                          correlation buffer; patches with a zero contact count contribute nothing
+//   frictionCountPerPoint      number of friction constraints reserved per contact point
+//   _solverConstraintByteSize  out: required bytes, rounded up to a multiple of 16 (must be 0 on entry)
+//   _axisConstraintCount       out: accumulated axis constraint count (must be 0 on entry)
+//   useExtContacts             selects the SolverContactPointExt/SolverContactFrictionExt element sizes
+static void computeBlockStreamByteSizesCoulomb(const CorrelationBuffer& c,
+													 const PxU32 frictionCountPerPoint, PxU32& _solverConstraintByteSize,
+													 PxU32& _axisConstraintCount,
+													 bool useExtContacts)
+{
+	PX_ASSERT(0 == _solverConstraintByteSize);
+	PX_ASSERT(0 == _axisConstraintCount);
+
+	// PT: use local vars to remove LHS
+	PxU32 solverConstraintByteSize = 0;
+	PxU32 axisConstraintCount = 0;
+
+	for(PxU32 i = 0; i < c.frictionPatchCount; i++)
+	{
+		const PxU32 contactCount = c.frictionPatchContactCounts[i];
+
+		//Patches without contacts take no space in the stream.
+		if(contactCount == 0)
+			continue;
+
+		const FrictionPatch& frictionPatch = c.frictionPatches[i];
+		const bool haveFriction = (frictionPatch.materialFlags & PxMaterialFlag::eDISABLE_FRICTION) == 0;
+
+		//Contact header followed by one contact point per contact.
+		solverConstraintByteSize += sizeof(SolverContactCoulombHeader);
+
+		solverConstraintByteSize += useExtContacts ? contactCount * sizeof(SolverContactPointExt)
+			: contactCount * sizeof(SolverContactPoint);
+
+		axisConstraintCount += contactCount;
+
+		//The friction header and its applied-force buffer are always reserved, even when friction is
+		//disabled for the patch, so the accumulated impulses have somewhere to be stored.
+		solverConstraintByteSize += sizeof(SolverFrictionHeader);
+		solverConstraintByteSize += SolverFrictionHeader::getAppliedForcePaddingSize(contactCount);
+
+		//Friction constraints themselves are only reserved when friction is enabled.
+		if(haveFriction)
+		{
+			const PxU32 nbFrictionConstraints = contactCount * frictionCountPerPoint;
+
+			solverConstraintByteSize += useExtContacts ? nbFrictionConstraints * sizeof(SolverContactFrictionExt)
+				: nbFrictionConstraints * sizeof(SolverContactFriction);
+
+			//NOTE(review): adds one axis per contact rather than per friction constraint; kept as found.
+			axisConstraintCount += contactCount;
+		}
+	}
+	_axisConstraintCount = axisConstraintCount;
+
+	//16-byte alignment.
+	_solverConstraintByteSize =  ((solverConstraintByteSize + 0x0f) & ~0x0f);
+	PX_ASSERT(0 == (_solverConstraintByteSize & 0x0f));
+}
+
+// Sizes and reserves the solver constraint block used by the Coulomb contact prep for one contact pair.
+// The sizes are produced by computeBlockStreamByteSizesCoulomb; the block comes from 'constraintAllocator'.
+//   c                         correlation buffer the sizes are computed from
+//   solverConstraint          out: start of the reserved block (must be NULL on entry; untouched on failure)
+//   frictionCountPerPoint     forwarded to computeBlockStreamByteSizesCoulomb
+//   solverConstraintByteSize  out: required stream size in bytes (must be 0 on entry)
+//   axisConstraintCount       out: axis constraint count (must be 0 on entry)
+//   constraintAllocator       allocator the block is reserved from
+//   useExtContacts            forwarded to computeBlockStreamByteSizesCoulomb
+// Returns true when nothing had to be reserved or the reservation succeeded, false otherwise.
+static bool reserveBlockStreamsCoulomb(const CorrelationBuffer& c,
+						PxU8*& solverConstraint, PxU32 frictionCountPerPoint,
+						PxU32& solverConstraintByteSize,
+						PxU32& axisConstraintCount, PxConstraintAllocator& constraintAllocator,
+						bool useExtContacts)
+{
+	PX_ASSERT(NULL == solverConstraint);
+	PX_ASSERT(0 == solverConstraintByteSize);
+	PX_ASSERT(0 == axisConstraintCount);
+
+	//Compute the size of the buffer.
+	computeBlockStreamByteSizesCoulomb(
+		c,
+		frictionCountPerPoint, solverConstraintByteSize,
+		axisConstraintCount, useExtContacts);
+
+	//An empty stream needs no reservation and counts as success.
+	if(0 == solverConstraintByteSize)
+		return true;
+
+	//Reserve 16 bytes more than the stream itself needs.
+	PxU8* const reservedBlock = constraintAllocator.reserveConstraintData(solverConstraintByteSize + 16u);
+
+	//A NULL result is reported as running out of the buffer space configured through
+	//PxSceneDesc::maxNbContactDataBlocks.
+	if(0 == reservedBlock)
+	{
+		PX_WARN_ONCE(
+			"Reached limit set by PxSceneDesc::maxNbContactDataBlocks - ran out of buffer space for constraint prep. "
+			"Either accept dropped contacts or increase buffer size allocated for narrow phase by increasing PxSceneDesc::maxNbContactDataBlocks.");
+		return false;
+	}
+
+	//An all-ones pointer is reported as a single request that was too large; like the NULL case
+	//it is treated as a failed reservation.
+	if(reinterpret_cast<PxU8*>(-1) == reservedBlock)
+	{
+		PX_WARN_ONCE(
+			"Attempting to allocate more than 16K of contact data for a single contact pair in constraint prep. "
+			"Either accept dropped contacts or simplify collision geometry.");
+		return false;
+	}
+
+	//Hand the block to the caller; the assert below checks that the block returned by the
+	//allocator is 16-byte aligned.
+	solverConstraint = reservedBlock;
+	PX_ASSERT(0==(uintptr_t(solverConstraint) & 0x0f));
+
+	return true;
+}
+
+// One-directional Coulomb friction entry point: forwards every argument unchanged to
+// createFinalizeSolverContactsCoulomb and selects PxFrictionType::eONE_DIRECTIONAL.
+bool createFinalizeSolverContactsCoulomb1D(PxSolverContactDesc& contactDesc, PxsContactManagerOutput& output,
+	ThreadContext& threadContext, const PxReal invDtF32, PxReal bounceThresholdF32,
+	PxReal frictionOffsetThreshold, PxReal correlationDistance,
+	PxConstraintAllocator& constraintAllocator)
+{
+	const PxFrictionType::Enum frictionModel = PxFrictionType::eONE_DIRECTIONAL;
+	return createFinalizeSolverContactsCoulomb(contactDesc, output, threadContext, invDtF32, bounceThresholdF32,
+		frictionOffsetThreshold, correlationDistance, constraintAllocator, frictionModel);
+}
+
+// Two-directional Coulomb friction entry point: forwards every argument unchanged to
+// createFinalizeSolverContactsCoulomb and selects PxFrictionType::eTWO_DIRECTIONAL as the
+// friction type.
+bool createFinalizeSolverContactsCoulomb2D(PxSolverContactDesc& contactDesc, PxsContactManagerOutput& output,
+	ThreadContext& threadContext, const PxReal invDtF32, PxReal bounceThresholdF32,
+	PxReal frictionOffsetThreshold, PxReal correlationDistance,
+	PxConstraintAllocator& constraintAllocator)
+{
+	const PxFrictionType::Enum frictionModel = PxFrictionType::eTWO_DIRECTIONAL;
+	return createFinalizeSolverContactsCoulomb(contactDesc, output, threadContext, invDtF32, bounceThresholdF32,
+		frictionOffsetThreshold, correlationDistance, constraintAllocator, frictionModel);
+}
+// Builds the Coulomb solver constraints for one contact pair: extracts contacts, groups them into patches, reserves the constraint stream and fills it. Returns true when there are no contacts or the reservation succeeded, false if it failed.
+bool createFinalizeSolverContactsCoulomb(PxSolverContactDesc& contactDesc,
+									PxsContactManagerOutput& output,
+								 ThreadContext& threadContext,
+								 const PxReal invDtF32,
+								 PxReal bounceThresholdF32,
+								 PxReal frictionOffsetThreshold,
+								 PxReal correlationDistance,
+								 PxConstraintAllocator& constraintAllocator,
+								 PxFrictionType::Enum frictionType)
+{
+	PX_UNUSED(frictionOffsetThreshold);
+	PX_UNUSED(correlationDistance);
+
+	PxSolverConstraintDesc& desc = *contactDesc.desc;
+
+	desc.constraintLengthOver16 = 0;
+	// Work in the thread context's contact buffer, starting from an empty buffer.
+	ContactBuffer& buffer = threadContext.mContactBuffer;
+
+	buffer.count = 0;
+
+	// We pull the friction patches out of the cache to remove the dependency on how
+	// the cache is organized. Remember original addrs so we can write them back 
+	// efficiently.
+
+	Ps::prefetchLine(contactDesc.frictionPtr);
+	// Scale factors default to 1; extractContacts() overwrites them unless the stream has forceNoResponse set.
+	PxReal invMassScale0 = 1.f;
+	PxReal invMassScale1 = 1.f;
+	PxReal invInertiaScale0 = 1.f;
+	PxReal invInertiaScale1 = 1.f;
+
+	bool hasMaxImpulse = false, hasTargetVelocity = false;
+	// The default max impulse is the smaller of the two bodies' maxContactImpulse; contacts with zero max impulse are not extracted.
+	PxU32 numContacts = extractContacts(buffer, output, hasMaxImpulse, hasTargetVelocity, invMassScale0, invMassScale1, 
+			invInertiaScale0, invInertiaScale1, PxMin(contactDesc.data0->maxContactImpulse, contactDesc.data1->maxContactImpulse));
+	// No contacts extracted: clear the friction data and report success without reserving anything.
+	if(numContacts == 0)
+	{
+		contactDesc.frictionPtr = NULL;
+		contactDesc.frictionCount = 0;
+		return true;
+	}
+
+	Ps::prefetchLine(contactDesc.body0);
+	Ps::prefetchLine(contactDesc.body1);
+	Ps::prefetchLine(contactDesc.data0);
+	Ps::prefetchLine(contactDesc.data1);
+	// Reset the thread context's correlation buffer before building contact patches into it.
+	CorrelationBuffer& c = threadContext.mCorrelationBuffer;
+	c.frictionPatchCount = 0;
+	c.contactPatchCount = 0;
+
+	createContactPatches(c, buffer.contacts, buffer.count, PXC_SAME_NORMAL);	
+	// Friction constraints per contact point: 1 for eONE_DIRECTIONAL, otherwise 2.
+	PxU32 numFrictionPerPatch = PxU32(frictionType == PxFrictionType::eONE_DIRECTIONAL ? 1 : 2);
+	// The overflow result of correlatePatches() is only surfaced (as a warning) in PX_CHECKED builds.
+	bool overflow = correlatePatches(c, buffer.contacts, contactDesc.bodyFrame0, contactDesc.bodyFrame1, PXC_SAME_NORMAL, 0, 0);
+	PX_UNUSED(overflow);
+#if PX_CHECKED
+	if(overflow)
+	{
+		Ps::getFoundation().error(physx::PxErrorCode::eDEBUG_WARNING, __FILE__, __LINE__, 
+					"Dropping contacts in solver because we exceeded limit of 32 friction patches.");
+	}
+#endif
+
+
+	//PX_ASSERT(patchCount == c.frictionPatchCount);
+
+	PxU8* solverConstraint = NULL;
+	PxU32 solverConstraintByteSize = 0;
+	PxU32 axisConstraintCount = 0;
+	// The extended constraint path is used when either body is flagged eARTICULATION.
+	bool useExtContacts = !!((contactDesc.bodyState0 | contactDesc.bodyState1) & PxSolverContactDesc::eARTICULATION);
+
+	const bool successfulReserve = reserveBlockStreamsCoulomb(
+		c,
+		solverConstraint, numFrictionPerPatch,
+		solverConstraintByteSize,
+		axisConstraintCount,
+		constraintAllocator,
+		useExtContacts);
+
+	// initialise the work unit's ptrs to the various buffers.
+
+	contactDesc.frictionPtr = NULL;
+	desc.constraint = NULL;
+	desc.constraintLengthOver16 = 0;
+	contactDesc.frictionCount = 0;
+	
+	// patch up the work unit with the reserved buffers and set the reserved buffer data as appropriate.
+
+	if(successfulReserve)
+	{
+		desc.constraint = solverConstraint;
+		output.nbContacts = Ps::to8(numContacts);
+		desc.constraintLengthOver16 = Ps::to16(solverConstraintByteSize/16);
+
+		//Initialise solverConstraint buffer.
+		if(solverConstraint)
+		{
+			bool hasFriction = false;
+			if(useExtContacts)
+			{
+				const PxSolverBodyData& data0 = *contactDesc.data0;
+				const PxSolverBodyData& data1 = *contactDesc.data1;
+
+				const SolverExtBody b0(reinterpret_cast<const void*>(contactDesc.body0), reinterpret_cast<const void*>(&data0), desc.linkIndexA);
+				const SolverExtBody b1(reinterpret_cast<const void*>(contactDesc.body1), reinterpret_cast<const void*>(&data1), desc.linkIndexB);
+
+				hasFriction = setupFinalizeExtSolverContactsCoulomb(buffer, c, contactDesc.bodyFrame0, contactDesc.bodyFrame1, solverConstraint,
+					invDtF32, bounceThresholdF32, b0, b1, numFrictionPerPatch,
+					invMassScale0, invInertiaScale0, invMassScale1, invInertiaScale1, contactDesc.restDistance, contactDesc.maxCCDSeparation);
+			}
+			else
+			{
+				const PxSolverBodyData& data0 = *contactDesc.data0;
+				const PxSolverBodyData& data1 = *contactDesc.data1;
+
+				hasFriction = setupFinalizeSolverConstraintsCoulomb(contactDesc.shapeInteraction, buffer, c, contactDesc.bodyFrame0, contactDesc.bodyFrame1, solverConstraint,
+					data0, data1, invDtF32, bounceThresholdF32, numFrictionPerPatch, contactDesc.hasForceThresholds, contactDesc.bodyState1 == PxSolverContactDesc::eSTATIC_BODY,
+					invMassScale0, invInertiaScale0, invMassScale1, invInertiaScale1, contactDesc.restDistance, contactDesc.maxCCDSeparation);
+			}
+			*(reinterpret_cast<PxU32*>(solverConstraint + solverConstraintByteSize)) = 0;	// first word past the stream, inside the 16 spare bytes reserved by reserveBlockStreamsCoulomb
+			*(reinterpret_cast<PxU32*>(solverConstraint + solverConstraintByteSize + 4)) = hasFriction ? 0xFFFFFFFF : 0;	// second word: all ones when the setup call reported friction
+		}
+	}
+
+	return successfulReserve;
+}
+
+}
+}
+
+
diff --git a/PhysX_3.4/Source/LowLevelDynamics/src/DyContactPrepShared.h b/PhysX_3.4/Source/LowLevelDynamics/src/DyContactPrepShared.h
new file mode 100644
index 00000000..7accabd3
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelDynamics/src/DyContactPrepShared.h
@@ -0,0 +1,301 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+#ifndef DY_CONTACT_PREP_SHARED_H
+#define DY_CONTACT_PREP_SHARED_H
+     
+#include "foundation/PxPreprocessor.h"
+#include "PxSceneDesc.h"
+#include "PsVecMath.h"
+#include "PsMathUtils.h"
+#include "DyContactPrep.h"
+#include "DyCorrelationBuffer.h"
+#include "DyArticulationContactPrep.h"
+#include "PxsContactManager.h"
+#include "PxsContactManagerState.h"
+
+namespace physx
+{
+namespace Dy
+{
+
+
+// True when the two anchors, both expressed in body0 space, differ by less than correlDist along 'axis'.
+PX_FORCE_INLINE bool pointsAreClose(const PxTransform& body1ToBody0, const PxVec3& localAnchor0,
+									const PxVec3& localAnchor1, const PxVec3& axis, float correlDist)
+{
+	const PxVec3 anchor1InBody0 = body1ToBody0.transform(localAnchor1);
+	const PxVec3 anchorOffset = localAnchor0 - anchor1InBody0;
+	return PxAbs(anchorOffset.dot(axis)) < correlDist;
+}
+
+// Returns true as soon as one anchor pair of 'patch' fails the pointsAreClose test along patch.body0Normal.
+PX_FORCE_INLINE bool isSeparated(const FrictionPatch& patch, const PxTransform& body1ToBody0, const PxReal correlationDistance)
+{
+	PX_ASSERT(patch.anchorCount <= 2);
+	bool separated = false;
+	// The loop stops at the first anchor pair that is no longer close, or once every anchor was checked.
+	for(PxU32 anchor = 0; !separated && anchor < patch.anchorCount; ++anchor)
+		separated = !pointsAreClose(body1ToBody0, patch.body0Anchors[anchor], patch.body1Anchors[anchor], patch.body0Normal, correlationDistance);
+	return separated;
+}
+
+
+// Seeds the correlation buffer 'c' with the cached friction patches in 'frictionCookie' that are still usable:
+// not broken, at least one anchor, strong friction not disabled, body normals still agreeing and no anchor
+// pair separated. Returns false if 'c' is already full when a patch qualifies, true otherwise.
+inline bool getFrictionPatches(CorrelationBuffer& c,
+						const PxU8* frictionCookie,
+						PxU32 frictionPatchCount,
+						const PxTransform& bodyFrame0,
+						const PxTransform& bodyFrame1,
+						PxReal correlationDistance)
+{
+	if(frictionCookie == NULL || frictionPatchCount == 0)
+		return true;
+
+	//KS - this is now DMA'd inside the shader so we don't need to immediate DMA it here
+	const FrictionPatch* patches = reinterpret_cast<const FrictionPatch*>(frictionCookie);
+
+	//The body1-to-body0 transform is only computed once a patch actually needs it.
+	bool evaluated = false;
+	PxTransform body1ToBody0;
+
+	while(frictionPatchCount--)
+	{
+		Ps::prefetchLine(patches,128);
+		const FrictionPatch& patch = *patches++;
+		PX_ASSERT (patch.broken == 0 || patch.broken == 1);
+
+		//Broken patches are never carried over.
+		if(patch.broken)
+			continue;
+
+		// if the eDISABLE_STRONG_FRICTION flag is there we need to blow away the previous frame's friction correlation, so
+		// that we can associate each friction anchor with a target velocity. So we lose strong friction.
+		if(patch.anchorCount == 0 || (patch.materialFlags & PxMaterialFlag::eDISABLE_STRONG_FRICTION) != 0)
+			continue;
+
+		PX_ASSERT(patch.anchorCount <= 2);
+		if(!evaluated)
+		{
+			body1ToBody0 = bodyFrame0.transformInv(bodyFrame1);
+			evaluated = true;
+		}
+
+		if(!(patch.body0Normal.dot(body1ToBody0.rotate(patch.body1Normal)) > PXC_SAME_NORMAL))
+			continue;
+
+		if(isSeparated(patch, body1ToBody0, correlationDistance))
+			continue;
+
+		if(c.frictionPatchCount == CorrelationBuffer::MAX_FRICTION_PATCHES)
+			return false;
+
+		c.contactID[c.frictionPatchCount][0] = 0xffff;
+		c.contactID[c.frictionPatchCount][1] = 0xffff;
+		//Rotate the contact normal into world space
+		c.frictionPatchWorldNormal[c.frictionPatchCount] = bodyFrame0.rotate(patch.body0Normal);
+		c.frictionPatchContactCounts[c.frictionPatchCount] = 0;
+		c.correlationListHeads[c.frictionPatchCount] = CorrelationBuffer::LIST_END;
+		PxMemCopy(&c.frictionPatches[c.frictionPatchCount++], &patch, sizeof(FrictionPatch));
+	}
+	return true;
+}
+
+// Appends the contacts of npOutput's contact stream to 'buffer' (dropping contacts whose max impulse is zero) and returns how many were appended.
+PX_FORCE_INLINE PxU32 extractContacts(Gu::ContactBuffer& buffer, PxsContactManagerOutput& npOutput, bool& hasMaxImpulse, bool& hasTargetVelocity,
+							 PxReal& invMassScale0, PxReal& invMassScale1, PxReal& invInertiaScale0, PxReal& invInertiaScale1, PxReal defaultMaxImpulse)
+{
+	PxContactStreamIterator iter(npOutput.contactPatches, npOutput.contactPoints, npOutput.getInternalFaceIndice(), npOutput.nbPatches, npOutput.nbContacts);
+
+	//With forceNoResponse set nothing is read: the buffer and every output parameter stay as they were.
+	if(iter.forceNoResponse)
+		return 0;
+
+	invMassScale0 = iter.getInvMassScale0();
+	invMassScale1 = iter.getInvMassScale1();
+	invInertiaScale0 = iter.getInvInertiaScale0();
+	invInertiaScale1 = iter.getInvInertiaScale1();
+	hasMaxImpulse = (iter.patch->internalFlags & PxContactPatch::eHAS_MAX_IMPULSE) != 0;
+	hasTargetVelocity = (iter.patch->internalFlags & PxContactPatch::eHAS_TARGET_VELOCITY) != 0;
+
+	const PxU32 firstSlot = buffer.count;
+	PxU32 writeIndex = firstSlot;
+	while(iter.hasNextPatch())
+	{
+		iter.nextPatch();
+		while(iter.hasNextContact())
+		{
+			iter.nextContact();
+			Ps::prefetchLine(iter.contact, 128);
+			Ps::prefetchLine(&buffer.contacts[writeIndex], 128);
+			const PxReal maxImpulse = hasMaxImpulse ? iter.getMaxImpulse() : defaultMaxImpulse;
+			if(maxImpulse == 0.f)
+				continue;
+			PX_ASSERT(writeIndex < Gu::ContactBuffer::MAX_CONTACTS);
+			Gu::ContactPoint& dst = buffer.contacts[writeIndex++];
+			dst.normal = iter.getContactNormal();
+			dst.point = iter.getContactPoint();
+			dst.separation = iter.getSeparation();
+			//KS - we use the face indices to cache the material indices and flags - avoids bloating the PxContact structure
+			dst.materialFlags = PxU8(iter.getMaterialFlags());
+			dst.maxImpulse = maxImpulse;
+			dst.staticFriction = iter.getStaticFriction();
+			dst.dynamicFriction = iter.getDynamicFriction();
+			dst.restitution = iter.getRestitution();
+			dst.targetVel = iter.getTargetVel();
+		}
+	}
+	buffer.count = writeIndex;
+	return writeIndex - firstSlot;
+}
+
+//Iterates over every contact of a chain of contact patches (linked through contactPatches[].next),
+//skipping positions that are one-past-the-end of their patch, which includes empty patches.
+struct CorrelationListIterator
+{
+	CorrelationBuffer& buffer;
+	PxU32 currPatch;
+	PxU32 currContact;
+
+	CorrelationListIterator(CorrelationBuffer& correlationBuffer, PxU32 startPatch) : buffer(correlationBuffer)
+	{
+		//Pre-load the first available contact (if one exists)
+		seek(startPatch, 0);
+	}
+
+	//Returns true if it has another contact pre-loaded. Returns false otherwise
+	PX_FORCE_INLINE bool hasNextContact()
+	{
+		if(currPatch == CorrelationBuffer::LIST_END)
+			return false;
+		return currContact < buffer.contactPatches[currPatch].count;
+	}
+
+	//Outputs the pre-loaded (patch, contact) pair and pre-loads the one that follows it.
+	inline void nextContact(PxU32& patch, PxU32& contact)
+	{
+		PX_ASSERT(currPatch != CorrelationBuffer::LIST_END);
+		PX_ASSERT(currContact < buffer.contactPatches[currPatch].count);
+
+		patch = currPatch;
+		contact = currContact;
+		seek(currPatch, currContact + 1);
+	}
+
+private:
+	CorrelationListIterator& operator=(const CorrelationListIterator&);
+
+	//Starting from (patch, contact), hops to the first contact of the next patch while the position is
+	//one-past-the-end of its patch, then stores the result (possibly LIST_END) as the current position.
+	PX_FORCE_INLINE void seek(PxU32 patch, PxU32 contact)
+	{
+		while(patch != CorrelationBuffer::LIST_END && contact == buffer.contactPatches[patch].count)
+		{
+			patch = buffer.contactPatches[patch].next;
+			contact = 0;
+		}
+
+		currPatch = patch;
+		currContact = contact;
+	}
+};
+	// Fills 'solverContact' for one contact point: velocity multiplier, biased and unbiased error terms,
+	// max impulse, and the angular arms raXn/rbXn after multiplication by invSqrtInertia0/invSqrtInertia1.
+	PX_FORCE_INLINE void constructContactConstraint(const Mat33V& invSqrtInertia0, const Mat33V& invSqrtInertia1,  const FloatVArg invMassNorLenSq0, 
+		const FloatVArg invMassNorLenSq1, const FloatVArg angD0, const FloatVArg angD1, const Vec3VArg bodyFrame0p, const Vec3VArg bodyFrame1p,
+		const Vec3VArg normal, const FloatVArg norVel, const VecCrossV& norCross, const Vec3VArg angVel0, const Vec3VArg angVel1,
+		const FloatVArg invDt, const FloatVArg invDtp8, const FloatVArg restDistance, const FloatVArg maxPenBias,  const FloatVArg restitution,
+		const FloatVArg bounceThreshold, const Gu::ContactPoint& contact, SolverContactPoint& solverContact,
+		const FloatVArg ccdMaxSeparation)
+	{
+		const FloatV zero = FZero();
+		const Vec3V point = V3LoadA(contact.point);
+		const FloatV separation = FLoad(contact.separation);
+		// Contact target velocity projected onto the normal.
+		const FloatV cTargetVel = V3Dot(normal, V3LoadA(contact.targetVel));
+		// Offsets from bodyFrame0p / bodyFrame1p to the contact point.
+		const Vec3V ra = V3Sub(point, bodyFrame0p);
+		const Vec3V rb = V3Sub(point, bodyFrame1p);
+
+		const Vec3V raXn = V3Cross(ra, norCross);
+		const Vec3V rbXn = V3Cross(rb, norCross);
+
+		const Vec3V raXnSqrtInertia = M33MulV3(invSqrtInertia0, raXn);
+		const Vec3V rbXnSqrtInertia = M33MulV3(invSqrtInertia1, rbXn);				
+		// Per-body response terms; note invMassNorLenSq1 is subtracted (sign convention set by the caller - TODO confirm).
+		const FloatV resp0 = FAdd(invMassNorLenSq0, FMul(V3Dot(raXnSqrtInertia, raXnSqrtInertia), angD0));
+		const FloatV resp1 = FSub(FMul(V3Dot(rbXnSqrtInertia, rbXnSqrtInertia), angD1), invMassNorLenSq1);
+
+		const FloatV unitResponse = FAdd(resp0, resp1);
+		// Relative velocity: caller-supplied norVel plus the angular contribution of body 0, minus that of body 1.
+		const FloatV vrel1 = FAdd(norVel, V3Dot(raXn, angVel0));
+		const FloatV vrel2 = V3Dot(rbXn, angVel1);
+		const FloatV vrel = FSub(vrel1, vrel2);
+		// Reciprocal of the unit response, or zero when the response is not positive.
+		const FloatV velMultiplier = FSel(FIsGrtr(unitResponse, zero), FRecip(unitResponse), zero);
+		// Separation measured relative to the rest distance.
+		const FloatV penetration = FSub(separation, restDistance);
+
+		const FloatV penetrationInvDt = FMul(penetration, invDt);
+		// penetration * invDtp8, limited from below by maxPenBias.
+		const FloatV penetrationInvDtPt8 = FMax(maxPenBias, FMul(penetration, invDtp8));
+
+		FloatV scaledBias = FMul(velMultiplier, penetrationInvDtPt8);
+		// Bounce condition: restitution > 0, vrel below bounceThreshold, and -vrel greater than penetration * invDt.
+		const BoolV isGreater2 = BAnd(BAnd(FIsGrtr(restitution, zero), FIsGrtr(bounceThreshold, vrel)), FIsGrtr(FNeg(vrel), penetrationInvDt));
+
+		const BoolV ccdSeparationCondition = FIsGrtrOrEq(ccdMaxSeparation, penetration);
+		// The bias is zeroed when the bounce condition holds and penetration <= ccdMaxSeparation.
+		scaledBias = FSel(BAnd(ccdSeparationCondition, isGreater2), zero, scaledBias);
+
+		const FloatV sumVRel(vrel);
+		// Target velocity: the contact's own target plus, when bouncing, -vrel * restitution.
+		FloatV targetVelocity = FAdd(cTargetVel, FSel(isGreater2, FMul(FNeg(sumVRel), restitution), zero));
+
+		//Note - we add on the initial target velocity
+		targetVelocity = FSub(targetVelocity, vrel);
+		// NOTE(review): FScaleAdd(a, b, c) is presumably a * b + c - confirm against PsVecMath.
+		const FloatV biasedErr = FScaleAdd(targetVelocity, velMultiplier, FNeg(scaledBias));
+		const FloatV unbiasedErr = FScaleAdd(targetVelocity, velMultiplier, FSel(isGreater2, zero, FNeg(FMax(scaledBias, zero))));
+		//const FloatV unbiasedErr = FScaleAdd(targetVelocity, velMultiplier, FNeg(FMax(scaledBias, zero)));
+
+		FStore(velMultiplier, &solverContact.velMultiplier);
+		FStore(biasedErr, &solverContact.biasedErr);
+		FStore(unbiasedErr, &solverContact.unbiasedErr);
+		solverContact.maxImpulse = contact.maxImpulse;
+
+		solverContact.raXn = raXnSqrtInertia;
+		solverContact.rbXn = rbXnSqrtInertia;
+	}
+}
+}
+
+#endif //DY_CONTACT_PREP_SHARED_H
diff --git a/PhysX_3.4/Source/LowLevelDynamics/src/DyContactReduction.h b/PhysX_3.4/Source/LowLevelDynamics/src/DyContactReduction.h
new file mode 100644
index 00000000..a02fe8e9
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelDynamics/src/DyContactReduction.h
@@ -0,0 +1,409 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+#ifndef DY_CONTACT_REDUCTION_H
+#define DY_CONTACT_REDUCTION_H
+
+#include "GuContactPoint.h"
+#include "PxsMaterialManager.h"
+
+namespace physx
+{
+
+
+namespace Dy
+{
+
+//KS - might be OK with 4 but 5 guarantees the deepest + 4 contacts that contribute to largest surface area (note: the limit below is currently defined as 6)
+#define CONTACT_REDUCTION_MAX_CONTACTS 6
+#define CONTACT_REDUCTION_MAX_PATCHES 32
+#define PXS_NORMAL_TOLERANCE 0.995f
+#define PXS_SEPARATION_TOLERANCE 0.001f
+
+
+	//A patch contains a normal, pair of material indices and a list of indices. These indices are 
+	//used to index into the PxContact array that's passed by the user
+	struct ReducedContactPatch	// output of contact reduction for one patch: the contacts that were kept
+	{
+		PxU32 numContactPoints;	// number of valid entries in contactPoints
+		PxU32 contactPoints[CONTACT_REDUCTION_MAX_CONTACTS];	// indices into the original contact array
+	};
+
+	struct ContactPatch	// intermediate patch: a run of consecutive contacts, optionally chained to later runs
+	{	
+		PxVec3 rootNormal;	// normal taken from the first contact of the chain's root patch
+		ContactPatch* mNextPatch;	// next patch in the same chain, or NULL
+		PxReal maxPenetration;	// smallest separation recorded for the chain (kept up to date on the root patch)
+		PxU16 startIndex;	// index of the run's first contact in the original contact array
+		PxU16 stride;	// number of consecutive contacts in the run
+		PxU16 rootIndex;	// index of the chain's root patch in the intermediate patch array
+		PxU16 index;	// this patch's own index in the intermediate patch array
+	};
+
+	//Ordering predicate for ContactPatch pointers: a patch sorts before another when its
+	//maxPenetration value is the smaller of the two.
+	struct SortBoundsPredicateManifold
+	{
+		bool operator()(const ContactPatch* idx1, const ContactPatch* idx2) const
+		{ return idx2->maxPenetration > idx1->maxPenetration; }
+	};
+
+
+
+	template <PxU32 MaxPatches>
+	class ContactReduction
+	{
+	public:
+		ReducedContactPatch mPatches[MaxPatches];
+		PxU32 mNumPatches;
+		ContactPatch mIntermediatePatches[CONTACT_REDUCTION_MAX_PATCHES];
+		ContactPatch* mIntermediatePatchesPtrs[CONTACT_REDUCTION_MAX_PATCHES];
+		PxU32 mNumIntermediatePatches;
+		Gu::ContactPoint* PX_RESTRICT mOriginalContacts;
+		PxsMaterialInfo* PX_RESTRICT mMaterialInfo;
+		PxU32 mNumOriginalContacts;
+
+		ContactReduction(Gu::ContactPoint* PX_RESTRICT originalContacts, PxsMaterialInfo* PX_RESTRICT materialInfo, PxU32 numContacts) : 
+		mNumPatches(0), mNumIntermediatePatches(0),	mOriginalContacts(originalContacts), mMaterialInfo(materialInfo), mNumOriginalContacts(numContacts)
+		{
+		}
+
+		void reduceContacts()
+		{
+			//First pass, break up into contact patches, storing the start and stride of the patches
+			//We will need to have contact patches and then coallesce them
+			mIntermediatePatches[0].rootNormal = mOriginalContacts[0].normal;
+			mIntermediatePatches[0].mNextPatch = NULL;
+			mIntermediatePatches[0].startIndex = 0;
+			mIntermediatePatches[0].rootIndex = 0;
+			mIntermediatePatches[0].maxPenetration = mOriginalContacts[0].separation;
+			mIntermediatePatches[0].index = 0;
+			PxU16 numPatches = 1;
+			//PxU32 startIndex = 0;
+			PxU32 numUniquePatches = 1;
+			PxU16 m = 1;
+			for(; m < mNumOriginalContacts; ++m)
+			{
+				PxI32 index = -1;
+				for(PxU32 b = numPatches; b > 0; --b)
+				{
+					ContactPatch& patch = mIntermediatePatches[b-1];
+					if(mMaterialInfo[patch.startIndex].mMaterialIndex0 == mMaterialInfo[m].mMaterialIndex0 && mMaterialInfo[patch.startIndex].mMaterialIndex1 == mMaterialInfo[m].mMaterialIndex1 && 
+						patch.rootNormal.dot(mOriginalContacts[m].normal) >= PXS_NORMAL_TOLERANCE)
+					{
+						index = PxI32(b-1);
+						break;
+					}
+				}
+
+				if(index != numPatches - 1)
+				{
+					mIntermediatePatches[numPatches-1].stride = PxU16(m - mIntermediatePatches[numPatches - 1].startIndex);
+					//Create a new patch...
+					if(numPatches == CONTACT_REDUCTION_MAX_PATCHES)
+					{
+						break;
+					}
+					mIntermediatePatches[numPatches].startIndex = m;
+					mIntermediatePatches[numPatches].mNextPatch = NULL;
+					if(index == -1)
+					{
+						mIntermediatePatches[numPatches].rootIndex = numPatches;
+						mIntermediatePatches[numPatches].rootNormal = mOriginalContacts[m].normal;
+						mIntermediatePatches[numPatches].maxPenetration = mOriginalContacts[m].separation;
+						mIntermediatePatches[numPatches].index = numPatches;
+						++numUniquePatches;
+					}
+					else
+					{
+						//Find last element in the link
+						PxU16 rootIndex = mIntermediatePatches[index].rootIndex;
+						mIntermediatePatches[index].mNextPatch = &mIntermediatePatches[numPatches];
+						mIntermediatePatches[numPatches].rootNormal = mIntermediatePatches[index].rootNormal;
+						mIntermediatePatches[rootIndex].maxPenetration = mIntermediatePatches[numPatches].maxPenetration = PxMin(mIntermediatePatches[rootIndex].maxPenetration, mOriginalContacts[m].separation);
+						mIntermediatePatches[numPatches].rootIndex = rootIndex;
+						mIntermediatePatches[numPatches].index = numPatches;
+					}
+					++numPatches;
+				}
+			}
+			mIntermediatePatches[numPatches-1].stride = PxU16(m - mIntermediatePatches[numPatches-1].startIndex);
+
+			//OK, we have a list of contact patches so that we can start contact reduction per-patch
+
+			//OK, now we can go and reduce the contacts on a per-patch basis...
+
+			for(PxU32 a = 0; a < numPatches; ++a)
+			{
+				mIntermediatePatchesPtrs[a] = &mIntermediatePatches[a];
+			}
+
+
+			SortBoundsPredicateManifold predicate;
+			Ps::sort(mIntermediatePatchesPtrs, numPatches, predicate);
+
+			PxU32 numReducedPatches = 0;
+			for(PxU32 a = 0; a < numPatches; ++a)
+			{
+				if(mIntermediatePatchesPtrs[a]->rootIndex == mIntermediatePatchesPtrs[a]->index)
+				{
+					//Reduce this patch...
+					if(numReducedPatches == MaxPatches)
+						break;
+
+					ReducedContactPatch& reducedPatch = mPatches[numReducedPatches++];
+					//OK, now we need to work out if we have to reduce patches...
+					PxU32 contactCount = 0;
+					{
+						ContactPatch* tmpPatch = mIntermediatePatchesPtrs[a];
+
+						while(tmpPatch)
+						{
+							contactCount += tmpPatch->stride;
+							tmpPatch = tmpPatch->mNextPatch;
+						}
+					}
+
+					if(contactCount <= CONTACT_REDUCTION_MAX_CONTACTS)
+					{
+						//Just add the contacts...
+						ContactPatch* tmpPatch = mIntermediatePatchesPtrs[a];
+
+						PxU32 ind = 0;
+						while(tmpPatch)
+						{
+							for(PxU32 b = 0; b < tmpPatch->stride; ++b)
+							{
+								reducedPatch.contactPoints[ind++] = tmpPatch->startIndex + b;
+							}
+							tmpPatch = tmpPatch->mNextPatch;
+						}
+						reducedPatch.numContactPoints = contactCount;
+					}
+					else
+					{
+						//Iterate through and find the most extreme point
+						
+
+						PxU32 ind = 0;
+
+						{
+							PxReal dist = 0.f;
+							ContactPatch* tmpPatch = mIntermediatePatchesPtrs[a];
+							while(tmpPatch)
+							{
+								for(PxU32 b = 0; b < tmpPatch->stride; ++b)
+								{
+									PxReal magSq = mOriginalContacts[tmpPatch->startIndex + b].point.magnitudeSquared();
+									if(dist < magSq)
+									{
+										ind = tmpPatch->startIndex + b;
+										dist = magSq;
+									}
+								}
+								tmpPatch = tmpPatch->mNextPatch;
+							}
+						}	
+						reducedPatch.contactPoints[0] = ind;
+						const PxVec3 p0 = mOriginalContacts[ind].point;
+
+						//Now find the point farthest from this point...						
+						{
+							PxReal maxDist = 0.f;
+							ContactPatch* tmpPatch = mIntermediatePatchesPtrs[a];
+							while(tmpPatch)
+							{
+								for(PxU32 b = 0; b < tmpPatch->stride; ++b)
+								{
+									PxReal magSq = (p0 - mOriginalContacts[tmpPatch->startIndex + b].point).magnitudeSquared();
+									if(magSq > maxDist)
+									{
+										ind = tmpPatch->startIndex + b;
+										maxDist = magSq;
+									}
+								}
+								tmpPatch = tmpPatch->mNextPatch;
+							}
+						}
+						reducedPatch.contactPoints[1] = ind;
+						const PxVec3 p1 = mOriginalContacts[ind].point;
+
+						//Now find the point farthest from the segment
+
+						PxVec3 n = (p0 - p1).cross(mIntermediatePatchesPtrs[a]->rootNormal);
+
+						//PxReal tVal = 0.f;
+						{
+							PxReal maxDist = 0.f;
+							//PxReal tmpTVal;
+							
+							ContactPatch* tmpPatch = mIntermediatePatchesPtrs[a];
+							while(tmpPatch)
+							{
+								for(PxU32 b = 0; b < tmpPatch->stride; ++b)
+								{
+									
+									//PxReal magSq = tmpDistancePointSegmentSquared(p0, p1, mOriginalContacts[tmpPatch->startIndex + b].point, tmpTVal);
+									PxReal magSq = (mOriginalContacts[tmpPatch->startIndex + b].point - p0).dot(n);
+									if(magSq > maxDist)
+									{
+										ind = tmpPatch->startIndex + b;
+										//tVal = tmpTVal;
+										maxDist = magSq;
+									}
+								}
+								tmpPatch = tmpPatch->mNextPatch;
+							}
+						}
+						reducedPatch.contactPoints[2] = ind;
+
+						//const PxVec3 closest = (p0 + (p1 - p0) * tVal);
+
+						const PxVec3 dir = -n;//closest - p3;
+
+						{
+							PxReal maxDist = 0.f;
+							//PxReal tVal = 0.f;
+							ContactPatch* tmpPatch = mIntermediatePatchesPtrs[a];
+							while(tmpPatch)
+							{
+								for(PxU32 b = 0; b < tmpPatch->stride; ++b)
+								{
+									PxReal magSq =  (mOriginalContacts[tmpPatch->startIndex + b].point - p0).dot(dir);
+									if(magSq > maxDist)
+									{
+										ind = tmpPatch->startIndex + b;
+										maxDist = magSq;
+									}
+								}
+								tmpPatch = tmpPatch->mNextPatch;
+							}
+						}
+						reducedPatch.contactPoints[3] = ind;
+
+						//Now, we iterate through all the points, and cluster the points. From this, we establish the deepest point that's within a 
+						//tolerance of this point and keep that point
+
+						PxReal separation[CONTACT_REDUCTION_MAX_CONTACTS];
+						PxU32 deepestInd[CONTACT_REDUCTION_MAX_CONTACTS];
+						for(PxU32 i = 0; i < 4; ++i)
+						{
+							PxU32 index = reducedPatch.contactPoints[i];
+							separation[i] = mOriginalContacts[index].separation - PXS_SEPARATION_TOLERANCE;
+							deepestInd[i] = index;
+						}
+
+						ContactPatch* tmpPatch = mIntermediatePatchesPtrs[a];
+						while(tmpPatch)
+						{
+							for(PxU32 b = 0; b < tmpPatch->stride; ++b)
+							{
+								Gu::ContactPoint& point = mOriginalContacts[tmpPatch->startIndex + b];
+								
+								PxReal distance = PX_MAX_REAL;
+								PxU32 index = 0;
+								for(PxU32 c = 0; c < 4; ++c)
+								{
+									PxVec3 dif = mOriginalContacts[reducedPatch.contactPoints[c]].point - point.point;
+									PxReal d = dif.magnitudeSquared();
+									if(distance > d)
+									{
+										distance = d;
+										index = c;
+									}
+								}
+								if(separation[index] > point.separation)
+								{
+									deepestInd[index] = tmpPatch->startIndex+b;
+									separation[index] = point.separation;
+								}
+
+							}
+							tmpPatch = tmpPatch->mNextPatch;
+						}
+
+						bool chosen[64];
+						PxMemZero(chosen, sizeof(chosen));
+						for(PxU32 i = 0; i < 4; ++i)
+						{
+							reducedPatch.contactPoints[i] = deepestInd[i];
+							chosen[deepestInd[i]] = true;
+						}						
+						
+						for(PxU32 i = 4; i < CONTACT_REDUCTION_MAX_CONTACTS; ++i)
+						{
+							separation[i] = PX_MAX_REAL;
+							deepestInd[i] = 0;
+						}
+						tmpPatch = mIntermediatePatchesPtrs[a];
+						while(tmpPatch)
+						{
+							for(PxU32 b = 0; b < tmpPatch->stride; ++b)
+							{
+								if(!chosen[tmpPatch->startIndex+b])
+								{
+									Gu::ContactPoint& point = mOriginalContacts[tmpPatch->startIndex + b];	
+									for(PxU32 j = 4; j < CONTACT_REDUCTION_MAX_CONTACTS; ++j)
+									{
+										if(point.separation < separation[j])
+										{
+											for(PxU32 k = CONTACT_REDUCTION_MAX_CONTACTS-1; k > j; --k)
+											{
+												separation[k] = separation[k-1];
+												deepestInd[k] = deepestInd[k-1];
+											}
+											separation[j] = point.separation;
+											deepestInd[j] = tmpPatch->startIndex+b;
+											break;
+										}
+									}
+								}
+							}
+							tmpPatch = tmpPatch->mNextPatch;
+						}
+
+						for(PxU32 i = 4; i < CONTACT_REDUCTION_MAX_CONTACTS; ++i)
+						{
+							reducedPatch.contactPoints[i] = deepestInd[i];
+						}
+
+						reducedPatch.numContactPoints = CONTACT_REDUCTION_MAX_CONTACTS;
+					}
+				}
+			}
+			mNumPatches = numReducedPatches;
+		}
+
+	};
+}
+
+}
+
+
+#endif //DY_CONTACT_REDUCTION_H
diff --git a/PhysX_3.4/Source/LowLevelDynamics/src/DyCorrelationBuffer.h b/PhysX_3.4/Source/LowLevelDynamics/src/DyCorrelationBuffer.h
new file mode 100644
index 00000000..9e4d491d
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelDynamics/src/DyCorrelationBuffer.h
@@ -0,0 +1,104 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+
+#ifndef DY_CORRELATIONBUFFER_H
+#define DY_CORRELATIONBUFFER_H
+
+#include "PxvConfig.h"
+#include "foundation/PxSimpleTypes.h"
+#include "foundation/PxVec3.h"
+#include "foundation/PxTransform.h"
+#include "DyFrictionPatch.h"
+#include "GuContactBuffer.h"
+
+namespace physx
+{
+
+struct PxcNpWorkUnit;
+struct PxsMaterialInfo;
+
+namespace Dy
+{
+
+// Fixed-capacity scratch structure shared by the contact-patch / friction-patch routines declared
+// below (createContactPatches, correlatePatches, growPatches). It owns no heap memory: every array
+// is sized by a compile-time constant.
+struct CorrelationBuffer
+{
+	// Capacity of every per-friction-patch array in this structure.
+	static const PxU32 MAX_FRICTION_PATCHES = 32;
+	// Sentinel for 16-bit list links.
+	// NOTE(review): presumably the value stored in ContactPatchData::next at the end of a chain - confirm.
+	static const PxU16 LIST_END = 0xffff;
+
+	// One contact patch record.
+	// NOTE(review): start/count look like a run of contacts in the source contact array and next like
+	// a link to another record - confirm against the functions that fill this in.
+	struct ContactPatchData
+	{
+		PxU16 start;
+		PxU16 next;
+		PxU8 flags;
+		PxU8 count;
+		PxReal staticFriction;
+		PxReal dynamicFriction;
+		PxReal restitution;
+	};
+
+	// Sized for the worst case: there can be as many contact patches as there are contacts.
+	ContactPatchData	contactPatches[Gu::ContactBuffer::MAX_CONTACTS];
+
+	// Both arrays below are declared with 16-byte alignment.
+	FrictionPatch	PX_ALIGN(16, frictionPatches[MAX_FRICTION_PATCHES]);
+	PxVec3				PX_ALIGN(16, frictionPatchWorldNormal[MAX_FRICTION_PATCHES]);
+
+	PxU32				frictionPatchContactCounts[MAX_FRICTION_PATCHES];
+	// Note the extra slot: one more entry than there are friction patches.
+	PxU32				correlationListHeads[MAX_FRICTION_PATCHES+1];
+
+	// Contact IDs only serve to locate auxiliary contact data when velocity
+	// targets have been set.
+	PxU16				contactID[MAX_FRICTION_PATCHES][2];
+
+	// Number of entries currently in use in contactPatches.
+	PxU32 contactPatchCount;
+	// Number of entries currently in use in the friction patch arrays.
+	PxU32 frictionPatchCount;
+
+};
+
+// Declaration only; the definition is not part of this header.
+// NOTE(review): judging by the names, this presumably groups the contactCount points in cb into
+// fb.contactPatches, with normalTolerance deciding which contact normals may share a patch - confirm
+// against the definition. What the bool result signals cannot be determined from this header.
+bool createContactPatches(CorrelationBuffer& fb, const Gu::ContactPoint* cb, PxU32 contactCount, PxReal normalTolerance);
+
+// Declaration only; the definition is not part of this header.
+// NOTE(review): presumably associates the contact patches in fb (from startContactPatchIndex on) with
+// friction patches (from startFrictionPatchIndex on), using the two body frames and normalTolerance -
+// confirm against the definition. What the bool result signals cannot be determined from this header.
+bool correlatePatches(CorrelationBuffer& fb, 
+					  const Gu::ContactPoint* cb,
+					  const PxTransform& bodyFrame0,
+					  const PxTransform& bodyFrame1,
+					  PxReal normalTolerance,
+					  PxU32 startContactPatchIndex,
+					  PxU32 startFrictionPatchIndex);
+
+// Declaration only; the definition is not part of this header.
+// NOTE(review): presumably extends the friction patches in fb, starting at frictionPatchStartIndex,
+// with anchors taken from buffer; frictionOffsetThreshold looks like a separation cut-off for
+// eligible contacts - confirm against the definition.
+void growPatches(CorrelationBuffer& fb,
+				 const Gu::ContactPoint* buffer,
+				 const PxTransform& bodyFrame0,
+				 const PxTransform& bodyFrame1,
+				 PxReal normalTolerance,
+				 PxU32 frictionPatchStartIndex,
+				 PxReal frictionOffsetThreshold);
+
+}
+
+}
+
+#endif //DY_CORRELATIONBUFFER_H
diff --git a/PhysX_3.4/Source/LowLevelDynamics/src/DyDynamics.cpp b/PhysX_3.4/Source/LowLevelDynamics/src/DyDynamics.cpp
new file mode 100644
index 00000000..07f3b642
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelDynamics/src/DyDynamics.cpp
@@ -0,0 +1,2950 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+#include "PsTime.h"
+#include "PsAtomic.h"
+#include "PxvDynamics.h"
+
+#include "foundation/PxProfiler.h"
+#include "PxsRigidBody.h"
+#include "PxsContactManager.h"
+#include "DyDynamics.h"
+#include "DyBodyCoreIntegrator.h"
+#include "DySolverCore.h"
+#include "DySolverControl.h"
+#include "DySolverContact.h"
+#include "DySolverContactPF.h"
+#include "DyArticulationContactPrep.h"
+#include "DySolverBody.h"
+
+#include "DyConstraintPrep.h"
+#include "DyConstraintPartition.h"
+#include "DyArticulation.h"
+
+#include "CmFlushPool.h"
+#include "DyArticulationPImpl.h"
+#include "PxsMaterialManager.h"
+#include "DySolverContactPF4.h"
+#include "DyContactReduction.h"
+#include "PxcNpContactPrepShared.h"
+#include "DyContactPrep.h"
+#include "DySolverControlPF.h"
+#include "PxSceneDesc.h"
+#include "PxsSimpleIslandManager.h"
+#include "PxvNphaseImplementationContext.h"
+#include "PxsContactManagerState.h"
+#include "PxsDefaultMemoryManager.h"
+#include "DyContactPrepShared.h"
+  
+//KS - used to turn on/off batched SIMD constraints.
+#define DY_BATCH_CONSTRAINTS 1
+//KS - used to specifically turn on/off batched 1D SIMD constraints.
+#define DY_BATCH_1D 1
+
+namespace physx
+{
+namespace Dy
+{
+
+// Plain bundle of the arrays and counts describing the objects of the island(s) being solved.
+// The struct owns none of the memory it points at; the default constructor leaves every pointer
+// NULL and numIslands at 0, so a freshly constructed instance is a well-defined "empty" state.
+struct SolverIslandObjects
+{
+	PxsRigidBody**				bodies;	
+	Articulation**				articulations;
+	Dy::Articulation**			articulationOwners;
+	PxsIndexedContactManager*	contactManagers;
+	//PxsIndexedConstraint*		constraints;
+
+	const IG::IslandId*			islandIds;
+	PxU32						numIslands;
+	PxU32*						bodyRemapTable;
+	PxU32*						nodeIndexArray;
+
+	PxSolverConstraintDesc*	constraintDescs;
+	PxSolverConstraintDesc*	orderedConstraintDescs;
+	PxSolverConstraintDesc*	tempConstraintDescs;
+	PxConstraintBatchHeader*	constraintBatchHeaders;
+	Cm::SpatialVector*			motionVelocities;
+	PxsBodyCore**				bodyCoreArray;
+
+	// Initialisers are listed in member declaration order. bodyRemapTable is included so that it
+	// no longer starts out with an indeterminate value (it was the only member left uninitialised).
+	SolverIslandObjects() : bodies(NULL), articulations(NULL), articulationOwners(NULL),
+		contactManagers(NULL), islandIds(NULL), numIslands(0), bodyRemapTable(NULL), nodeIndexArray(NULL),
+		constraintDescs(NULL), orderedConstraintDescs(NULL), tempConstraintDescs(NULL),
+		constraintBatchHeaders(NULL), motionVelocities(NULL), bodyCoreArray(NULL)
+	{
+	}
+};
+
+// Free-function factory: passes every argument through unchanged to DynamicsContext::create() and
+// returns the result through the Context base pointer (NULL if create() returned NULL).
+Context* createDynamicsContext(	PxcNpMemBlockPool* memBlockPool,
+								PxcScratchAllocator& scratchAllocator, Cm::FlushPool& taskPool,
+								PxvSimStats& simStats, PxTaskManager* taskManager, Ps::VirtualAllocatorCallback* allocatorCallback, PxsMaterialManager* materialManager,
+								IG::IslandSim* accurateIslandSim, PxU64 contextID,
+								const bool enableStabilization, const bool useEnhancedDeterminism, const bool useAdaptiveForce
+								)
+{
+	DynamicsContext* const dynamicsContext = DynamicsContext::create(
+		memBlockPool, scratchAllocator, taskPool, simStats, taskManager, allocatorCallback,
+		materialManager, accurateIslandSim, contextID,
+		enableStabilization, useEnhancedDeterminism, useAdaptiveForce);
+
+	return dynamicsContext;
+}
+
+// PT: TODO: consider removing this function. We already have "createDynamicsContext".
+// Obtains raw storage through PX_ALLOC and placement-constructs a DynamicsContext inside it.
+// Returns NULL when no storage was obtained; a non-NULL result is released with destroy().
+DynamicsContext* DynamicsContext::create(	PxcNpMemBlockPool* memBlockPool,
+											PxcScratchAllocator& scratchAllocator,
+											Cm::FlushPool& taskPool,
+											PxvSimStats& simStats,
+											PxTaskManager* taskManager,
+											Ps::VirtualAllocatorCallback* allocatorCallback,
+											PxsMaterialManager* materialManager,
+											IG::IslandSim* accurateIslandSim,
+											PxU64 contextID,
+											const bool enableStabilization,
+											const bool useEnhancedDeterminism,
+											const bool useAdaptiveForce
+											)
+{
+	// PT: TODO: inherit from UserAllocated, remove placement new
+	DynamicsContext* const storage = reinterpret_cast<DynamicsContext*>(PX_ALLOC(sizeof(DynamicsContext), "DynamicsContext"));
+
+	// Nothing to construct into: report the failure to the caller.
+	if(!storage)
+		return NULL;
+
+	new(storage)DynamicsContext(memBlockPool, scratchAllocator, taskPool, simStats, taskManager, allocatorCallback, materialManager, accurateIslandSim, contextID, enableStabilization, useEnhancedDeterminism, useAdaptiveForce);
+	return storage;
+}
+
+
+// Counterpart of DynamicsContext::create(): create() placement-constructs the object inside
+// PX_ALLOC'd storage, so the destructor is invoked explicitly here before that storage is handed
+// to PX_FREE. The object must not be used in any way once this call has returned.
+void DynamicsContext::destroy()
+{
+	this->~DynamicsContext();
+	PX_FREE(this);
+}
+
+// Calls reset() on every ThreadContext reachable through the thread context pool's iterator.
+void DynamicsContext::resetThreadContexts()
+{
+	PxcThreadCoherentCacheIterator<ThreadContext, PxcNpMemBlockPool> poolIt(mThreadContextPool);
+
+	// The walk ends at the first NULL returned by getNext().
+	for(ThreadContext* current = poolIt.getNext(); current != NULL; current = poolIt.getNext())
+		current->reset();
+}
+
+
+// =========================== Basic methods
+
+
+// Constructs the dynamics context: forwards the shared settings to Dy::Context, stores the pools and
+// managers it is given, creates the threshold streams, sets up the "world" solver body that both
+// setDescFromIndices() overloads substitute for world/static bodies, and creates one solver core per
+// friction type.
+// NOTE(review): allocatorCallback is dereferenced below without a NULL check, and the results of the
+// two PX_ALLOC calls are passed to placement new unchecked.
+DynamicsContext::DynamicsContext(	PxcNpMemBlockPool* memBlockPool,
+									PxcScratchAllocator& scratchAllocator,
+									Cm::FlushPool& taskPool,
+									PxvSimStats& simStats,
+									PxTaskManager* taskManager,
+									Ps::VirtualAllocatorCallback* allocatorCallback,
+									PxsMaterialManager* materialManager,
+									IG::IslandSim* accurateIslandSim,
+									PxU64 contextID,
+									const bool enableStabilization,
+									const bool useEnhancedDeterminism,
+									const bool useAdaptiveForce
+									) : 
+	Dy::Context			(accurateIslandSim, allocatorCallback, simStats, enableStabilization, useEnhancedDeterminism, useAdaptiveForce),
+	mThreadContextPool	(memBlockPool),
+	mMaterialManager	(materialManager),
+	mScratchAllocator	(scratchAllocator),
+	mTaskPool			(taskPool),
+	mTaskManager		(taskManager),
+	mContextID			(contextID)
+{
+	createThresholdStream(*allocatorCallback);
+	createForceChangeThresholdStream(*allocatorCallback);
+	// Two "exceeded force" streams, each placement-constructed in PX_ALLOC'd storage; the destructor
+	// tears them down with an explicit destructor call followed by PX_FREE. mCurrentIndex selects
+	// which of the two is the current one.
+	mExceededForceThresholdStream[0] = PX_PLACEMENT_NEW(PX_ALLOC(sizeof(ThresholdStream), PX_DEBUG_EXP("ExceededForceThresholdStream[0]")), ThresholdStream(*allocatorCallback));
+	mExceededForceThresholdStream[1] = PX_PLACEMENT_NEW(PX_ALLOC(sizeof(ThresholdStream), PX_DEBUG_EXP("ExceededForceThresholdStream[1]")), ThresholdStream(*allocatorCallback));
+	mThresholdStreamOut = 0;
+	mCurrentIndex = 0;
+	// World solver body: zero velocities, zero inverse mass and a zero sqrtInvInertia matrix,
+	// an identity pose and an invalid island node index.
+	mWorldSolverBody.linearVelocity = PxVec3(0);
+	mWorldSolverBody.angularState = PxVec3(0);
+	mWorldSolverBodyData.invMass = 0;
+	mWorldSolverBodyData.sqrtInvInertia = PxMat33(PxZero);
+	mWorldSolverBodyData.nodeIndex = IG_INVALID_NODE;
+	// Thresholds and clamps are set to the extreme float values.
+	mWorldSolverBodyData.reportThreshold = PX_MAX_REAL;
+	mWorldSolverBodyData.penBiasClamp = -PX_MAX_REAL;
+	mWorldSolverBodyData.maxContactImpulse = PX_MAX_REAL;
+	// All progress counters of the world body start at the maximum permitted value.
+	mWorldSolverBody.solverProgress=MAX_PERMITTED_SOLVER_PROGRESS;
+	mWorldSolverBody.maxSolverNormalProgress=MAX_PERMITTED_SOLVER_PROGRESS;
+	mWorldSolverBody.maxSolverFrictionProgress=MAX_PERMITTED_SOLVER_PROGRESS;
+	mWorldSolverBodyData.linearVelocity = mWorldSolverBodyData.angularVelocity = PxVec3(0.f);
+	mWorldSolverBodyData.body2World = PxTransform(PxIdentity);
+	mWorldSolverBodyData.lockFlags = 0;
+	// One solver core per friction type: patch friction uses SolverCoreGeneral, while the one- and
+	// two-directional types each get their own SolverCoreGeneralPF instance.
+	mSolverCore[PxFrictionType::ePATCH] = SolverCoreGeneral::create();
+	mSolverCore[PxFrictionType::eONE_DIRECTIONAL] = SolverCoreGeneralPF::create();
+	mSolverCore[PxFrictionType::eTWO_DIRECTIONAL] = SolverCoreGeneralPF::create();
+}
+
+// Releases what the constructor created: the per-friction-type solver cores first, then the two
+// "exceeded force" threshold streams.
+DynamicsContext::~DynamicsContext()
+{
+	for(PxU32 frictionType = 0; frictionType < PxFrictionType::eFRICTION_COUNT; ++frictionType)
+		mSolverCore[frictionType]->destroyV();
+
+	// The streams were placement-constructed in PX_ALLOC'd storage, so each one needs an explicit
+	// destructor call before its storage goes back through PX_FREE. The slot is cleared either way.
+	for(PxU32 slot = 0; slot < 2; ++slot)
+	{
+		ThresholdStream* stream = mExceededForceThresholdStream[slot];
+		if(stream)
+		{
+			stream->~ThresholdStream();
+			PX_FREE(stream);
+		}
+		mExceededForceThresholdStream[slot] = NULL;
+	}
+}
+
+#if PX_ENABLE_SIM_STATS
+// Adds one thread's counters onto the matching totals in mSimStats.
+// Only compiled when PX_ENABLE_SIM_STATS is set.
+// NOTE(review): these are plain += updates; whether callers serialise calls is not visible here.
+void DynamicsContext::addThreadStats(const ThreadContext::ThreadSimStats& stats)
+{
+	mSimStats.mNbActiveConstraints += stats.numActiveConstraints;
+	mSimStats.mNbActiveDynamicBodies += stats.numActiveDynamicBodies;
+	mSimStats.mNbActiveKinematicBodies += stats.numActiveKinematicBodies;
+	mSimStats.mNbAxisSolverConstraints += stats.numAxisSolverConstraints;
+}
+#endif
+
+// =========================== Solve methods!
+
+// Fills in both body slots (A and B) of a solver constraint descriptor from an indexed interaction.
+// For each side:
+//  - articulation: the descriptor gets the articulation's solver data pointer, its size and the link index;
+//  - world:        the descriptor points at mWorldSolverBody and uses data index 0;
+//  - otherwise:    the descriptor points into mSolverBodyPool, and the data index is the pool index + 1.
+// solverBodyOffset is added to the solver body index of eBODY entries only; eKINEMATIC entries use
+// their index unshifted.
+void DynamicsContext::setDescFromIndices(PxSolverConstraintDesc& desc, const PxsIndexedInteraction& constraint, const PxU32 solverBodyOffset)
+{
+	// The per-type offset table is indexed directly by the index type, which relies on these values.
+	PX_COMPILE_TIME_ASSERT(PxsIndexedInteraction::eBODY == 0);
+	PX_COMPILE_TIME_ASSERT(PxsIndexedInteraction::eKINEMATIC == 1);
+	const PxU32 offsetMap[] = {solverBodyOffset, 0};
+
+	// ---- side A ----
+	if(constraint.indexType0 != PxsIndexedInteraction::eARTICULATION)
+	{
+		desc.linkIndexA = PxSolverConstraintDesc::NO_LINK;
+		// articulationALength is deliberately left alone: it is unioned with bodyADataIndex.
+		if(constraint.indexType0 == PxsIndexedInteraction::eWORLD)
+		{
+			desc.bodyA = &mWorldSolverBody;
+			desc.bodyADataIndex = PxU16(0);
+		}
+		else
+		{
+			const PxU32 typeOffset = offsetMap[constraint.indexType0];
+			desc.bodyA = &mSolverBodyPool[PxU32(constraint.solverBody0) + typeOffset];
+			desc.bodyADataIndex = PxU16(PxU16(constraint.solverBody0) + 1 + typeOffset);
+		}
+	}
+	else
+	{
+		Articulation* articulation = getArticulation(constraint.articulation0);
+		desc.articulationA = articulation->getFsDataPtr();
+		desc.articulationALength = Ps::to16(articulation->getSolverDataSize());
+		// The solver data size is expected to be a multiple of 16.
+		PX_ASSERT(0==(desc.articulationALength & 0x0f));
+		desc.linkIndexA = Ps::to16(articulation->getLinkIndex(constraint.articulation0));
+	}
+
+	// ---- side B ----
+	if(constraint.indexType1 != PxsIndexedInteraction::eARTICULATION)
+	{
+		desc.linkIndexB = PxSolverConstraintDesc::NO_LINK;
+		// articulationBLength is deliberately left alone: it is unioned with bodyBDataIndex.
+		if(constraint.indexType1 == PxsIndexedInteraction::eWORLD)
+		{
+			desc.bodyB = &mWorldSolverBody;
+			desc.bodyBDataIndex = PxU16(0);
+		}
+		else
+		{
+			const PxU32 typeOffset = offsetMap[constraint.indexType1];
+			desc.bodyB = &mSolverBodyPool[PxU32(constraint.solverBody1) + typeOffset];
+			desc.bodyBDataIndex = PxU16(PxU16(constraint.solverBody1) + 1 + typeOffset);
+		}
+	}
+	else
+	{
+		Articulation* articulation = getArticulation(constraint.articulation1);
+		desc.articulationB = articulation->getFsDataPtr();
+		desc.articulationBLength = Ps::to16(articulation->getSolverDataSize());
+		PX_ASSERT(0==(desc.articulationBLength & 0x0f));
+		desc.linkIndexB = Ps::to16(articulation->getLinkIndex(constraint.articulation1));
+	}
+}
+
+// Fills in both body slots (A and B) of a solver constraint descriptor from an island-graph edge.
+// The edge's first node becomes side A and its second node side B. For each side:
+//  - static body:  the descriptor points at mWorldSolverBody and uses data index 0;
+//  - articulation: the descriptor gets the articulation's solver data pointer, its size and the link id;
+//  - otherwise:    the descriptor points into mSolverBodyPool. Kinematic nodes use their active node
+//                  index directly; other nodes go through bodyRemap and are shifted by
+//                  solverBodyOffset. The data index is the pool index + 1.
+void DynamicsContext::setDescFromIndices(PxSolverConstraintDesc& desc, IG::EdgeIndex edgeIndex, const IG::SimpleIslandManager& islandManager,
+	PxU32* bodyRemap, const PxU32 solverBodyOffset)
+{
+	PX_COMPILE_TIME_ASSERT(PxsIndexedInteraction::eBODY == 0);
+	PX_COMPILE_TIME_ASSERT(PxsIndexedInteraction::eKINEMATIC == 1);
+
+	const IG::IslandSim& sim = islandManager.getAccurateIslandSim();
+
+	// ---- first node of the edge -> side A ----
+	IG::NodeIndex endpointA = sim.getNodeIndex1(edgeIndex);
+	if (!endpointA.isStaticBody())
+	{
+		const IG::Node& nodeA = sim.getNode(endpointA);
+		if (nodeA.getNodeType() != IG::Node::eARTICULATION_TYPE)
+		{
+			PxU32 poolIndex = sim.getActiveNodeIndex(endpointA);
+			if (!nodeA.isKinematic())
+				poolIndex = bodyRemap[poolIndex] + solverBodyOffset;
+			desc.bodyA = &mSolverBodyPool[poolIndex];
+			desc.bodyADataIndex = Ps::to16(poolIndex + 1);
+			desc.linkIndexA = PxSolverConstraintDesc::NO_LINK;
+		}
+		else
+		{
+			Dy::Articulation* articulation = sim.getLLArticulation(endpointA);
+			desc.articulationA = articulation->getFsDataPtr();
+			desc.articulationALength = Ps::to16(articulation->getSolverDataSize());
+			// The solver data size is expected to be a multiple of 16.
+			PX_ASSERT(0 == (desc.articulationALength & 0x0f));
+			desc.linkIndexA = Ps::to16(endpointA.articulationLinkId());
+		}
+	}
+	else
+	{
+		desc.bodyA = &mWorldSolverBody;
+		desc.bodyADataIndex = 0;
+		desc.linkIndexA = PxSolverConstraintDesc::NO_LINK;
+	}
+
+	// ---- second node of the edge -> side B ----
+	IG::NodeIndex endpointB = sim.getNodeIndex2(edgeIndex);
+	if (!endpointB.isStaticBody())
+	{
+		const IG::Node& nodeB = sim.getNode(endpointB);
+		if (nodeB.getNodeType() != IG::Node::eARTICULATION_TYPE)
+		{
+			PxU32 poolIndex = sim.getActiveNodeIndex(endpointB);
+			if (!nodeB.isKinematic())
+				poolIndex = bodyRemap[poolIndex] + solverBodyOffset;
+			desc.bodyB = &mSolverBodyPool[poolIndex];
+			desc.bodyBDataIndex = Ps::to16(poolIndex + 1);
+			desc.linkIndexB = PxSolverConstraintDesc::NO_LINK;
+		}
+		else
+		{
+			Dy::Articulation* articulation = sim.getLLArticulation(endpointB);
+			desc.articulationB = articulation->getFsDataPtr();
+			desc.articulationBLength = Ps::to16(articulation->getSolverDataSize());
+			PX_ASSERT(0 == (desc.articulationBLength & 0x0f));
+			desc.linkIndexB = Ps::to16(endpointB.articulationLinkId());
+		}
+	}
+	else
+	{
+		desc.bodyB = &mWorldSolverBody;
+		desc.bodyBDataIndex = 0;
+		desc.linkIndexB = PxSolverConstraintDesc::NO_LINK;
+	}
+}
+
+
+// Task object carrying the inputs of one pre-integration work item. The constructor only stores
+// its arguments; the actual work is in runInternal(), which is declared here and defined elsewhere.
+// None of the pointed-to arrays are owned by the task.
+class PxsPreIntegrateTask : public Cm::Task
+{
+	// Declared but not defined here: the class holds a reference member, so assignment is not offered.
+	PxsPreIntegrateTask& operator=(const PxsPreIntegrateTask&);
+public:
+	// Stores every argument verbatim in the member of the same name.
+	// NOTE(review): startIndex/numToIntegrate presumably select this task's slice of the body
+	// arrays, and the two volatile counters are presumably shared with other tasks - confirm
+	// against runInternal().
+	PxsPreIntegrateTask(	DynamicsContext&		context,
+							PxsBodyCore*const*		bodyArray,
+							PxsRigidBody*const*		originalBodyArray,
+							PxU32 const*				nodeIndexArray,
+							PxSolverBody*					solverBodies,
+							PxSolverBodyData*				solverBodyDataPool,
+							PxF32						dt,
+							PxU32						numBodies,
+							volatile PxU32*				maxSolverPositionIterations,
+							volatile PxU32*				maxSolverVelocityIterations,
+							const PxU32					startIndex,
+							const PxU32					numToIntegrate,
+							const PxVec3&				gravity) :
+		mContext(context),
+		mBodyArray(bodyArray),
+		mOriginalBodyArray(originalBodyArray),
+		mNodeIndexArray(nodeIndexArray),
+		mSolverBodies(solverBodies),
+		mSolverBodyDataPool(solverBodyDataPool),
+		mDt(dt),
+		mNumBodies(numBodies),
+		mMaxSolverPositionIterations(maxSolverPositionIterations),
+		mMaxSolverVelocityIterations(maxSolverVelocityIterations),
+		mStartIndex(startIndex),
+		mNumToIntegrate(numToIntegrate),
+		mGravity(gravity)
+	{}
+
+	virtual void runInternal();
+
+	// Name under which the task identifies itself.
+	virtual const char* getName() const
+	{
+		return "PxsDynamics.preIntegrate";
+	}
+
+public:
+	DynamicsContext&			mContext;
+	PxsBodyCore*const*			mBodyArray;
+	PxsRigidBody*const*			mOriginalBodyArray;
+	PxU32 const*			mNodeIndexArray;
+	PxSolverBody*				mSolverBodies;
+	PxSolverBodyData*			mSolverBodyDataPool;
+	PxF32						mDt;
+	PxU32						mNumBodies;
+	volatile PxU32*				mMaxSolverPositionIterations;
+	volatile PxU32*				mMaxSolverVelocityIterations;
+	PxU32						mStartIndex;
+	PxU32						mNumToIntegrate;
+	// Stored by value (copied from the constructor's reference argument).
+	PxVec3						mGravity;
+
+};
+
+
+
+// Task whose work item is a single call to solveParallel() with the context, island parameters
+// and island simulation captured at construction. Everything except the friction type is held by
+// reference, so the referenced objects have to outlive the task.
+class PxsParallelSolverTask : public Cm::Task
+{
+	// Declared but not defined here: the class holds reference members, so assignment is not
+	// offered. Takes a const reference, matching the other task classes in this file.
+	PxsParallelSolverTask& operator=(const PxsParallelSolverTask&);
+public:
+
+	PxsParallelSolverTask(SolverIslandParams& params, DynamicsContext& context, PxFrictionType::Enum frictionType, IG::IslandSim& islandSim)
+		: mParams(params), mContext(context), mFrictionType(frictionType), mIslandSim(islandSim)
+	{
+	}
+
+	// Runs the parallel solver entry point for the captured island parameters.
+	virtual void runInternal()
+	{
+		solveParallel(mContext, mParams, mIslandSim);
+	}
+
+	// Name under which the task identifies itself.
+	virtual const char* getName() const
+	{
+		return "PxsDynamics.parallelSolver";
+	}
+
+	SolverIslandParams&						mParams;
+	DynamicsContext&						mContext;
+	// Stored at construction; not read by any method of this class.
+	PxFrictionType::Enum					mFrictionType;
+	IG::IslandSim&							mIslandSim;
+};
+
+
+#define PX_CONTACT_REDUCTION 1
+
+class PxsSolverConstraintPostProcessTask : public Cm::Task
+{
+	PxsSolverConstraintPostProcessTask& operator=(const PxsSolverConstraintPostProcessTask&);
+public:
+
+	PxsSolverConstraintPostProcessTask(DynamicsContext& context,
+		ThreadContext& threadContext,
+		const SolverIslandObjects& objects,				  
+		const PxU32 solverBodyOffset,
+		PxU32 startIndex,
+		PxU32 stride,
+		PxsMaterialManager* materialManager,
+		PxsContactManagerOutputIterator& iterator) :
+		mContext(context), 
+		mThreadContext(threadContext),
+		mObjects(objects),
+		mSolverBodyOffset(solverBodyOffset),
+		mStartIndex(startIndex),
+		mStride(stride),
+		mMaterialManager(materialManager),
+		mOutputs(iterator)
+	{}
+
+	void mergeContacts(CompoundContactManager& header, ThreadContext& threadContext)
+	{
+		Gu::ContactBuffer& buffer = threadContext.mContactBuffer;
+		PxsMaterialInfo materialInfo[Gu::ContactBuffer::MAX_CONTACTS];
+		PxU32 size = 0;
+
+		for(PxU32 a = 0; a < header.mStride; ++a)
+		{
+			PxsContactManager* manager = mThreadContext.orderedContactList[a+header.mStartIndex]->contactManager;
+			PxcNpWorkUnit& unit = manager->getWorkUnit();
+			PxsContactManagerOutput& output = mOutputs.getContactManager(unit.mNpIndex);
+			PxContactStreamIterator iter(output.contactPatches, output.contactPoints, output.getInternalFaceIndice(), output.nbPatches, output.nbContacts);
+
+			PxU32 origSize = size;
+			PX_UNUSED(origSize);
+			if(!iter.forceNoResponse)
+			{
+				while(iter.hasNextPatch())
+				{
+					iter.nextPatch();
+					while(iter.hasNextContact())
+					{
+						PX_ASSERT(size < Gu::ContactBuffer::MAX_CONTACTS);
+						iter.nextContact();
+						PxsMaterialInfo& info = materialInfo[size];
+						Gu::ContactPoint& point = buffer.contacts[size++];
+						point.dynamicFriction = iter.getDynamicFriction();
+						point.staticFriction = iter.getStaticFriction();
+						point.restitution = iter.getRestitution();
+						point.internalFaceIndex1 = iter.getFaceIndex1();
+						point.materialFlags = PxU8(iter.getMaterialFlags());
+						point.maxImpulse = iter.getMaxImpulse();
+						point.targetVel = iter.getTargetVel();
+						point.normal = iter.getContactNormal();
+						point.point = iter.getContactPoint();
+						point.separation = iter.getSeparation();
+						info.mMaterialIndex0 = iter.getMaterialIndex0();
+						info.mMaterialIndex1 = iter.getMaterialIndex1();
+					}
+				}
+				PX_ASSERT(output.nbContacts == (size - origSize));
+			}	
+		}
+
+		PxU32 origSize = size;
+#if PX_CONTACT_REDUCTION
+		ContactReduction<6> reduction(buffer.contacts, materialInfo, size);
+		reduction.reduceContacts();
+		//OK, now we write back the contacts...
+
+		PxU8 histo[Gu::ContactBuffer::MAX_CONTACTS];
+		PxMemZero(histo, sizeof(histo));
+
+		size = 0;
+		for(PxU32 a = 0; a < reduction.mNumPatches; ++a)
+		{
+			ReducedContactPatch& patch = reduction.mPatches[a];
+			for(PxU32 b = 0; b < patch.numContactPoints; ++b)
+			{
+				histo[patch.contactPoints[b]] = 1;
+				++size;
+			}
+		}
+#endif
+
+		PxU16* PX_RESTRICT data = reinterpret_cast<PxU16*>(threadContext.mConstraintBlockStream.reserve(size * sizeof(PxU16), mThreadContext.mConstraintBlockManager));
+		header.forceBufferList = data;
+		
+
+#if PX_CONTACT_REDUCTION
+		const PxU32 reservedSize = size;
+		PX_UNUSED(reservedSize);
+		size = 0;
+		for(PxU32 a = 0; a < origSize; ++a)
+		{
+			if(histo[a])
+			{
+				if(size != a)
+				{
+					buffer.contacts[size] = buffer.contacts[a];
+					materialInfo[size] = materialInfo[a];
+				}
+				data[size] = Ps::to16(a);
+				size++;
+			}
+		}
+		PX_ASSERT(reservedSize >= size);
+#else
+		for(PxU32 a = 0; a < size; ++a)
+			data[a] = a;
+#endif
+
+
+		PxU32 contactForceByteSize = size * sizeof(PxReal);
+
+
+		PxsContactManagerOutput& output = mOutputs.getContactManager(header.unit->mNpIndex);
+
+		PxU16 compressedContactSize;
+
+		physx::writeCompressedContact(buffer.contacts, size, NULL, output.nbContacts, output.contactPatches, output.contactPoints, compressedContactSize,
+			reinterpret_cast<PxReal*&>(output.contactForces), contactForceByteSize, mMaterialManager, false, 
+			false, materialInfo, output.nbPatches, 0, &mThreadContext.mConstraintBlockManager, &threadContext.mConstraintBlockStream, false);
+	}
+
+	// Runs mergeContacts on the compound constraints in [mStartIndex, mStartIndex + mStride),
+	// using a thread context obtained from mContext and handed back before returning.
+	virtual void runInternal()
+	{
+		ThreadContext* scratchContext = mContext.getThreadContext();
+		//TODO - we need to do this somewhere else
+		//scratchContext->mContactBlockStream.reset();
+		scratchContext->mConstraintBlockStream.reset();
+
+		const PxU32 lastIndex = mStartIndex + mStride;
+		PxU32 headerIndex = mStartIndex;
+		while(headerIndex < lastIndex)
+		{
+			mergeContacts(mThreadContext.compoundConstraints[headerIndex], *scratchContext);
+			++headerIndex;
+		}
+
+		mContext.putThreadContext(scratchContext);
+	}
+
+	// Returns the name string of this task.
+	virtual const char* getName() const
+	{
+		return "PxsDynamics.solverConstraintPostProcess";
+	}
+
+
+	DynamicsContext&			mContext;			// source of the scratch thread context borrowed in runInternal()
+	ThreadContext&			mThreadContext;		// thread context owning compoundConstraints and mConstraintBlockManager
+	const SolverIslandObjects		mObjects;		// copy of the island objects (presumably supplied at construction - confirm against the constructor)
+	PxU32						mSolverBodyOffset;	// presumably the island's solver body offset - not read by runInternal()
+	PxU32						mStartIndex;		// first compoundConstraints index processed by runInternal()
+	PxU32						mStride;			// number of compoundConstraints entries processed by runInternal()
+	PxsMaterialManager*			mMaterialManager;	// passed to writeCompressedContact by mergeContacts()
+	PxsContactManagerOutputIterator& mOutputs;	// used to look up the contact manager output by work unit mNpIndex
+};
+
+// Task that sets the threshold stream to its final size and then builds the force-change
+// threshold stream from the two buffered "exceeded force" streams held by the dynamics context
+// (the one indexed by mCurrentIndex and the other one, which the code treats as the previous set).
+class PxsForceThresholdTask  : public Cm::Task
+{
+	DynamicsContext&		mDynamicsContext;
+
+	//Declared only, never defined here
+	PxsForceThresholdTask& operator=(const PxsForceThresholdTask&);
+public:
+
+	PxsForceThresholdTask(DynamicsContext& context)
+		: mDynamicsContext(context)
+	{
+	}
+
+	// Rebuilds the current exceeded-force stream from the threshold table, then writes the
+	// force-change stream:
+	//  - every entry of the previous stream is emitted first, with its accumulated force zeroed
+	//    when the pair is not found among the current entries (lost pair);
+	//  - current entries that have no match in the previous stream are emitted afterwards.
+	// When the previous stream is empty the current stream is copied across unchanged.
+	void createForceChangeThresholdStream()
+	{
+		DynamicsContext& dyn = mDynamicsContext;
+
+		ThresholdStream& stream = dyn.getThresholdStream();
+		ThresholdTable& table = dyn.getThresholdTable();
+		table.build(stream);
+
+		ThresholdStream& exceededNow = *dyn.mExceededForceThresholdStream[dyn.mCurrentIndex];
+		ThresholdStream& exceededBefore = *dyn.mExceededForceThresholdStream[1 - dyn.mCurrentIndex];
+		exceededNow.forceSize_Unsafe(0);
+
+		//collect the pairs whose accumulated force is above threshold * dt
+		for(PxU32 pairIdx = 0; pairIdx < table.mPairsSize; ++pairIdx)
+		{
+			ThresholdTable::Pair& entry = table.mPairs[pairIdx];
+			ThresholdStreamElement& streamElem = stream[entry.thresholdStreamIndex];
+			if(!(entry.accumulatedForce > streamElem.threshold * dyn.mDt))
+				continue;
+
+			streamElem.accumulatedForce = entry.accumulatedForce;
+			exceededNow.pushBack(streamElem);
+		}
+
+		ThresholdStream& changed = dyn.getForceChangedThresholdStream();
+		changed.forceSize_Unsafe(0);
+		Ps::Array<PxU32>& mask = dyn.mExceededForceThresholdStreamMask;
+
+		const PxU32 numBefore = exceededBefore.size();
+		const PxU32 numNow = exceededNow.size();
+
+		if(numBefore == 0)
+		{
+			//nothing to compare against: the current stream is the result
+			changed.reserve(numNow);
+			changed.forceSize_Unsafe(numNow);
+			PxMemCopy(changed.begin(), exceededNow.begin(), sizeof(ThresholdStreamElement) * numNow);
+			return;
+		}
+
+		table.build(exceededBefore);
+
+		//mask layout: [0, numBefore) -> previous entries, [numBefore, numTotal) -> current entries
+		const PxU32 numTotal = numBefore + numNow;
+		mask.reserve(numTotal);
+		mask.forceSize_Unsafe(numTotal);
+
+		for(PxU32 m = 0; m < numTotal; ++m)
+			mask[m] = 1;
+
+		//clear the mask on both sides for every current entry that is also in the previous stream
+		for(PxU32 c = 0; c < numNow; ++c)
+		{
+			PxU32 matchPos;
+			if(table.check(exceededBefore, exceededNow[c], matchPos))
+			{
+				mask[matchPos] = 0;
+				mask[c + numBefore] = 0;
+			}
+		}
+
+		//previous entries: lost pairs report zero force, persistent pairs keep their stored force
+		for(PxU32 p = 0; p < numBefore; ++p)
+		{
+			ThresholdStreamElement& beforeElem = exceededBefore[p];
+			ThresholdStreamElement out;
+			out = beforeElem;
+			if(mask[p])
+				out.accumulatedForce = 0.f;
+			changed.pushBack(out);
+		}
+
+		//current entries without a match in the previous stream
+		for(PxU32 c = 0; c < numNow; ++c)
+		{
+			if(mask[c + numBefore])
+			{
+				ThresholdStreamElement& nowElem = exceededNow[c];
+				ThresholdStreamElement out;
+				out = nowElem;
+				changed.pushBack(out);
+			}
+		}
+	}
+
+	// Sets the threshold stream size from mThresholdStreamOut, then builds the force-change stream.
+	virtual void runInternal()
+	{
+		ThresholdStream& stream = mDynamicsContext.getThresholdStream();
+		stream.forceSize_Unsafe(PxU32(mDynamicsContext.mThresholdStreamOut));
+		createForceChangeThresholdStream();
+	}
+
+	virtual const char* getName() const
+	{
+		return "PxsDynamics.createForceChangeThresholdStream";
+	}
+};
+
+
+// Ordering functor for solver constraint descriptors, keyed on the index of the Constraint that
+// each descriptor's constraint pointer refers to. Note that, despite the name, the descriptor with
+// the larger index is the one that orders first.
+struct ConstraintLess
+{
+	bool operator()(const PxSolverConstraintDesc& left, const PxSolverConstraintDesc& right) const
+	{
+		const Constraint* lhs = reinterpret_cast<Constraint*>(left.constraint);
+		const Constraint* rhs = reinterpret_cast<Constraint*>(right.constraint);
+		return rhs->index < lhs->index;
+	}
+};
+
+// Orders indexed contact manager pointers by ascending work unit index.
+struct ArticulationSortPredicate
+{
+	bool operator()(const PxsIndexedContactManager*& left, const PxsIndexedContactManager*& right) const
+	{
+		PxcNpWorkUnit& leftUnit = left->contactManager->getWorkUnit();
+		PxcNpWorkUnit& rightUnit = right->contactManager->getWorkUnit();
+		return leftUnit.index < rightUnit.index;
+	}
+};
+
+// Task that processes a batch of articulations: fetches each articulation's solver descriptor,
+// runs ArticulationPImpl::computeUnconstrainedVelocities on it, and folds the iteration counts and
+// data sizes of the batch into the island thread context with atomic max operations.
+class SolverArticulationUpdateTask : public Cm::Task
+{
+	ThreadContext& mIslandThreadContext;
+
+	Articulation** mArticulations;
+	ArticulationSolverDesc* mArticulationDescArray;
+	PxU32 mNbToProcess;
+
+	Dy::DynamicsContext& mContext;
+	PxU32 mStartIdx;
+
+public:
+
+	static const PxU32 NbArticulationsPerTask = 8;
+
+	// articulations / articulationDescArray point at the first of nbToProcess entries handled by
+	// this task; startIdx is the offset from mContactDescPtr of the first articulation's
+	// constraint descriptors.
+	SolverArticulationUpdateTask(ThreadContext& islandThreadContext, Articulation** articulations, ArticulationSolverDesc* articulationDescArray, PxU32 nbToProcess, Dy::DynamicsContext& context,
+		PxU32 startIdx)
+		: mIslandThreadContext(islandThreadContext)
+		, mArticulations(articulations)
+		, mArticulationDescArray(articulationDescArray)
+		, mNbToProcess(nbToProcess)
+		, mContext(context)
+		, mStartIdx(startIdx)
+	{
+	}
+
+	virtual const char* getName() const
+	{
+		return "SolverArticulationUpdateTask";
+	}
+
+	virtual void runInternal()
+	{
+		ThreadContext& scratch = *mContext.getThreadContext();
+
+		//Clear in case there's some left-over memory in this context, for which the block has already been freed
+		scratch.mConstraintBlockStream.reset();
+
+		//Batch-local maxima, published to the island thread context once the loop is done
+		PxU32 batchVelIters = 0;
+		PxU32 batchPosIters = 0;
+		PxU32 batchArticLength = 0;
+		PxU32 batchSolverArticLength = 0;
+
+		for(PxU32 n = 0; n < mNbToProcess; ++n)
+		{
+			ArticulationSolverDesc& desc = mArticulationDescArray[n];
+			Articulation& artic = *mArticulations[n];
+			artic.getSolverDesc(desc);
+
+			//Each articulation has DY_ARTICULATION_MAX_SIZE descriptor slots starting at mStartIdx
+			const PxU32 descOffset = mStartIdx + n * DY_ARTICULATION_MAX_SIZE;
+
+			PxU32 acCount;
+			const PxU32 descCount = ArticulationPImpl::computeUnconstrainedVelocities(desc, mContext.mDt, scratch.mConstraintBlockStream,
+				mIslandThreadContext.mContactDescPtr + descOffset, acCount, mContext.getScratchAllocator(),
+				mIslandThreadContext.mConstraintBlockManager, mContext.getGravity(), mContext.getContextId());
+
+			desc.numInternalConstraints = Ps::to8(descCount);
+
+			batchArticLength = PxMax(batchArticLength, PxU32(desc.totalDataSize));
+			batchSolverArticLength = PxMax(batchSolverArticLength, PxU32(desc.solverDataSize));
+
+			//High byte: velocity iterations, low byte: position iterations
+			const PxU16 iterWord = artic.getIterationCounts();
+			batchVelIters = PxMax<PxU32>(PxU32(iterWord >> 8), batchVelIters);
+			batchPosIters = PxMax<PxU32>(PxU32(iterWord & 0xff), batchPosIters);
+		}
+
+		Ps::atomicMax(reinterpret_cast<PxI32*>(&mIslandThreadContext.mMaxSolverPositionIterations), PxI32(batchPosIters));
+		Ps::atomicMax(reinterpret_cast<PxI32*>(&mIslandThreadContext.mMaxSolverVelocityIterations), PxI32(batchVelIters));
+		Ps::atomicMax(reinterpret_cast<PxI32*>(&mIslandThreadContext.mMaxArticulationLength), PxI32(batchArticLength));
+		Ps::atomicMax(reinterpret_cast<PxI32*>(&mIslandThreadContext.mMaxArticulationSolverLength), PxI32(batchSolverArticLength));
+
+		mContext.putThreadContext(&scratch);
+	}
+
+private:
+	PX_NOCOPY(SolverArticulationUpdateTask)
+};
+
+
+// Orders indexed contact managers lexicographically by their work unit's
+// (mTransformCache0, mTransformCache1) pair.
+struct EnhancedSortPredicate
+{
+	bool operator()(const PxsIndexedContactManager& left, const PxsIndexedContactManager& right) const
+	{
+		PxcNpWorkUnit& lhs = left.contactManager->getWorkUnit();
+		PxcNpWorkUnit& rhs = right.contactManager->getWorkUnit();
+
+		//Primary key first; the secondary key only decides ties
+		if(lhs.mTransformCache0 != rhs.mTransformCache0)
+			return lhs.mTransformCache0 < rhs.mTransformCache0;
+
+		return lhs.mTransformCache1 < rhs.mTransformCache1;
+	}
+};
+
+
+class PxsSolverStartTask : public Cm::Task
+{
+	PxsSolverStartTask& operator=(const PxsSolverStartTask&);
+public:
+
+	// Stores the contexts, island objects (copied by value) and options that the solver start
+	// steps operate on.
+	PxsSolverStartTask(DynamicsContext& context,
+		IslandContext& islandContext,
+		const SolverIslandObjects& objects,
+		const PxU32 solverBodyOffset,
+		const PxU32 kinematicCount,
+		IG::SimpleIslandManager& islandManager,
+		PxU32* bodyRemapTable,
+		PxsMaterialManager* materialManager,
+		PxsContactManagerOutputIterator& iterator,
+		bool enhancedDeterminism)
+		: mContext(context)
+		, mIslandContext(islandContext)
+		, mObjects(objects)
+		, mSolverBodyOffset(solverBodyOffset)
+		, mKinematicCount(kinematicCount)
+		, mIslandManager(islandManager)
+		, mBodyRemapTable(bodyRemapTable)
+		, mMaterialManager(materialManager)
+		, mOutputs(iterator)
+		, mEnhancedDeterminism(enhancedDeterminism)
+	{
+	}
+
+	// Acquires a thread context for the island, resets its counters and binds the island's object
+	// arrays to it, then:
+	//  - walks the node list of every island, gathering articulations and rigid bodies into the
+	//    bound arrays and recording each rigid body's slot in the body remap table;
+	//  - walks the contact edges of every island, filling mObjects.contactManagers with one indexed
+	//    entry per edge that has a contact manager;
+	//  - optionally sorts those entries with EnhancedSortPredicate;
+	//  - stores the number of entries written in mIslandContext.mCounts.contactManagers.
+	void startTasks()
+	{
+		PX_PROFILE_ZONE("Dynamics.solveGroup", mContext.getContextId());
+
+		ThreadContext& threadCtx = *mContext.getThreadContext();
+		mIslandContext.mThreadContext = &threadCtx;
+
+		//Reset per-island state
+		threadCtx.mMaxSolverPositionIterations = 0;
+		threadCtx.mMaxSolverVelocityIterations = 0;
+		threadCtx.mAxisConstraintCount = 0;
+		threadCtx.mContactDescPtr = threadCtx.contactConstraintDescArray;
+		threadCtx.mFrictionDescPtr = threadCtx.frictionConstraintDescArray.begin();
+		threadCtx.mNumDifferentBodyConstraints = 0;
+		threadCtx.mNumSelfConstraintBlocks = 0;
+		threadCtx.mNumSelfConstraints = 0;
+		threadCtx.mNumDifferentBodyFrictionConstraints = 0;
+		threadCtx.mNumSelfConstraintFrictionBlocks = 0;
+		threadCtx.mNumSelfFrictionConstraints = 0;
+		threadCtx.numContactConstraintBatches = 0;
+		threadCtx.contactDescArraySize = 0;
+
+		//Point the thread context at the island's buffers
+		threadCtx.contactConstraintDescArray = mObjects.constraintDescs;
+		threadCtx.orderedContactConstraints = mObjects.orderedConstraintDescs;
+		threadCtx.mContactDescPtr = mObjects.constraintDescs;
+		threadCtx.tempConstraintDescArray = mObjects.tempConstraintDescs;
+		threadCtx.contactConstraintBatchHeaders = mObjects.constraintBatchHeaders;
+		threadCtx.motionVelocityArray = mObjects.motionVelocities;
+		threadCtx.mBodyCoreArray = mObjects.bodyCoreArray;
+		threadCtx.mRigidBodyArray = mObjects.bodies;
+		threadCtx.mArticulationArray = mObjects.articulations;
+		threadCtx.bodyRemapTable = mObjects.bodyRemapTable;
+		threadCtx.mNodeIndexArray = mObjects.nodeIndexArray;
+
+		//Patch friction requests no friction constraint entries
+		PxU32 frictionConstraintCount = 0;
+		if(mContext.getFrictionType() != PxFrictionType::ePATCH)
+			frictionConstraintCount = PxU32(mIslandContext.mCounts.contactManagers);
+		threadCtx.resizeArrays(frictionConstraintCount, mIslandContext.mCounts.articulations);
+
+		PxsBodyCore** PX_RESTRICT coreOut = threadCtx.mBodyCoreArray;
+		PxsRigidBody** PX_RESTRICT rigidOut = threadCtx.mRigidBodyArray;
+		Articulation** PX_RESTRICT articOut = threadCtx.mArticulationArray;
+		PxU32* PX_RESTRICT remap = threadCtx.bodyRemapTable;
+		PxU32* PX_RESTRICT nodeIndexOut = threadCtx.mNodeIndexArray;
+
+		const PxU32 islandCount = mObjects.numIslands;
+		const IG::IslandId* const ids = mObjects.islandIds;
+
+		const IG::IslandSim& islandSim = mIslandManager.getAccurateIslandSim();
+
+		//Pass 1: gather the nodes of every island
+		PxU32 bodyIndex = 0;
+		PxU32 articIndex = 0;
+		for(PxU32 islandIdx = 0; islandIdx < islandCount; ++islandIdx)
+		{
+			const IG::Island& island = islandSim.getIsland(ids[islandIdx]);
+
+			IG::NodeIndex nodeIdx = island.mRootNode;
+			while(nodeIdx.isValid())
+			{
+				const IG::Node& node = islandSim.getNode(nodeIdx);
+
+				if(node.getNodeType() != IG::Node::eARTICULATION_TYPE)
+				{
+					PX_ASSERT(bodyIndex < (mIslandContext.mCounts.bodies + mContext.mKinematicCount + 1));
+					PxsRigidBody* body = node.getRigidBody();
+					rigidOut[bodyIndex] = body;
+					coreOut[bodyIndex] = &body->getCore();
+					nodeIndexOut[bodyIndex] = nodeIdx.index();
+					remap[islandSim.getActiveNodeIndex(nodeIdx)] = bodyIndex;
+					++bodyIndex;
+				}
+				else
+				{
+					articOut[articIndex] = node.getArticulation();
+					++articIndex;
+				}
+
+				nodeIdx = node.mNextNode;
+			}
+		}
+
+		//Pass 2: build the indexed contact managers from the contact edges of every island
+		PxsIndexedContactManager* indexedManagers = mObjects.contactManagers;
+		PxU32 contactsWritten = 0;
+
+		for(PxU32 islandIdx = 0; islandIdx < islandCount; ++islandIdx)
+		{
+			const IG::Island& island = islandSim.getIsland(ids[islandIdx]);
+
+			for(IG::EdgeIndex edgeIdx = island.mFirstEdge[IG::Edge::eCONTACT_MANAGER];
+				edgeIdx != IG_INVALID_EDGE;
+				edgeIdx = islandSim.getEdge(edgeIdx).mNextIslandEdge)
+			{
+				PxsContactManager* cm = mIslandManager.getContactManager(edgeIdx);
+				if(!cm)
+					continue;	//edges without a contact manager produce no entry
+
+				const IG::NodeIndex nodeIndex1 = islandSim.getNodeIndex1(edgeIdx);
+				const IG::NodeIndex nodeIndex2 = islandSim.getNodeIndex2(edgeIdx);
+
+				PxsIndexedContactManager& entry = indexedManagers[contactsWritten++];
+				entry.contactManager = cm;
+
+				//First body: articulation link, kinematic or dynamic body
+				PX_ASSERT(!nodeIndex1.isStaticBody());
+				const IG::Node& node1 = islandSim.getNode(nodeIndex1);
+				if(node1.getNodeType() == IG::Node::eARTICULATION_TYPE)
+				{
+					entry.indexType0 = PxsIndexedInteraction::eARTICULATION;
+					entry.solverBody0 = size_t(node1.getArticulation()) | nodeIndex1.articulationLinkId();
+				}
+				else if(node1.isKinematic())
+				{
+					entry.indexType0 = PxsIndexedInteraction::eKINEMATIC;
+					entry.solverBody0 = islandSim.getActiveNodeIndex(nodeIndex1);
+					PX_ASSERT(entry.solverBody0 < (mIslandContext.mCounts.bodies + mContext.mKinematicCount + 1));
+				}
+				else
+				{
+					entry.indexType0 = PxsIndexedInteraction::eBODY;
+					entry.solverBody0 = remap[islandSim.getActiveNodeIndex(nodeIndex1)];
+					PX_ASSERT(entry.solverBody0 < (mIslandContext.mCounts.bodies + mContext.mKinematicCount + 1));
+				}
+
+				//Second body: may additionally be the static world
+				if(nodeIndex2.isStaticBody())
+				{
+					entry.indexType1 = PxsIndexedInteraction::eWORLD;
+					continue;
+				}
+
+				const IG::Node& node2 = islandSim.getNode(nodeIndex2);
+				if(node2.getNodeType() == IG::Node::eARTICULATION_TYPE)
+				{
+					entry.indexType1 = PxsIndexedInteraction::eARTICULATION;
+					entry.solverBody1 = size_t(node2.getArticulation()) | nodeIndex2.articulationLinkId();
+				}
+				else if(node2.isKinematic())
+				{
+					entry.indexType1 = PxsIndexedInteraction::eKINEMATIC;
+					entry.solverBody1 = islandSim.getActiveNodeIndex(nodeIndex2);
+					PX_ASSERT(entry.solverBody1 < (mIslandContext.mCounts.bodies + mContext.mKinematicCount + 1));
+				}
+				else
+				{
+					entry.indexType1 = PxsIndexedInteraction::eBODY;
+					entry.solverBody1 = remap[islandSim.getActiveNodeIndex(nodeIndex2)];
+					PX_ASSERT(entry.solverBody1 < (mIslandContext.mCounts.bodies + mContext.mKinematicCount + 1));
+				}
+			}
+		}
+
+		if(mEnhancedDeterminism)
+			Ps::sort(indexedManagers, contactsWritten, EnhancedSortPredicate());
+
+		mIslandContext.mCounts.contactManagers = contactsWritten;
+	}
+
+	// Calls DynamicsContext::preIntegrationParallel for the island's bodies, passing the solver
+	// body and solver body data pools offset by mSolverBodyOffset and this task's continuation.
+	void integrate()
+	{
+		ThreadContext& threadCtx = *mIslandContext.mThreadContext;
+
+		PxSolverBody* islandSolverBodies = mContext.mSolverBodyPool.begin() + mSolverBodyOffset;
+		PxSolverBodyData* islandSolverBodyData = mContext.mSolverBodyDataPool.begin() + mSolverBodyOffset;
+
+		{
+			PX_PROFILE_ZONE("Dynamics.updateVelocities", mContext.getContextId());
+
+			mContext.preIntegrationParallel(
+				mContext.mDt,
+				threadCtx.mBodyCoreArray,
+				mObjects.bodies,
+				threadCtx.mNodeIndexArray,
+				mIslandContext.mCounts.bodies,
+				islandSolverBodies,
+				islandSolverBodyData,
+				threadCtx.motionVelocityArray,
+				threadCtx.mMaxSolverPositionIterations,
+				threadCtx.mMaxSolverVelocityIterations,
+				*mCont);
+		}
+	}
+
+	// Creates one SolverArticulationUpdateTask per group of up to
+	// SolverArticulationUpdateTask::NbArticulationsPerTask articulations. Each task is allocated
+	// from the context's task pool, given this task's continuation and then released.
+	void articulationTask()
+	{
+		ThreadContext& threadCtx = *mIslandContext.mThreadContext;
+		ArticulationSolverDesc* descs = threadCtx.getArticulations().begin();
+
+		const PxU32 groupSize = SolverArticulationUpdateTask::NbArticulationsPerTask;
+
+		PxU32 first = 0;
+		while(first < mIslandContext.mCounts.articulations)
+		{
+			//The last group may be shorter than groupSize
+			const PxU32 nbInGroup = PxMin(groupSize, mIslandContext.mCounts.articulations - first);
+
+			void* taskMemory = mContext.getTaskPool().allocate(sizeof(SolverArticulationUpdateTask));
+			SolverArticulationUpdateTask* task = PX_PLACEMENT_NEW(taskMemory, SolverArticulationUpdateTask)(threadCtx,
+				mObjects.articulations + first, descs + first, nbInGroup, mContext,
+				first*DY_ARTICULATION_MAX_SIZE);
+
+			task->setContinuation(mCont);
+			task->removeReference();
+
+			first += groupSize;
+		}
+	}
+
+	// Builds the island's solver constraint descriptor list and spawns the contact merging tasks.
+	//
+	// Steps, in order:
+	//  1. Writes one descriptor per eCONSTRAINT edge of every island, then sorts those descriptors
+	//     with ConstraintLess.
+	//  2. Orders the contact managers with two counting-sort passes: the first keyed on body 0
+	//     (mObjects.contactManagers -> tempContactList), the second keyed on body 1
+	//     (tempContactList -> orderedContactList). In both passes any pair that involves an
+	//     articulation is appended after all other pairs; that tail is finally sorted with
+	//     ArticulationSortPredicate.
+	//  3. Walks the ordered contact managers and writes one descriptor per run of consecutive
+	//     managers. A run is closed when the body pair changes, when the run's first descriptor
+	//     refers to an articulation link, when adding the next manager would exceed
+	//     Gu::ContactBuffer::MAX_CONTACTS, or when the next manager is changeable. Runs without
+	//     contacts produce no descriptor; runs spanning more than one manager also get a
+	//     CompoundContactManager header recording the first manager's original output.
+	//  4. Spawns one PxsSolverConstraintPostProcessTask per group of up to 8 headers.
+	//
+	// On exit mContactDescPtr points one past the last descriptor written and contactDescArraySize
+	// holds the number of descriptors counted from contactConstraintDescArray.
+	void setupDescTask()
+	{
+		ThreadContext& mThreadContext = *mIslandContext.mThreadContext;
+		PxSolverConstraintDesc* contactDescPtr = mThreadContext.mContactDescPtr;
+
+		//PxU32 constraintCount = mCounts.constraints + mCounts.contactManagers;
+
+		PxU32 nbIslands = mObjects.numIslands;
+		const IG::IslandId* const islandIds = mObjects.islandIds;
+
+		const IG::IslandSim& islandSim = mIslandManager.getAccurateIslandSim();
+
+		//Step 1: one descriptor per constraint (joint) edge
+		for(PxU32 i = 0; i < nbIslands; ++i)
+		{
+			const IG::Island& island = islandSim.getIsland(islandIds[i]);
+
+			IG::EdgeIndex edgeId = island.mFirstEdge[IG::Edge::eCONSTRAINT];
+
+			while(edgeId != IG_INVALID_EDGE)
+			{
+				PxSolverConstraintDesc& desc = *contactDescPtr;
+				
+				const IG::Edge& edge = islandSim.getEdge(edgeId);
+				Dy::Constraint* constraint = mIslandManager.getConstraint(edgeId);
+				mContext.setDescFromIndices(desc, edgeId, mIslandManager, mBodyRemapTable, mSolverBodyOffset);
+				desc.constraint = reinterpret_cast<PxU8*>(constraint);
+				//At this stage the constraintLengthOver16 field carries the constraint type tag
+				desc.constraintLengthOver16 = DY_SC_TYPE_RB_1D;
+				contactDescPtr++;
+				edgeId = edge.mNextIslandEdge;
+			}
+
+		}
+
+#if 1
+		Ps::sort(mThreadContext.mContactDescPtr, PxU32(contactDescPtr - mThreadContext.mContactDescPtr), ConstraintLess());
+#endif
+
+
+		//Size both contact lists to hold one pointer per contact manager
+		mThreadContext.orderedContactList.forceSize_Unsafe(0);
+		mThreadContext.orderedContactList.reserve(mIslandContext.mCounts.contactManagers);
+		mThreadContext.orderedContactList.forceSize_Unsafe(mIslandContext.mCounts.contactManagers);
+		mThreadContext.tempContactList.forceSize_Unsafe(0);
+		mThreadContext.tempContactList.reserve(mIslandContext.mCounts.contactManagers);
+		mThreadContext.tempContactList.forceSize_Unsafe(mIslandContext.mCounts.contactManagers);
+
+		const PxsIndexedContactManager** constraints = mThreadContext.orderedContactList.begin();
+
+
+		//OK, we sort the orderedContactList 
+
+		mThreadContext.compoundConstraints.forceSize_Unsafe(0);
+		if(mIslandContext.mCounts.contactManagers)
+		{
+			{
+				mThreadContext.sortIndexArray.forceSize_Unsafe(0);
+
+				PX_COMPILE_TIME_ASSERT(PxsIndexedInteraction::eBODY == 0);
+				PX_COMPILE_TIME_ASSERT(PxsIndexedInteraction::eKINEMATIC == 1);
+
+				//Indexed by index type: eBODY entries are shifted by the kinematic count, eKINEMATIC entries are not
+				const PxI32 offsetMap[] = {PxI32(mContext.mKinematicCount), 0};
+
+				const PxU32 totalBodies = mContext.mKinematicCount + mIslandContext.mCounts.bodies+1;
+
+				mThreadContext.sortIndexArray.reserve(totalBodies);
+				mThreadContext.sortIndexArray.forceSize_Unsafe(totalBodies);
+				PxMemZero(mThreadContext.sortIndexArray.begin(), totalBodies * sizeof(PxU32));
+
+				//Iterate over the array based on solverBodyDatapool, creating a list of sorted constraints (in order of body pair)
+				//We only do this with contacts. It's important that this is done this way because we don't want to break our rules that all joints
+				//appear before all contacts in the constraint list otherwise we will lose all guarantees about sorting joints.
+				
+				//Step 2, pass 1 (body 0): histogram of the non-articulation pairs
+				for(PxU32 a = 0; a < mIslandContext.mCounts.contactManagers; ++a)
+				{
+					PX_ASSERT(mObjects.contactManagers[a].indexType0 != PxsIndexedInteraction::eWORLD);
+					//Index first body...
+					PxU8 indexType = mObjects.contactManagers[a].indexType0;
+					if(indexType != PxsIndexedInteraction::eARTICULATION && mObjects.contactManagers[a].indexType1 != PxsIndexedInteraction::eARTICULATION)
+					{
+						PX_ASSERT((indexType == PxsIndexedInteraction::eBODY) || (indexType == PxsIndexedInteraction::eKINEMATIC));
+
+						PxI32 index = PxI32(mObjects.contactManagers[a].solverBody0 + offsetMap[indexType]);
+						PX_ASSERT(index >= 0);
+						mThreadContext.sortIndexArray[PxU32(index)]++;
+					}
+				}
+
+				PxU32 accumulatedCount = 0;
+
+				//Turn the histogram into start offsets, accumulating from the last bucket backwards
+				for(PxU32 a = mThreadContext.sortIndexArray.size(); a > 0; --a)
+				{
+					PxU32 ind = a - 1;
+					PxU32 val = mThreadContext.sortIndexArray[ind];
+					mThreadContext.sortIndexArray[ind] = accumulatedCount;
+					accumulatedCount += val;
+				}
+
+				//OK, now copy across data to orderedConstraintDescs, pushing articulations to the end...
+				for(PxU32 a = 0; a < mIslandContext.mCounts.contactManagers; ++a)
+				{
+					//Index first body...
+					PxU8 indexType = mObjects.contactManagers[a].indexType0;
+					if(indexType != PxsIndexedInteraction::eARTICULATION && mObjects.contactManagers[a].indexType1 != PxsIndexedInteraction::eARTICULATION)
+					{
+						PX_ASSERT((indexType == PxsIndexedInteraction::eBODY) || (indexType == PxsIndexedInteraction::eKINEMATIC));
+
+						PxI32 index = PxI32(mObjects.contactManagers[a].solverBody0 + offsetMap[indexType]);
+						PX_ASSERT(index >= 0);
+						mThreadContext.tempContactList[mThreadContext.sortIndexArray[PxU32(index)]++] = &mObjects.contactManagers[a];
+					}
+					else
+					{
+						mThreadContext.tempContactList[accumulatedCount++] = &mObjects.contactManagers[a];
+					}
+				}
+
+				//Now do the same again with bodyB, being careful not to overwrite the joints
+				PxMemZero(mThreadContext.sortIndexArray.begin(), totalBodies * sizeof(PxU32));
+
+
+				//Step 2, pass 2 (body 1): histogram.
+				//Both index types must be read from the pair stored in tempContactList[a]: pass 1 has
+				//reordered that list, so mObjects.contactManagers[a] is in general a different pair.
+				for(PxU32 a = 0; a < mIslandContext.mCounts.contactManagers; ++a)
+				{
+					const PxsIndexedContactManager* cm = mThreadContext.tempContactList[a];
+					//Index second body...
+					PxU8 indexType = cm->indexType1;
+					if(indexType != PxsIndexedInteraction::eARTICULATION && cm->indexType0 != PxsIndexedInteraction::eARTICULATION)
+					{
+						PX_ASSERT((indexType == PxsIndexedInteraction::eBODY) || (indexType == PxsIndexedInteraction::eKINEMATIC) || (indexType == PxsIndexedInteraction::eWORLD));
+
+						//Pairs against the world all go into bucket 0
+						PxI32 index = (indexType == PxsIndexedInteraction::eWORLD) ? 0 : PxI32(cm->solverBody1 + offsetMap[indexType]);
+						PX_ASSERT(index >= 0);
+						mThreadContext.sortIndexArray[PxU32(index)]++;
+					}
+				}
+
+				accumulatedCount = 0;
+				for(PxU32 a = mThreadContext.sortIndexArray.size(); a > 0; --a)
+				{
+					PxU32 ind = a - 1;
+					PxU32 val = mThreadContext.sortIndexArray[ind];
+					mThreadContext.sortIndexArray[ind] = accumulatedCount;
+					accumulatedCount += val;
+				}
+
+				//Everything from this position onwards involves an articulation
+				PxU32 articulationStartIndex = accumulatedCount;
+
+				//OK, now copy across data to orderedConstraintDescs, pushing articulations to the end...
+				for(PxU32 a = 0; a < mIslandContext.mCounts.contactManagers; ++a)
+				{
+					const PxsIndexedContactManager* cm = mThreadContext.tempContactList[a];
+					//Index second body... (same classification as the histogram loop above)
+					PxU8 indexType = cm->indexType1;
+					if(indexType != PxsIndexedInteraction::eARTICULATION && cm->indexType0 != PxsIndexedInteraction::eARTICULATION)
+					{
+						PX_ASSERT((indexType == PxsIndexedInteraction::eBODY) || (indexType == PxsIndexedInteraction::eKINEMATIC) || (indexType == PxsIndexedInteraction::eWORLD));
+
+						PxI32 index = (indexType == PxsIndexedInteraction::eWORLD) ? 0 : PxI32(cm->solverBody1 + offsetMap[indexType]);
+						PX_ASSERT(index >= 0);
+						constraints[mThreadContext.sortIndexArray[PxU32(index)]++] = cm;
+					}
+					else
+					{
+						constraints[accumulatedCount++] = cm;
+					}
+				}
+
+#if 1
+				Ps::sort(constraints + articulationStartIndex, accumulatedCount - articulationStartIndex, ArticulationSortPredicate());
+#endif
+			}
+
+			mThreadContext.mStartContactDescPtr = contactDescPtr;
+
+			mThreadContext.compoundConstraints.reserve(1024);
+			mThreadContext.compoundConstraints.forceSize_Unsafe(0);
+			//mThreadContext.compoundConstraints.forceSize_Unsafe(mCounts.contactManagers);
+
+			//Step 3: startDesc is the descriptor of the run currently being accumulated
+			PxSolverConstraintDesc* startDesc = contactDescPtr;
+			mContext.setDescFromIndices(*startDesc, *constraints[0], mSolverBodyOffset);
+			startDesc->constraint = reinterpret_cast<PxU8*>(constraints[0]->contactManager);
+			startDesc->constraintLengthOver16 = DY_SC_TYPE_RB_CONTACT;
+
+			PxsContactManagerOutput* startManagerOutput = &mOutputs.getContactManager(constraints[0]->contactManager->getWorkUnit().mNpIndex);
+			PxU32 contactCount = startManagerOutput->nbContacts;
+			PxU32 startIndex = 0;
+			PxU32 numHeaders = 0;
+			for(PxU32 a = 1; a < mIslandContext.mCounts.contactManagers; ++a)
+			{
+				//The candidate descriptor is built in the slot after the current write position
+				PxSolverConstraintDesc& desc = *(contactDescPtr+1);
+				mContext.setDescFromIndices(desc, *constraints[a], mSolverBodyOffset);
+
+				PxsContactManager* manager = constraints[a]->contactManager;
+				PxsContactManagerOutput& output = mOutputs.getContactManager(manager->getWorkUnit().mNpIndex);
+
+				desc.constraint = reinterpret_cast<PxU8*>(constraints[a]->contactManager);
+				desc.constraintLengthOver16 = DY_SC_TYPE_RB_CONTACT;
+
+				if (contactCount == 0)
+				{
+					//This is the first object in the pair
+					*startDesc = *(contactDescPtr + 1);
+					startIndex = a;
+					startManagerOutput = &output;
+				}
+				
+				if(startDesc->bodyA != desc.bodyA || startDesc->bodyB != desc.bodyB 
+					|| startDesc->linkIndexA != PxSolverConstraintDesc::NO_LINK || startDesc->linkIndexB != PxSolverConstraintDesc::NO_LINK
+					|| contactCount + output.nbContacts > Gu::ContactBuffer::MAX_CONTACTS
+					|| manager->isChangeable()
+					) //If this is the first thing and no contacts...then we skip
+				{
+					PxU32 stride = a - startIndex;
+					if(contactCount > 0)
+					{
+						if(stride > 1)
+						{
+							//The run covers several managers: record a header so their contacts can be merged
+							++numHeaders;
+							CompoundContactManager& header = mThreadContext.compoundConstraints.insert();
+							header.mStartIndex = startIndex;
+							header.mStride = Ps::to16(stride);	
+							header.mReducedContactCount = Ps::to16(contactCount);
+							PxsContactManager* manager1 = constraints[startIndex]->contactManager;
+							PxcNpWorkUnit& unit = manager1->getWorkUnit();
+
+							PX_ASSERT(startManagerOutput == &mOutputs.getContactManager(unit.mNpIndex));
+
+							header.unit = &unit;
+							header.cmOutput = startManagerOutput;
+							header.originalContactPatches = startManagerOutput->contactPatches;
+							header.originalContactPoints = startManagerOutput->contactPoints;
+							header.originalContactCount = startManagerOutput->nbContacts;
+							header.originalPatchCount	= startManagerOutput->nbPatches;
+							header.originalForceBuffer = reinterpret_cast<PxReal*>(startManagerOutput->contactForces);
+							header.originalStatusFlags = startManagerOutput->statusFlag;
+						}
+						//Keep the finished run's descriptor; the candidate slot becomes the new run start
+						startDesc = ++contactDescPtr;
+					}
+					else
+					{
+						//Copy back next contactDescPtr
+						*startDesc = *(contactDescPtr+1);
+					}
+					contactCount = 0;
+					startIndex = a;
+					startManagerOutput = &output;
+				}
+				contactCount += output.nbContacts;
+				
+			}
+
+			//Close the final run
+			PxU32 stride = mIslandContext.mCounts.contactManagers - startIndex;
+			if(contactCount > 0)
+			{
+				if(stride > 1)
+				{
+					++numHeaders;
+					CompoundContactManager& header = mThreadContext.compoundConstraints.insert();
+					header.mStartIndex = startIndex;
+					header.mStride = Ps::to16(stride);
+					header.mReducedContactCount = Ps::to16(contactCount);
+					PxsContactManager* manager = constraints[startIndex]->contactManager;
+					PxcNpWorkUnit& unit = manager->getWorkUnit();
+					header.unit = &unit;
+					header.cmOutput = startManagerOutput;
+					header.originalContactPatches = startManagerOutput->contactPatches;
+					header.originalContactPoints = startManagerOutput->contactPoints;
+					header.originalContactCount = startManagerOutput->nbContacts;
+					header.originalPatchCount	= startManagerOutput->nbPatches;
+					header.originalForceBuffer = reinterpret_cast<PxReal*>(startManagerOutput->contactForces);
+					header.originalStatusFlags = startManagerOutput->statusFlag;
+				}
+				contactDescPtr++;
+			}
+
+			//Step 4: hand the headers to post-process tasks, unrollSize headers per task
+			if(numHeaders)
+			{
+				const PxU32 unrollSize = 8;
+				for(PxU32 a = 0; a < numHeaders; a+= unrollSize)
+				{
+					PxsSolverConstraintPostProcessTask* postProcessTask = PX_PLACEMENT_NEW( mContext.getTaskPool().allocate(sizeof(PxsSolverConstraintPostProcessTask)), 
+						PxsSolverConstraintPostProcessTask)(mContext, mThreadContext, mObjects, mSolverBodyOffset, a, PxMin(unrollSize, numHeaders - a), mMaterialManager,
+						mOutputs);
+					postProcessTask->setContinuation(mCont);
+					postProcessTask->removeReference();
+				}
+			}
+		}
+		mThreadContext.contactDescArraySize = PxU32(contactDescPtr - mThreadContext.contactConstraintDescArray);
+		mThreadContext.mContactDescPtr = contactDescPtr;
+	}
+
+	// Runs the four solver start steps in sequence. integrate(), setupDescTask() and
+	// articulationTask() all read mIslandContext.mThreadContext, which startTasks() assigns,
+	// so startTasks() has to come first.
+	virtual void runInternal()
+	{
+		//acquire and fill the island thread context
+		startTasks();
+
+		//pre-integration of the island's bodies
+		integrate();
+
+		//constraint descriptors + contact merging tasks
+		setupDescTask();
+
+		//per-articulation update tasks
+		articulationTask();
+	}
+
+	// Returns the name string of this task.
+	virtual const char* getName() const { return "PxsDynamics.solverStart"; }
+
+private:
+	DynamicsContext&			mContext;				// supplies thread contexts, the task pool, the solver body pools and mDt
+	IslandContext&				mIslandContext;			// receives the thread context and the final contact manager count
+	const SolverIslandObjects	mObjects;				// copy of the island's object arrays, bound to the thread context in startTasks()
+	const PxU32					mSolverBodyOffset;		// offset added to mSolverBodyPool / mSolverBodyDataPool begin()
+	const PxU32					mKinematicCount;		// stored at construction; the methods of this class read mContext.mKinematicCount instead
+	IG::SimpleIslandManager&	mIslandManager;			// source of the island sim, contact managers and constraints
+	PxU32*						mBodyRemapTable;		// passed to setDescFromIndices for constraint edges
+	PxsMaterialManager*			mMaterialManager;		// forwarded to the post-process tasks
+	PxsContactManagerOutputIterator& mOutputs;			// looks up a contact manager output by work unit mNpIndex
+	bool						mEnhancedDeterminism;	// when true, startTasks() sorts the contact managers with EnhancedSortPredicate
+};
+
+class PxsSolverConstraintPartitionTask : public Cm::Task
+{
+	PxsSolverConstraintPartitionTask& operator=(const PxsSolverConstraintPartitionTask&);
+public:
+
+	// Stores the contexts, island objects (copied by value), solver body offset and determinism
+	// option used by runInternal().
+	PxsSolverConstraintPartitionTask(DynamicsContext& context,
+		IslandContext& islandContext,
+		const SolverIslandObjects& objects,
+		const PxU32 solverBodyOffset, bool enhancedDeterminism)
+		: mContext(context)
+		, mIslandContext(islandContext)
+		, mObjects(objects)
+		, mSolverBodyOffset(solverBodyOffset)
+		, mEnhancedDeterminism(enhancedDeterminism)
+	{
+	}
+
+	// Compacts the articulations' internal constraint descriptors so that they follow each other
+	// without gaps starting at mContactDescPtr, adds them to contactDescArraySize, and then
+	// partitions all descriptors with partitionContactConstraints. With no descriptors the
+	// per-partition constraint counts are zeroed instead.
+	virtual void runInternal()
+	{
+		ThreadContext& threadCtx = *mIslandContext.mThreadContext;
+
+		ArticulationSolverDesc* artics = threadCtx.getArticulations().begin();
+
+		if(mIslandContext.mCounts.articulations)
+		{
+			//Articulation a owns the DY_ARTICULATION_MAX_SIZE slots starting at a * DY_ARTICULATION_MAX_SIZE
+			//but only uses numInternalConstraints of them. The first articulation is already in place.
+			PxSolverConstraintDesc* descBase = threadCtx.mContactDescPtr;
+			PxU32 packedCount = artics[0].numInternalConstraints;
+
+			for(PxU32 articIdx = 1; articIdx < mIslandContext.mCounts.articulations; ++articIdx)
+			{
+				const PxSolverConstraintDesc* src = descBase + articIdx * DY_ARTICULATION_MAX_SIZE;
+				for(PxU32 remaining = artics[articIdx].numInternalConstraints; remaining > 0; --remaining)
+				{
+					descBase[packedCount++] = *src++;
+				}
+			}
+
+			threadCtx.contactDescArraySize += packedCount;
+		}
+
+		const PxU32 totalDescs = threadCtx.contactDescArraySize;
+
+		threadCtx.mNumDifferentBodyConstraints = totalDescs;
+
+		//Reset the partitioning results
+		threadCtx.mNumDifferentBodyConstraints = 0;
+		threadCtx.mNumSelfConstraints = 0;
+		threadCtx.mNumSelfConstraintBlocks = 0;
+		threadCtx.mNumDifferentBodyFrictionConstraints = 0;
+		threadCtx.mNumSelfConstraintFrictionBlocks = 0;
+		threadCtx.mNumSelfFrictionConstraints = 0;
+
+		if(totalDescs == 0)
+		{
+			PxMemZero(threadCtx.mConstraintsPerPartition.begin(), sizeof(PxU32)*threadCtx.mConstraintsPerPartition.capacity());
+		}
+		else
+		{
+			ConstraintPartitionArgs args;
+			args.mBodies = mContext.mSolverBodyPool.begin() + mSolverBodyOffset;
+			args.mNumBodies = mIslandContext.mCounts.bodies;
+			args.mArticulationPtrs = artics;
+			args.mNumArticulationPtrs = threadCtx.getArticulations().size();
+			args.mContactConstraintDescriptors = threadCtx.contactConstraintDescArray;
+			args.mNumContactConstraintDescriptors = totalDescs;
+			args.mOrderedContactConstraintDescriptors = threadCtx.orderedContactConstraints;
+			args.mTempContactConstraintDescriptors = threadCtx.tempConstraintDescArray;
+			args.mNumSelfConstraintBlocks = 0;
+			args.mNumSelfConstraints = 0;
+			args.mNumDifferentBodyConstraints = 0;
+			args.mConstraintsPerPartition = &threadCtx.mConstraintsPerPartition;
+			args.mBitField = &threadCtx.mPartitionNormalizationBitmap;
+			args.enhancedDeterminism = mEnhancedDeterminism;
+
+			threadCtx.mMaxPartitions = partitionContactConstraints(args);
+
+			//Copy the counts reported by the partitioner back to the thread context
+			threadCtx.mNumDifferentBodyConstraints = args.mNumDifferentBodyConstraints;
+			threadCtx.mNumSelfConstraints = args.mNumSelfConstraints;
+			threadCtx.mNumSelfConstraintBlocks = args.mNumSelfConstraintBlocks;
+		}
+
+		PX_ASSERT((threadCtx.mNumDifferentBodyConstraints + threadCtx.mNumSelfConstraints) == totalDescs);
+	}
+
+	virtual const char* getName() const { return "PxsDynamics.solverConstraintPartition"; }
+
+	DynamicsContext&			mContext;
+	IslandContext&			mIslandContext;
+	const SolverIslandObjects		mObjects;
+	PxU32						mSolverBodyOffset;
+	bool						mEnhancedDeterminism;
+};
+// Island solver stage: compacts the partition-ordered contact descriptors and their batch headers, builds the
+// separate friction descriptor list unless the friction type is ePATCH, then solves and integrates the island.
+class PxsSolverSetupSolveTask : public Cm::Task
+{
+	PxsSolverSetupSolveTask& operator=(const PxsSolverSetupSolveTask&);
+public:
+	// The constructor only stores the references/values it is given; all work happens in runInternal().
+	PxsSolverSetupSolveTask(
+		DynamicsContext& context,
+		IslandContext& islandContext,
+		const SolverIslandObjects& objects,				  
+		const PxU32 solverBodyOffset,
+		IG::IslandSim& islandSim) :
+		mContext(context), 
+		mIslandContext(islandContext),
+		mObjects(objects),
+		mSolverBodyOffset(solverBodyOffset),
+		mIslandSim(islandSim)
+	{}
+	// Runs the stage: in-place compaction of contact descriptors/headers, optional friction descriptor extraction,
+	// then either a multi-task parallel solve or an inline sequential solve followed by integration and sleep checks.
+	virtual void runInternal()
+	{
+		ThreadContext& mThreadContext = *mIslandContext.mThreadContext;
+		// contactDescPtr is advanced once per kept descriptor in the partition pass; it is only used to derive a count.
+		PxSolverConstraintDesc* contactDescBegin = mThreadContext.orderedContactConstraints;
+		PxSolverConstraintDesc* contactDescPtr = mThreadContext.orderedContactConstraints;
+
+		PxSolverBody* solverBodies = mContext.mSolverBodyPool.begin() + mSolverBodyOffset;
+		PxSolverBodyData* solverBodyDatas = mContext.mSolverBodyDataPool.begin();
+
+		PxU32 frictionDescCount = mThreadContext.mNumDifferentBodyFrictionConstraints;
+
+		PxU32 j = 0, i = 0;
+		// i is the read index and j the write index into contactDescBegin; they diverge once a descriptor is dropped.
+		//On PS3, self-constraints will be bumped to the end of the constraint list
+		//and processed separately. On PC/360, they will be mixed in the array and
+		//classed as "different body" constraints regardless of the fact that they're self-constraints.
+		//PxU32 numBatches = mThreadContext.numDifferentBodyBatchHeaders;
+		// TODO: maybe replace with non-null joints from end of the array
+
+		PxU32 numBatches = 0;
+		// Partition pass: mConstraintsPerPartition[a] is read as the number of batch headers in partition a.
+		PxU32 currIndex = 0;
+		for(PxU32 a = 0; a < mThreadContext.mConstraintsPerPartition.size(); ++a)
+		{
+			PxU32 endIndex = currIndex + mThreadContext.mConstraintsPerPartition[a];
+
+			PxU32 numBatchesInPartition = 0;
+			for(PxU32 b = currIndex; b < endIndex; ++b)
+			{
+				PxConstraintBatchHeader& _header = mThreadContext.contactConstraintBatchHeaders[b];
+				PxU16 stride = _header.mStride, newStride = _header.mStride;
+				PxU32 startIndex = j; // first output slot for this batch's surviving descriptors
+				for(PxU16 c = 0; c < stride; ++c)
+				{
+					if(getConstraintLength(contactDescBegin[i]) == 0)
+					{
+						newStride--; // zero-length descriptor: drop it from the batch
+						i++;
+					}
+					else
+					{
+						if(i!=j)
+							contactDescBegin[j] = contactDescBegin[i];
+						i++;
+						j++;
+						contactDescPtr++;
+					}
+				}
+				// Emit a compacted header (at index numBatches) only if at least one descriptor of the batch survived.
+				if(newStride != 0)
+				{
+					mThreadContext.contactConstraintBatchHeaders[numBatches].mStartIndex = startIndex;
+					mThreadContext.contactConstraintBatchHeaders[numBatches].mStride = newStride;
+					PxU8 type = *contactDescBegin[startIndex].constraint; // type is read from the first byte of the constraint data
+					if(type == DY_SC_TYPE_STATIC_CONTACT)
+					{
+						//Check if any block of constraints is classified as type static (single) contact constraint.
+						//If they are, iterate over all constraints grouped with it and switch to "dynamic" contact constraint
+						//type if there's a dynamic contact constraint in the group.
+						for(PxU32 c = 1; c < newStride; ++c)
+						{
+							if(*contactDescBegin[startIndex+c].constraint == DY_SC_TYPE_RB_CONTACT)
+							{
+								type = DY_SC_TYPE_RB_CONTACT;
+							}
+						}
+					}
+
+					mThreadContext.contactConstraintBatchHeaders[numBatches].mConstraintType = type;
+					numBatches++;
+					numBatchesInPartition++;
+				}
+			}
+			PxU32 numHeaders = numBatchesInPartition;
+			currIndex += mThreadContext.mConstraintsPerPartition[a];
+			mThreadContext.mConstraintsPerPartition[a] = numHeaders; // now holds the surviving header count for this partition
+		}
+
+		PxU32 contactDescCount = PxU32(contactDescPtr - contactDescBegin);
+
+		mThreadContext.mNumDifferentBodyConstraints = contactDescCount;
+
+		PxU32 numSelfConstraintBlocks = mThreadContext.mNumSelfConstraintBlocks;
+
+		//Remap self constraint array. Self-constraint blocks exists on PS3 as an optimization for SPU solver.
+		for(PxU32 a = 0; a < numSelfConstraintBlocks; ++a)
+		{
+			PX_ASSERT(mThreadContext.mSelfConstraintBlocks[a].startId == i);
+			PxU32 origNumSelfConstraints = mThreadContext.mSelfConstraintBlocks[a].numSelfConstraints;
+			PxU32 startId = j;
+
+			for(PxU32 b = 0; b < origNumSelfConstraints; ++b)
+			{
+				PxSolverConstraintDesc& desc = contactDescBegin[i];
+				// Each surviving self constraint gets its own stride-1 batch header.
+				if(getConstraintLength(desc))
+				{
+					PxConstraintBatchHeader& header = mThreadContext.contactConstraintBatchHeaders[numBatches++];
+					header.mStride = 1;
+					header.mStartIndex = j;
+					header.mConstraintType = *desc.constraint;
+					if(i != j)
+						contactDescBegin[j] = contactDescBegin[i];
+					j++;
+				}
+				i++;
+			}
+			mThreadContext.mSelfConstraintBlocks[a].startId = startId;
+			mThreadContext.mSelfConstraintBlocks[a].numSelfConstraints = j - startId;
+		}
+
+		mThreadContext.numContactConstraintBatches = numBatches;
+		mThreadContext.mNumSelfConstraints = j - contactDescCount; //self constraint count
+		contactDescCount = j;
+		mThreadContext.mOrderedContactDescCount = j;
+
+		//Now do the friction constraints if we're not using the sticky model (i.e. the friction type is not ePATCH)
+		if(mContext.getFrictionType() != PxFrictionType::ePATCH)
+		{
+			PxSolverConstraintDesc* frictionDescBegin = mThreadContext.frictionConstraintDescArray.begin();
+			PxSolverConstraintDesc* frictionDescPtr = frictionDescBegin;
+
+			Ps::Array<PxConstraintBatchHeader>& frictionHeaderArray = mThreadContext.frictionConstraintBatchHeaders;
+			frictionHeaderArray.forceSize_Unsafe(0);
+			frictionHeaderArray.reserve(mThreadContext.numContactConstraintBatches);
+			PxConstraintBatchHeader* headers = frictionHeaderArray.begin();
+			// Headers are written through this raw pointer into the reserved storage; the array size is fixed up after the loop.
+			Ps::Array<PxU32>& constraintsPerPartition = mThreadContext.mConstraintsPerPartition;
+			Ps::Array<PxU32>& frictionConstraintsPerPartition = mThreadContext.mFrictionConstraintsPerPartition;
+			frictionConstraintsPerPartition.forceSize_Unsafe(0);
+			frictionConstraintsPerPartition.reserve(constraintsPerPartition.capacity());
+			
+			// fricI accumulates the strides of the friction headers emitted; fricHeaders counts those headers.
+			PxU32 fricI = 0;
+			PxU32 startIndex = 0;
+			PxU32 fricHeaders = 0;
+			for(PxU32 k = 0; k < constraintsPerPartition.size(); ++k)
+			{
+				PxU32 numBatchesInK = constraintsPerPartition[k];
+				PxU32 endIndex = startIndex + numBatchesInK;
+
+				PxU32 startFricH = fricHeaders;
+
+				for(PxU32 a = startIndex; a < endIndex; ++a)
+				{
+					PxConstraintBatchHeader& _header = mThreadContext.contactConstraintBatchHeaders[a];
+					PxU16 stride = _header.mStride;
+					if(_header.mConstraintType == DY_SC_TYPE_RB_CONTACT || _header.mConstraintType == DY_SC_TYPE_EXT_CONTACT || 
+						_header.mConstraintType == DY_SC_TYPE_STATIC_CONTACT)
+					{
+						PxU8 type = 0;
+						//Extract friction from this constraint: the friction data starts frictionOffset bytes into the contact constraint
+						for(PxU16 b = 0; b < stride; ++b)
+						{
+							//create the headers...
+							PxSolverConstraintDesc& desc = contactDescBegin[_header.mStartIndex + b];
+							PX_ASSERT(desc.constraint);
+							SolverContactCoulombHeader* header = reinterpret_cast<SolverContactCoulombHeader*>(desc.constraint);
+							PxU32 frictionOffset = header->frictionOffset;
+							PxU8* PX_RESTRICT constraint =  reinterpret_cast<PxU8*>(header) + frictionOffset;
+							const PxU32 origLength = getConstraintLength(desc);
+							const PxU32 length = (origLength - frictionOffset);
+
+							setConstraintLength(*frictionDescPtr, length);
+							frictionDescPtr->constraint	= constraint;
+							frictionDescPtr->bodyA = desc.bodyA;
+							frictionDescPtr->bodyB = desc.bodyB;
+							frictionDescPtr->bodyADataIndex = desc.bodyADataIndex;
+							frictionDescPtr->bodyBDataIndex = desc.bodyBDataIndex;
+							frictionDescPtr->linkIndexA = desc.linkIndexA;
+							frictionDescPtr->linkIndexB = desc.linkIndexB;
+							frictionDescPtr->writeBack = NULL;
+							frictionDescPtr->writeBackLengthOver4 = 0;
+							type = *constraint; // overwritten per descriptor, so the header gets the last descriptor's friction type
+							frictionDescPtr++;
+						}
+						headers->mStartIndex = fricI;
+						headers->mStride = stride;
+						headers->mConstraintType = type;
+						headers++;
+						fricHeaders++;
+						fricI += stride;
+					}
+					else if(_header.mConstraintType == DY_SC_TYPE_BLOCK_RB_CONTACT || _header.mConstraintType == DY_SC_TYPE_BLOCK_STATIC_RB_CONTACT)
+					{
+						//KS - TODO - Extract block of 4 contacts from this constraint. This isn't implemented yet for coulomb friction model
+						PX_ASSERT(contactDescBegin[_header.mStartIndex].constraint);
+						SolverContactCoulombHeader4* head = reinterpret_cast<SolverContactCoulombHeader4*>(contactDescBegin[_header.mStartIndex].constraint);
+						PxU32 frictionOffset = head->frictionOffset;
+						PxU8* PX_RESTRICT constraint =  reinterpret_cast<PxU8*>(head) + frictionOffset;
+						const PxU32 origLength = getConstraintLength(contactDescBegin[_header.mStartIndex]);
+						const PxU32 length = (origLength - frictionOffset);
+						PxU8 type = *constraint;
+						PX_ASSERT(type == DY_SC_TYPE_BLOCK_FRICTION || type == DY_SC_TYPE_BLOCK_STATIC_FRICTION);
+						for(PxU32 b = 0; b < 4; ++b) // the same friction constraint pointer is written for 4 descriptors, each with its own body info
+						{
+							PxSolverConstraintDesc& desc = contactDescBegin[_header.mStartIndex+b];
+							setConstraintLength(*frictionDescPtr, length);
+							frictionDescPtr->constraint	= constraint;
+							frictionDescPtr->bodyA = desc.bodyA;
+							frictionDescPtr->bodyB = desc.bodyB;
+							frictionDescPtr->bodyADataIndex = desc.bodyADataIndex;
+							frictionDescPtr->bodyBDataIndex = desc.bodyBDataIndex;
+							frictionDescPtr->linkIndexA = desc.linkIndexA;
+							frictionDescPtr->linkIndexB = desc.linkIndexB;
+							frictionDescPtr->writeBack = NULL;
+							frictionDescPtr->writeBackLengthOver4 = 0;
+							frictionDescPtr++;
+						}
+						headers->mStartIndex = fricI;
+						headers->mStride = stride;
+						headers->mConstraintType = type;
+						headers++;
+						fricHeaders++;
+						fricI += stride;
+					}
+				}
+				startIndex += numBatchesInK;
+				if(startFricH < fricHeaders) // only partitions that produced friction headers get an entry
+				{
+					frictionConstraintsPerPartition.pushBack(fricHeaders - startFricH);
+				}
+			}
+		
+
+			frictionDescCount = PxU32(frictionDescPtr - frictionDescBegin);
+			
+			mThreadContext.mNumDifferentBodyFrictionConstraints = frictionDescCount;
+			// Publish how many friction headers were written through the raw pointer above.
+			frictionHeaderArray.forceSize_Unsafe(PxU32(headers - frictionHeaderArray.begin()));
+
+			mThreadContext.mNumSelfFrictionConstraints = fricI - frictionDescCount; //self constraint count
+			mThreadContext.mNumDifferentBodyFrictionConstraints = frictionDescCount;
+			frictionDescCount = fricI;
+			mThreadContext.mOrderedFrictionDescCount = frictionDescCount;
+
+
+		}
+
+		{
+			{
+				PX_PROFILE_ZONE("Dynamics.solver", mContext.getContextId());
+
+				PxSolverConstraintDesc* contactDescs = mThreadContext.orderedContactConstraints;
+				PxSolverConstraintDesc* frictionDescs = mThreadContext.frictionConstraintDescArray.begin();
+
+				PxI32* thresholdPairsOut = &mContext.mThresholdStreamOut;
+				// The parameter block is allocated from the task pool and is handed to any worker tasks created below.
+				SolverIslandParams& params = *reinterpret_cast<SolverIslandParams*>(mContext.getTaskPool().allocate(sizeof(SolverIslandParams)));
+				params.positionIterations = mThreadContext.mMaxSolverPositionIterations;
+				params.velocityIterations = mThreadContext.mMaxSolverVelocityIterations;
+				params.bodyListStart = solverBodies;
+				params.bodyDataList = solverBodyDatas;
+				params.solverBodyOffset = mSolverBodyOffset;
+				params.bodyListSize = mIslandContext.mCounts.bodies;
+				params.articulationListStart = mThreadContext.getArticulations().begin();
+				params.articulationListSize = mThreadContext.getArticulations().size();
+				params.constraintList = contactDescs;
+				params.constraintIndex = 0;
+				params.constraintIndex2 = 0;
+				params.bodyListIndex = 0;
+				params.bodyListIndex2 = 0;
+				params.bodyIntegrationListIndex = 0;
+				params.thresholdStream = mContext.getThresholdStream().begin();
+				params.thresholdStreamLength = mContext.getThresholdStream().size();
+				params.outThresholdPairs = thresholdPairsOut;
+				params.motionVelocityArray = mThreadContext.motionVelocityArray;
+				params.bodyArray = mThreadContext.mBodyCoreArray;
+				params.numObjectsIntegrated = 0;
+				params.constraintBatchHeaders = mThreadContext.contactConstraintBatchHeaders;
+				params.numConstraintHeaders = mThreadContext.numContactConstraintBatches;
+				params.headersPerPartition = mThreadContext.mConstraintsPerPartition.begin();
+				params.nbPartitions = mThreadContext.mConstraintsPerPartition.size();
+				params.rigidBodies = const_cast<PxsRigidBody**>(mObjects.bodies);
+				params.frictionHeadersPerPartition = mThreadContext.mFrictionConstraintsPerPartition.begin();
+				params.nbFrictionPartitions = mThreadContext.mFrictionConstraintsPerPartition.size();
+				params.frictionConstraintBatches = mThreadContext.frictionConstraintBatchHeaders.begin();
+				params.numFrictionConstraintHeaders = mThreadContext.frictionConstraintBatchHeaders.size();
+				params.frictionConstraintIndex = 0;
+				params.frictionConstraintList = frictionDescs;
+				// Task-count heuristic: one task per (mMaxPartitions * unrollSize) batch headers, clamped to [1, dispatcher worker count].
+				const PxU32 unrollSize = 8;
+				const PxU32 denom = PxMax(1u, (mThreadContext.mMaxPartitions*unrollSize));
+				const PxU32 MaxTasks = getTaskManager()->getCpuDispatcher()->getWorkerCount();
+				const PxU32 idealThreads = mThreadContext.numContactConstraintBatches/denom;
+				const PxU32 numTasks = PxMax(1u, PxMin(idealThreads, MaxTasks));
+				
+				if(numTasks > 1)
+				{
+					const PxU32 idealBatchSize = PxMax(unrollSize, idealThreads*unrollSize/(numTasks*2));
+
+					params.batchSize = idealBatchSize; //assigning ideal batch size for the solver to grab work at. Only needed by the multi-threaded island solver.
+					// Create numTasks-1 worker tasks; this thread takes part itself through solveParallel() below.
+					for(PxU32 a = 1; a < numTasks; ++a)
+					{
+						void* tsk = mContext.getTaskPool().allocate(sizeof(PxsParallelSolverTask));
+						PxsParallelSolverTask* pTask = PX_PLACEMENT_NEW(tsk, PxsParallelSolverTask)(
+							params, mContext, mContext.getFrictionType(), mIslandSim);
+
+						//Force to complete before merge task!
+						pTask->setContinuation(mCont);
+						
+						pTask->removeReference();
+					}
+
+					//Avoid kicking off one parallel task when we can do the work inline in this function
+					{						
+						PX_PROFILE_ZONE("Dynamics.parallelSolve", mContext.getContextId());
+
+						solveParallel(mContext, params, mIslandSim);
+					}
+					const PxI32 numBodiesPlusArtics = PxI32( mIslandContext.mCounts.bodies + mIslandContext.mCounts.articulations );
+
+					PxI32* numObjectsIntegrated = &params.numObjectsIntegrated;
+					// NOTE(review): presumably waits until params.numObjectsIntegrated reaches bodies + articulations - confirm macro semantics
+					WAIT_FOR_PROGRESS_NO_TIMER(numObjectsIntegrated, numBodiesPlusArtics);
+
+				}
+				else
+				{
+					
+					//Only one task - a small island so do a sequential solve (avoid the atomic overheads)
+					solveVBlock(mContext.mSolverCore[mContext.getFrictionType()], params);
+					// Then integrate, write back and sleep-check every rigid body of the island on this thread.
+					const PxU32 bodyCountMin1 = mIslandContext.mCounts.bodies - 1u;
+					PxSolverBodyData* solverBodyData2 = solverBodyDatas + mSolverBodyOffset + 1; // +1 skips slot 0 of the data pool, which DynamicsContext::update fills with the world body data
+					for(PxU32 k=0; k < mIslandContext.mCounts.bodies; k++)
+					{
+						const PxU32 prefetchAddress = PxMin(k+4, bodyCountMin1); // clamp the look-ahead index to the last body
+						Ps::prefetchLine(mThreadContext.mBodyCoreArray[prefetchAddress]);
+						Ps::prefetchLine(&mThreadContext.motionVelocityArray[k], 128);
+						Ps::prefetchLine(&mThreadContext.mBodyCoreArray[prefetchAddress], 128);
+						Ps::prefetchLine(&mObjects.bodies[prefetchAddress]);
+
+						PxSolverBodyData& solverBodyData = solverBodyData2[k];
+
+						integrateCore(mThreadContext.motionVelocityArray[k].linear, mThreadContext.motionVelocityArray[k].angular,
+							solverBodies[k], solverBodyData, mContext.mDt);
+						// Copy the solver body data back into the body core, keeping the previous pose in mLastTransform.
+						PxsRigidBody& rBody = *mObjects.bodies[k];
+						PxsBodyCore& core = rBody.getCore();
+						rBody.mLastTransform = core.body2World;
+						core.body2World = solverBodyData.body2World;
+						core.linearVelocity = solverBodyData.linearVelocity;
+						core.angularVelocity = solverBodyData.angularVelocity;
+
+
+						bool hasStaticTouch = mIslandSim.getIslandStaticTouchCount(IG::NodeIndex(solverBodyData.nodeIndex)) != 0;
+						sleepCheck(const_cast<PxsRigidBody*>(mObjects.bodies[k]), mContext.mDt, mContext.mInvDt, mContext.mEnableStabilization, mContext.mUseAdaptiveForce, mThreadContext.motionVelocityArray[k],
+							hasStaticTouch);
+					}
+					// Articulations are updated after the rigid bodies, one updateBodies() call each.
+					for(PxU32 cnt=0;cnt<mIslandContext.mCounts.articulations;cnt++)
+					{
+						ArticulationSolverDesc &d = mThreadContext.getArticulations()[cnt];
+						PX_PROFILE_ZONE("Articulations.integrate", mContext.getContextId());
+
+						ArticulationPImpl::updateBodies(d, mContext.getDt());
+					}
+				}
+			}
+		}
+	}
+
+	virtual const char* getName() const { return "PxsDynamics.solverSetupSolve"; }
+
+	DynamicsContext&			mContext;
+	IslandContext&				mIslandContext;
+	const SolverIslandObjects	mObjects;
+	PxU32						mSolverBodyOffset;
+	IG::IslandSim&				mIslandSim;
+};
+
+// Last stage of an island's solver chain. For every compound contact manager it restores the original
+// contact data on the manager's output, propagates friction data to the other managers of the group and
+// copies the solved contact forces back to the individual managers. It then clears the per-island
+// scratch state and hands the thread context back through DynamicsContext::putThreadContext().
+class PxsSolverEndTask : public Cm::Task
+{
+	// Copy-assignment is declared private, so the task cannot be assigned from outside the class.
+	PxsSolverEndTask& operator=(const PxsSolverEndTask&);
+
+public:
+	// Captures the references/values needed later; no work is done until runInternal().
+	PxsSolverEndTask(DynamicsContext& context, IslandContext& islandContext,
+		const SolverIslandObjects& objects, const PxU32 solverBodyOffset,
+		PxsContactManagerOutputIterator& cmOutputs)
+		: mContext(context)
+		, mIslandContext(islandContext)
+		, mObjects(objects)
+		, mSolverBodyOffset(solverBodyOffset)
+		, mOutputs(cmOutputs)
+	{
+	}
+
+	virtual void runInternal()
+	{
+		ThreadContext& threadContext = *mIslandContext.mThreadContext;
+#if PX_ENABLE_SIM_STATS
+		threadContext.getSimStats().numAxisSolverConstraints += threadContext.mAxisConstraintCount;
+#endif
+		// Patch up the contact managers (TODO - fix up force writeback)
+		const PxU32 compoundCount = threadContext.compoundConstraints.size();
+		for(PxU32 compoundIdx = 0; compoundIdx < compoundCount; ++compoundIdx)
+		{
+			CompoundContactManager& compound = threadContext.compoundConstraints[compoundIdx];
+			PxsContactManagerOutput* const compoundOutput = compound.cmOutput;
+
+			// Take the force buffer and contact count currently on the output before they are overwritten.
+			PxReal* const solvedForces = reinterpret_cast<PxReal*>(compoundOutput->contactForces);
+			const PxU32 solvedContactCount = compoundOutput->nbContacts;
+
+			compoundOutput->contactPatches = compound.originalContactPatches;
+			compoundOutput->contactPoints = compound.originalContactPoints;
+			compoundOutput->nbContacts = compound.originalContactCount;
+			compoundOutput->nbPatches = compound.originalPatchCount;
+			compoundOutput->statusFlag = compound.originalStatusFlags;
+			compoundOutput->contactForces = compound.originalForceBuffer;
+
+			// Every manager after the first one in the group receives the compound unit's friction data.
+			for(PxU32 member = 1; member < compound.mStride; ++member)
+			{
+				PxsContactManager* const memberCm = threadContext.orderedContactList[compound.mStartIndex + member]->contactManager;
+				memberCm->getWorkUnit().frictionDataPtr = compound.unit->frictionDataPtr;
+				memberCm->getWorkUnit().frictionPatchCount = compound.unit->frictionPatchCount;
+			}
+
+			if(!solvedForces)
+				continue;
+
+			// Stride-based contact force writer. forceBufferList[n] is the position of the n-th solved
+			// contact within the concatenated contacts of the group's managers; contacts in between were
+			// not solved and are stepped over.
+			PxU32 flatContactIdx = 0;						// position in the concatenated contact list
+			PxU32 managerIdx = compound.mStartIndex;		// manager currently being written to
+			PxU32 localContactIdx = 0;						// position within that manager's contacts
+
+			for(PxU32 forceIdx = 0; forceIdx < solvedContactCount; ++forceIdx)
+			{
+				const PxU32 targetIdx = compound.forceBufferList[forceIdx];
+				PxsContactManager* cm = threadContext.orderedContactList[managerIdx]->contactManager;
+				PxsContactManagerOutput* output = &mOutputs.getContactManager(cm->getWorkUnit().mNpIndex);
+
+				while(flatContactIdx < targetIdx || output->nbContacts == 0)
+				{
+					// Advance inside the current manager first...
+					const PxU32 step = PxMin(targetIdx - flatContactIdx, PxU32(output->nbContacts) - localContactIdx);
+					flatContactIdx += step;
+					localContactIdx += step;
+
+					// ...and move on to the next manager once this one is exhausted.
+					if(localContactIdx == output->nbContacts)
+					{
+						managerIdx++;
+						localContactIdx = 0;
+						cm = threadContext.orderedContactList[managerIdx]->contactManager;
+						output = &mOutputs.getContactManager(cm->getWorkUnit().mNpIndex);
+					}
+				}
+
+				if(output->nbContacts > 0 && output->contactForces)
+					output->contactForces[localContactIdx] = solvedForces[forceIdx];
+			}
+		}
+
+		threadContext.compoundConstraints.forceSize_Unsafe(0);
+
+		threadContext.mConstraintBlockManager.reset();
+
+		mContext.putThreadContext(&threadContext);
+	}
+
+	// Name reported for this task.
+	virtual const char* getName() const
+	{
+		return "PxsDynamics.solverEnd";
+	}
+
+	DynamicsContext&					mContext;	
+	IslandContext&						mIslandContext;
+	const SolverIslandObjects			mObjects;
+	const PxU32							mSolverBodyOffset;
+	PxsContactManagerOutputIterator&	mOutputs;
+};
+
+// Solver task of the island chain whose runInternal() is only declared here and defined out of line.
+// The class itself just stores the contexts, the solver data offset, the contact manager output
+// iterator and the enhanced-determinism flag it is constructed with.
+class PxsSolverCreateFinalizeConstraintsTask : public Cm::Task
+{
+	// Copy-assignment is declared private, so the task cannot be assigned from outside the class.
+	PxsSolverCreateFinalizeConstraintsTask& operator=(const PxsSolverCreateFinalizeConstraintsTask&);
+
+public:
+	PxsSolverCreateFinalizeConstraintsTask(DynamicsContext& context, IslandContext& islandContext,
+		PxU32 solverDataOffset, PxsContactManagerOutputIterator& outputs, bool enhancedDeterminism)
+		: mContext(context)
+		, mIslandContext(islandContext)
+		, mSolverDataOffset(solverDataOffset)
+		, mOutputs(outputs)
+		, mEnhancedDeterminism(enhancedDeterminism)
+	{
+	}
+
+	// Defined outside the class body.
+	virtual void runInternal();
+
+	// Name reported for this task.
+	virtual const char* getName() const
+	{
+		return "PxsDynamics.solverCreateFinalizeConstraints";
+	}
+
+	DynamicsContext&					mContext;
+	IslandContext&						mIslandContext;
+	PxU32								mSolverDataOffset;
+	PxsContactManagerOutputIterator&	mOutputs;
+	bool								mEnhancedDeterminism;
+};
+
+
+// Helper that joins two tasks: makes `next` the continuation of `first`, then calls removeReference() on `next` so the ref counts stay correct.
+void chainTasks(PxLightCpuTask* first, PxLightCpuTask* next)
+{
+	first->setContinuation(next);
+	next->removeReference();
+}
+
+// Builds the solver task chain for one island and returns its first task. All objects are allocated
+// from the dynamics context's task pool while the pool lock is held. The tasks are linked through
+// continuations in the order start -> partition -> createFinalizeConstraints -> setupSolve -> end,
+// and the end task's continuation is `continuation`.
+PxBaseTask* createSolverTaskChain(DynamicsContext& dynamicContext,
+										const SolverIslandObjects& objects,
+										const PxsIslandIndices& counts,
+										const PxU32 solverBodyOffset,
+										IG::SimpleIslandManager& islandManager,
+										PxU32* bodyRemapTable, PxsMaterialManager* materialManager, PxBaseTask* continuation,
+										PxsContactManagerOutputIterator& iterator, bool useEnhancedDeterminism)
+{
+	Cm::FlushPool& pool = dynamicContext.getTaskPool();
+
+	pool.lock();
+
+	// State shared by all tasks of this island; the thread context pointer starts out as NULL.
+	// NOTE(review): this one allocation goes through allocate() rather than allocateNotThreadSafe()
+	// even though the pool lock is already held - presumably the pool's mutex is re-entrant; confirm.
+	void* const islandMem = pool.allocate(sizeof(IslandContext));
+	IslandContext* const island = reinterpret_cast<IslandContext*>(islandMem);
+	island->mThreadContext = NULL;
+	island->mCounts = counts;
+
+	// Lead task of the chain.
+	void* const startMem = pool.allocateNotThreadSafe(sizeof(PxsSolverStartTask));
+	PxsSolverStartTask* const startTask = PX_PLACEMENT_NEW(startMem, PxsSolverStartTask)(
+		dynamicContext, *island, objects, solverBodyOffset, dynamicContext.getKinematicCount(),
+		islandManager, bodyRemapTable, materialManager, iterator, useEnhancedDeterminism);
+
+	void* const endMem = pool.allocateNotThreadSafe(sizeof(PxsSolverEndTask));
+	PxsSolverEndTask* const endTask = PX_PLACEMENT_NEW(endMem, PxsSolverEndTask)(
+		dynamicContext, *island, objects, solverBodyOffset, iterator);
+
+	void* const createFinalizeMem = pool.allocateNotThreadSafe(sizeof(PxsSolverCreateFinalizeConstraintsTask));
+	PxsSolverCreateFinalizeConstraintsTask* const createFinalizeConstraintsTask =
+		PX_PLACEMENT_NEW(createFinalizeMem, PxsSolverCreateFinalizeConstraintsTask)(
+			dynamicContext, *island, solverBodyOffset, iterator, useEnhancedDeterminism);
+
+	void* const setupSolveMem = pool.allocateNotThreadSafe(sizeof(PxsSolverSetupSolveTask));
+	PxsSolverSetupSolveTask* const setupSolveTask = PX_PLACEMENT_NEW(setupSolveMem, PxsSolverSetupSolveTask)(
+		dynamicContext, *island, objects, solverBodyOffset, islandManager.getAccurateIslandSim());
+
+	void* const partitionMem = pool.allocateNotThreadSafe(sizeof(PxsSolverConstraintPartitionTask));
+	PxsSolverConstraintPartitionTask* const partitionConstraintsTask =
+		PX_PLACEMENT_NEW(partitionMem, PxsSolverConstraintPartitionTask)(
+			dynamicContext, *island, objects, solverBodyOffset, useEnhancedDeterminism);
+
+	endTask->setContinuation(continuation);
+
+	// Link the stages back to front, so each continuation is already set up when it is chained to.
+	chainTasks(setupSolveTask, endTask);
+	chainTasks(createFinalizeConstraintsTask, setupSolveTask);
+	chainTasks(partitionConstraintsTask, createFinalizeConstraintsTask);
+	chainTasks(startTask, partitionConstraintsTask);
+
+	pool.unlock();
+
+	return startTask;
+}
+
+
+void DynamicsContext::update(IG::SimpleIslandManager& simpleIslandManager, PxBaseTask* /*continuation*/, PxBaseTask* lostTouchTask,
+							 PxsContactManager** /*foundPatchManagers*/, PxU32 /*nbFoundPatchManagers*/, 
+							 PxsContactManager** /*lostPatchManagers*/, PxU32 /*nbLostPatchManagers*/,
+							 PxU32 /*maxPatchesPerCM*/,
+							 PxsContactManagerOutputIterator& iterator,
+							 PxsContactManagerOutput*,
+							 const PxReal dt, const PxVec3& gravity, const PxU32 /*bitMapWordCounts*/)
+{		
+	PX_PROFILE_ZONE("Dynamics.solverQueueTasks", mContextID);
+
+	PX_UNUSED(simpleIslandManager);
+
+	mOutputIterator = iterator;
+
+	mDt = dt;
+	mInvDt = dt == 0.0f ? 0.0f : 1.0f/dt;
+	mGravity = gravity;
+
+	const IG::IslandSim& islandSim = simpleIslandManager.getAccurateIslandSim();
+
+	const PxU32 islandCount = islandSim.getNbActiveIslands();
+
+	const PxU32 activatedContactCount = islandSim.getNbActivatedEdges(IG::Edge::eCONTACT_MANAGER);
+	const IG::EdgeIndex* const activatingEdges = islandSim.getActivatedEdges(IG::Edge::eCONTACT_MANAGER);
+
+	for(PxU32 a = 0; a < activatedContactCount; ++a)
+	{
+		PxsContactManager* cm = simpleIslandManager.getContactManager(activatingEdges[a]);
+		if(cm)
+		{
+			cm->getWorkUnit().frictionPatchCount = 0; //KS - zero the friction patch count on any activating edges
+		}
+	}
+
+#if PX_ENABLE_SIM_STATS
+	if(islandCount > 0)
+	{
+		mSimStats.mNbActiveKinematicBodies = islandSim.getNbActiveKinematics();
+		mSimStats.mNbActiveDynamicBodies = islandSim.getNbActiveNodes(IG::Node::eRIGID_BODY_TYPE);	
+		mSimStats.mNbActiveConstraints = islandSim.getNbActiveEdges(IG::Edge::eCONSTRAINT);
+	}
+	else
+	{
+		mSimStats.mNbActiveKinematicBodies = islandSim.getNbActiveKinematics();
+		mSimStats.mNbActiveDynamicBodies = 0;	
+		mSimStats.mNbActiveConstraints = 0;
+	}
+#endif
+
+	mThresholdStreamOut = 0;
+
+	resetThreadContexts();
+
+	//If there is no work to do then we can do nothing at all.
+	if(0 == islandCount)
+	{
+		return;
+	}
+
+	//KS - test that world solver body's velocities are finite and 0, then set it to 0.
+	//Technically, the velocity should always be 0 but can be stomped if a NAN creeps into the simulation.
+	PX_ASSERT(mWorldSolverBody.linearVelocity == PxVec3(0.f));
+	PX_ASSERT(mWorldSolverBody.angularState == PxVec3(0.f));
+	PX_ASSERT(mWorldSolverBody.linearVelocity.isFinite());
+	PX_ASSERT(mWorldSolverBody.angularState.isFinite());
+
+	mWorldSolverBody.linearVelocity = mWorldSolverBody.angularState = PxVec3(0.f);
+
+	const PxU32 kinematicCount = islandSim.getNbActiveKinematics();
+	const IG::NodeIndex* const kinematicIndices = islandSim.getActiveKinematics();
+	mKinematicCount = kinematicCount;
+
+	const PxU32 bodyCount = islandSim.getNbActiveNodes(IG::Node::eRIGID_BODY_TYPE);
+
+	PxU32 numArtics = islandSim.getNbActiveNodes(IG::Node::eARTICULATION_TYPE);
+
+	{
+		if(kinematicCount + bodyCount > mSolverBodyPool.capacity())
+		{
+			mSolverBodyPool.reserve((kinematicCount + bodyCount + 31) & ~31); // pad out to 32 * 128 = 4k to prevent alloc churn
+			mSolverBodyDataPool.reserve((kinematicCount + bodyCount + 31 + 1) & ~31); // pad out to 32 * 128 = 4k to prevent alloc churn
+			mSolverBodyRemapTable.reserve((kinematicCount + bodyCount + 31 + 1) & ~31);
+		}
+
+		{
+			PxSolverBody emptySolverBody;
+			PxMemZero(&emptySolverBody, sizeof(PxSolverBody));
+			mSolverBodyPool.resize(kinematicCount + bodyCount, emptySolverBody);
+			PxSolverBodyData emptySolverBodyData;
+			PxMemZero(&emptySolverBodyData, sizeof(PxSolverBodyData));
+			mSolverBodyDataPool.resize(kinematicCount + bodyCount + 1, emptySolverBodyData);
+			mSolverBodyRemapTable.resize(bodyCount);
+		}
+
+		// integrate and copy all the kinematics - overkill, since not all kinematics
+		// need solver bodies
+
+		mSolverBodyDataPool[0] = mWorldSolverBodyData;
+
+
+		{			
+			PX_PROFILE_ZONE("Dynamics.updateKinematics", mContextID);
+			PxMemZero(mSolverBodyPool.begin(), kinematicCount*sizeof(PxSolverBody));
+			for(PxU32 i=0;i<kinematicCount;i++)
+			{
+				PxsRigidBody* rigidBody = islandSim.getRigidBody(kinematicIndices[i]);
+				const PxsBodyCore& core = rigidBody->getCore();
+				copyToSolverBodyData(core.linearVelocity, core.angularVelocity, core.inverseMass, core.inverseInertia, core.body2World, core.maxPenBias, 
+					core.maxContactImpulse, kinematicIndices[i].index(), core.contactReportThreshold, mSolverBodyDataPool[i + 1], core.lockFlags);
+				rigidBody->saveLastCCDTransform();
+				// Only really necessary for PS3 at the moment (for the cross island parallel constraint solver
+				// but we might switch to the same on other platforms)
+				mSolverBodyPool[i].solverProgress=MAX_PERMITTED_SOLVER_PROGRESS;
+				mSolverBodyPool[i].maxSolverNormalProgress=MAX_PERMITTED_SOLVER_PROGRESS;
+				mSolverBodyPool[i].maxSolverFrictionProgress=MAX_PERMITTED_SOLVER_PROGRESS;
+			}
+		}
+	}
+
+	PxU32 solverBatchMax = mSolverBatchSize;
+	PxU32 articulationBatchMax = 2;
+	PxU32 minimumConstraintCount = 1;
+
+
+	//Resize arrays of solver constraints...
+	PxU32 numArticulationConstraints=numArtics*Dy::DY_ARTICULATION_MAX_SIZE; //Just allocate enough memory to fit worst-case maximum size articulations...
+
+	const PxU32 nbActiveContactManagers = islandSim.getNbActiveEdges(IG::Edge::eCONTACT_MANAGER);
+	const PxU32 nbActiveConstraints = islandSim.getNbActiveEdges(IG::Edge::eCONSTRAINT);
+
+	PxU32 totalConstraintCount = nbActiveConstraints + nbActiveContactManagers + numArticulationConstraints;
+
+	mSolverConstraintDescPool.forceSize_Unsafe(0);
+	mSolverConstraintDescPool.reserve((totalConstraintCount + 63) & (~63));
+	mSolverConstraintDescPool.forceSize_Unsafe(totalConstraintCount);
+
+	mOrderedSolverConstraintDescPool.forceSize_Unsafe(0);
+	mOrderedSolverConstraintDescPool.reserve((totalConstraintCount + 63) & (~63));
+	mOrderedSolverConstraintDescPool.forceSize_Unsafe(totalConstraintCount);
+
+	mTempSolverConstraintDescPool.forceSize_Unsafe(0);
+	mTempSolverConstraintDescPool.reserve((totalConstraintCount + 63) & (~63));
+	mTempSolverConstraintDescPool.forceSize_Unsafe(totalConstraintCount);
+
+	mContactConstraintBatchHeaders.forceSize_Unsafe(0);
+	mContactConstraintBatchHeaders.reserve((totalConstraintCount + 63) & (~63));
+	mContactConstraintBatchHeaders.forceSize_Unsafe(totalConstraintCount);
+
+	mContactList.forceSize_Unsafe(0);
+	mContactList.reserve((nbActiveContactManagers +63u) & (~63u));
+	mContactList.forceSize_Unsafe(nbActiveContactManagers);
+
+	mMotionVelocityArray.forceSize_Unsafe(0);
+	mMotionVelocityArray.reserve((bodyCount + 63u) & (~63u));
+	mMotionVelocityArray.forceSize_Unsafe(bodyCount);
+
+	mBodyCoreArray.forceSize_Unsafe(0);
+	mBodyCoreArray.reserve((bodyCount + 63u) & (~63u));
+	mBodyCoreArray.forceSize_Unsafe(bodyCount);
+
+	mRigidBodyArray.forceSize_Unsafe(0);
+	mRigidBodyArray.reserve((bodyCount + 63u) & (~63u));
+	mRigidBodyArray.forceSize_Unsafe(bodyCount);
+
+	mArticulationArray.forceSize_Unsafe(0);
+	mArticulationArray.reserve((numArtics + 63u) & (~63u));
+	mArticulationArray.forceSize_Unsafe(numArtics);
+
+	mNodeIndexArray.forceSize_Unsafe(0);
+	mNodeIndexArray.reserve((bodyCount + 63u) & (~63u));
+	mNodeIndexArray.forceSize_Unsafe(bodyCount);
+
+
+	ThresholdStream& stream = getThresholdStream();
+	stream.forceSize_Unsafe(0);
+	stream.reserve(Ps::nextPowerOfTwo(nbActiveContactManagers != 0 ? nbActiveContactManagers-1 : nbActiveContactManagers));
+
+	PxU32 constraintIndex = 0;
+
+	//flip exceeded force threshold buffer
+	mCurrentIndex = 1 - mCurrentIndex;
+
+	//create force threshold tasks to produce force change events
+	PxsForceThresholdTask* forceThresholdTask =  PX_PLACEMENT_NEW(getTaskPool().allocateNotThreadSafe(sizeof(PxsForceThresholdTask)), PxsForceThresholdTask)(*this);
+	forceThresholdTask->setContinuation(lostTouchTask);
+
+	const IG::IslandId*const islandIds = islandSim.getActiveIslands();
+
+	PxU32 currentIsland = 0;
+	PxU32 currentBodyIndex = 0;
+	PxU32 currentArticulation = 0;
+	PxU32 currentContact = 0;
+	//while(start<sentinel)
+	while(currentIsland < islandCount)
+	{
+		SolverIslandObjects objectStarts;
+		objectStarts.articulations				= mArticulationArray.begin()+ currentArticulation;
+		objectStarts.bodies						= mRigidBodyArray.begin()	+ currentBodyIndex;
+		objectStarts.contactManagers			= mContactList.begin()	+ currentContact;
+		objectStarts.constraintDescs			= mSolverConstraintDescPool.begin() + constraintIndex;
+		objectStarts.orderedConstraintDescs		= mOrderedSolverConstraintDescPool.begin() + constraintIndex;
+		objectStarts.tempConstraintDescs		= mTempSolverConstraintDescPool.begin() + constraintIndex;
+		objectStarts.constraintBatchHeaders		= mContactConstraintBatchHeaders.begin() + constraintIndex;
+		objectStarts.motionVelocities			= mMotionVelocityArray.begin() + currentBodyIndex;
+		objectStarts.bodyCoreArray				= mBodyCoreArray.begin() + currentBodyIndex;
+		objectStarts.islandIds					= islandIds + currentIsland;
+		objectStarts.bodyRemapTable				= mSolverBodyRemapTable.begin();
+		objectStarts.nodeIndexArray				= mNodeIndexArray.begin() + currentBodyIndex;
+
+		PxU32 startIsland = currentIsland;
+		PxU32 constraintCount = 0;
+
+		PxU32 nbArticulations = 0;
+		PxU32 nbBodies = 0;
+		PxU32 nbConstraints = 0;
+		PxU32 nbContactManagers =0;
+
+		//KS - logic is a bit funky here. We will keep rolling the island together provided currentIsland < islandCount AND either we haven't exceeded the max number of bodies or we have
+		//zero constraints AND we haven't exceeded articulation batch counts (it's still currently beneficial to keep articulations in separate islands but this is only temporary).
+		while((currentIsland < islandCount && (nbBodies < solverBatchMax || constraintCount < minimumConstraintCount)) && nbArticulations < articulationBatchMax)
+		{
+			const IG::Island& island = islandSim.getIsland(islandIds[currentIsland]);
+			nbBodies += island.mSize[IG::Node::eRIGID_BODY_TYPE];
+			nbArticulations += island.mSize[IG::Node::eARTICULATION_TYPE];
+			nbConstraints += island.mEdgeCount[IG::Edge::eCONSTRAINT];
+			nbContactManagers += island.mEdgeCount[IG::Edge::eCONTACT_MANAGER];
+			constraintCount = nbConstraints + nbContactManagers;
+			currentIsland++;
+		}
+
+		
+		objectStarts.numIslands = currentIsland - startIsland;
+
+		constraintIndex += nbArticulations*Dy::DY_ARTICULATION_MAX_SIZE;
+
+		PxsIslandIndices counts;
+		
+		counts.articulations	= nbArticulations;
+		counts.bodies			= nbBodies;
+
+		counts.constraints		= nbConstraints;
+		counts.contactManagers	= nbContactManagers;
+		if(counts.articulations + counts.bodies > 0)
+		{
+			PxBaseTask* task = createSolverTaskChain(*this, objectStarts, counts, 
+				kinematicCount + currentBodyIndex, simpleIslandManager, mSolverBodyRemapTable.begin(), mMaterialManager, forceThresholdTask, mOutputIterator, mUseEnhancedDeterminism);		
+			task->removeReference();
+		}
+
+		currentBodyIndex += nbBodies;
+		currentArticulation += nbArticulations;
+		currentContact += nbContactManagers;
+
+		constraintIndex += constraintCount;
+	}
+
+	//kick off forceThresholdTask
+	forceThresholdTask->removeReference();
+}
+
+void DynamicsContext::updateBodyCore(PxBaseTask* continuation)
+{	// This implementation performs no work; the continuation task is only referenced to mark it as used.
+	static_cast<void>(continuation);
+}
+
+void DynamicsContext::mergeResults()
+{
+	PX_PROFILE_ZONE("Dynamics.solverMergeResults", mContextID);
+
+#if PX_ENABLE_SIM_STATS
+	// Visit every thread context reachable through the pool iterator, add its
+	// simulation statistics to this context via addThreadStats(), then clear them.
+	// When PX_ENABLE_SIM_STATS is disabled only the profile zone above remains.
+	PxcThreadCoherentCacheIterator<ThreadContext, PxcNpMemBlockPool> contextIter(mThreadContextPool);
+
+	for(ThreadContext* ctx = contextIter.getNext(); ctx != NULL; ctx = contextIter.getNext())
+	{
+		ThreadContext::ThreadSimStats& stats = ctx->getSimStats();
+		addThreadStats(stats);
+		stats.clear();
+	}
+#endif
+}
+
+
+// Integrates one body on behalf of preIntegrationParallel(): folds its packed solver iteration counts into the running
+// maxima, updates its velocities, copies the result into solverBodyData and zeroes solverBody's progress counters.
+static inline void preIntegrateSingleBody(const PxF32 dt, const PxVec3& gravity, PxsBodyCore& core, const PxsRigidBody& rBody,
+	const PxU32 nodeIndex, PxSolverBody& solverBody, PxSolverBodyData& solverBodyData, PxU32& maxPosIter, PxU32& maxVelIter)
+{
+	// Low byte: position iterations. High byte: velocity iterations.
+	const PxU16 iterWord = core.solverIterationCounts;
+	maxPosIter = PxMax<PxU32>(PxU32(iterWord & 0xff), maxPosIter);
+	maxVelIter = PxMax<PxU32>(PxU32(iterWord >> 8), maxVelIter);
+
+	bodyCoreComputeUnconstrainedVelocity(gravity, dt, core.linearDamping, core.angularDamping, rBody.accelScale, core.maxLinearVelocitySq, core.maxAngularVelocitySq,
+		core.linearVelocity, core.angularVelocity, !!(rBody.mInternalFlags & PxcRigidBody::eDISABLE_GRAVITY));
+
+	copyToSolverBodyData(core.linearVelocity, core.angularVelocity, core.inverseMass, core.inverseInertia, core.body2World, core.maxPenBias, core.maxContactImpulse, nodeIndex,
+		core.contactReportThreshold, solverBodyData, core.lockFlags);
+	solverBody.solverProgress = 0;
+	solverBody.maxSolverNormalProgress = 0;
+	solverBody.maxSolverFrictionProgress = 0;
+}
+
+// Pre-integrates bodyCount bodies. Body i is read from bodyArray[i]/originalBodyArray[i]/nodeIndexArray[i] and written to
+// solverBodyPool[i] and solverBodyDataPool[i + 1]. The largest position/velocity iteration counts seen are merged into
+// *maxSolverPositionIterations / *maxSolverVelocityIterations with atomicMax. An empty range (bodyCount == 0) is a no-op.
+static void preIntegrationParallel(
+   const PxF32 dt,
+   PxsBodyCore*const* bodyArray,					// INOUT: core body attributes
+   PxsRigidBody*const* originalBodyArray,			// IN: original bodies (accelScale and mInternalFlags are read)
+   PxU32 const* nodeIndexArray,					// IN: island node index
+   PxU32 bodyCount,									// IN: body count
+   PxSolverBody* solverBodyPool,					// IN: solver body pool (space preallocated)
+   PxSolverBodyData* solverBodyDataPool,			// IN: solver body data pool (space preallocated)
+   volatile PxU32* maxSolverPositionIterations,
+   volatile PxU32* maxSolverVelocityIterations,
+   const PxVec3& gravity)
+{
+	// Guard the empty range: "bodyCount - 1" below would otherwise wrap around and index far out of bounds.
+	if(bodyCount == 0)
+		return;
+
+	PxU32 localMaxPosIter = 0;
+	PxU32 localMaxVelIter = 0;
+
+	// Integrate body a-1 while prefetching the data of body a.
+	for(PxU32 a = 1; a < bodyCount; ++a)
+	{
+		const PxU32 i = a-1;
+		Ps::prefetchLine(bodyArray[a]);
+		Ps::prefetchLine(bodyArray[a],128);
+		Ps::prefetchLine(&solverBodyDataPool[a]);
+		Ps::prefetchLine(&solverBodyDataPool[a],128);
+		preIntegrateSingleBody(dt, gravity, *bodyArray[i], *originalBodyArray[i], nodeIndexArray[i],
+			solverBodyPool[i], solverBodyDataPool[i + 1], localMaxPosIter, localMaxVelIter);
+	}
+	// The last body has no successor to prefetch.
+	const PxU32 last = bodyCount - 1;
+	preIntegrateSingleBody(dt, gravity, *bodyArray[last], *originalBodyArray[last], nodeIndexArray[last],
+		solverBodyPool[last], solverBodyDataPool[last + 1], localMaxPosIter, localMaxVelIter);
+
+	physx::shdfnd::atomicMax(reinterpret_cast<volatile PxI32*>(maxSolverPositionIterations), PxI32(localMaxPosIter));
+	physx::shdfnd::atomicMax(reinterpret_cast<volatile PxI32*>(maxSolverVelocityIterations), PxI32(localMaxVelIter));
+}
+
+
+void PxsPreIntegrateTask::runInternal()
+{
+	// Every per-body array is offset by mStartIndex so the kernel only touches this task's slice of mNumToIntegrate bodies.
+	const PxU32 first = mStartIndex;
+	preIntegrationParallel(mDt, mBodyArray + first, mOriginalBodyArray + first, mNodeIndexArray + first,
+		mNumToIntegrate, mSolverBodies + first, mSolverBodyDataPool + first,
+		mMaxSolverPositionIterations, mMaxSolverVelocityIterations, mGravity);
+}
+
+void DynamicsContext::preIntegrationParallel(
+   const PxF32 dt,
+   PxsBodyCore*const* bodyArray,					// INOUT: core body attributes
+   PxsRigidBody*const* originalBodyArray,			// IN: original bodies, forwarded to the tasks
+   PxU32 const* nodeIndexArray,						// IN: island node index
+   PxU32 bodyCount,									// IN: body count
+   PxSolverBody* solverBodyPool,					// IN: solver body pool (space preallocated)
+   PxSolverBodyData* solverBodyDataPool,			// IN: solver body data pool (space preallocated)
+   Cm::SpatialVector* /*motionVelocityArray*/,			// unused by this function
+   PxU32& maxSolverPositionIterations,				// address handed to every task
+   PxU32& maxSolverVelocityIterations,				// address handed to every task
+   PxBaseTask& task									// continuation of every spawned task
+   )
+{
+	//TODO - make this based on some variables so we can try different configurations
+	const PxU32 bodiesPerTask = 256;		// bodies handled by one PxsPreIntegrateTask
+	const PxU32 tasksPerAllocation = 64;	// task objects requested from the task pool at a time
+	const PxU32 totalTasks = (bodyCount + bodiesPerTask-1)/bodiesPerTask;
+
+	PxsPreIntegrateTask* taskBlock = NULL;
+	for(PxU32 taskIndex = 0; taskIndex < totalTasks; ++taskIndex)
+	{
+		const PxU32 slot = taskIndex % tasksPerAllocation;
+		if(slot == 0)	// start of a new batch: get storage for the next (up to tasksPerAllocation) tasks
+			taskBlock = reinterpret_cast<PxsPreIntegrateTask*>(getTaskPool().allocate(sizeof(PxsPreIntegrateTask)*PxMin(totalTasks - taskIndex, tasksPerAllocation)));
+
+		PxU32 firstBody = taskIndex*bodiesPerTask;
+		PxU32 bodiesInTask = PxMin((bodyCount-firstBody), bodiesPerTask);
+		PxsPreIntegrateTask* newTask = PX_PLACEMENT_NEW(&taskBlock[slot], PxsPreIntegrateTask)(*this, bodyArray, originalBodyArray,
+			nodeIndexArray, solverBodyPool, solverBodyDataPool, dt, bodyCount, &maxSolverPositionIterations,
+			&maxSolverVelocityIterations, firstBody, bodiesInTask, mGravity);
+
+		// Set the continuation, then release this function's reference to the new task.
+		newTask->setContinuation(&task);
+		newTask->removeReference();
+	}
+
+	// NOTE(review): this clear is issued after the tasks above were released; it looks safe because the static preIntegrationParallel() kernel only stores zeros into solverBodyPool - confirm before reordering.
+	PxMemZero(solverBodyPool, bodyCount * sizeof(PxSolverBody));
+}
+
+inline void WaitBodyRequiredState(volatile PxU32* state, PxU32 requiredState)
+{	// Spins, re-reading the volatile *state, until it equals requiredState. There is no yield and no timeout.
+	while(*state != requiredState) {}
+}
+
+void solveParallel(SOLVER_PARALLEL_METHOD_ARGS) // Free-function entry point; its parameter list is supplied by the SOLVER_PARALLEL_METHOD_ARGS macro.
+{
+	context.solveParallel(params, islandSim); // Forwards to the solveParallel() member of 'context'.
+}
+
+
+void DynamicsContext::solveParallel(SolverIslandParams& params, IG::IslandSim& islandSim)
+{
+	// Run the parallel solve of the solver core selected by mFrictionType; its return value is the progress target used below.
+	PxI32 requiredProgress = mSolverCore[mFrictionType]->solveVParallelAndWriteBack(params);
+
+	// Hand the shared counter params.constraintIndex2 and the target to WAIT_FOR_PROGRESS_NO_TIMER
+	// (presumably a spin-wait until the counter reaches the target - confirm against the macro definition).
+	PxI32* progressCounter = &params.constraintIndex2;
+	WAIT_FOR_PROGRESS_NO_TIMER(progressCounter, requiredProgress);
+
+	integrateCoreParallel(params, islandSim);
+}
+
+void DynamicsContext::integrateCoreParallel(SolverIslandParams& params, IG::IslandSim& islandSim)
+{	// Integrates articulations, then rigid bodies, in chunks claimed from the shared counter params.bodyIntegrationListIndex.
+	const PxI32 unrollCount = 128;	// list slots claimed per atomicAdd
+	// Claimed indices address articulations first ([0, numArtics)); rigid bodies follow, offset by numArtics.
+	PxI32* bodyIntegrationListIndex = &params.bodyIntegrationListIndex;
+	// NOTE(review): the "- unrollCount" implies atomicAdd returns the post-add value, making index the first slot of the claimed chunk - confirm.
+	PxI32 index = physx::shdfnd::atomicAdd(bodyIntegrationListIndex, unrollCount) - unrollCount;
+
+	const PxI32 numBodies = PxI32(params.bodyListSize);
+	const PxI32 numArtics = PxI32(params.articulationListSize);
+
+	Cm::SpatialVector* PX_RESTRICT motionVelocityArray = params.motionVelocityArray;
+	PxsBodyCore*const* bodyArray = params.bodyArray;
+	PxsRigidBody** PX_RESTRICT rigidBodies = params.rigidBodies;
+	ArticulationSolverDesc* PX_RESTRICT articulationListStart = params.articulationListStart;
+
+
+	PxI32 numIntegrated = 0;	// objects integrated by this call; added to params.numObjectsIntegrated at the end
+
+	PxI32 bodyRemainder = unrollCount;	// slots still unused in the currently claimed chunk
+	// Phase 1: articulations.
+	while(index < numArtics)
+	{
+		const PxI32 remainder = PxMin(numArtics - index, unrollCount);
+		bodyRemainder -= remainder;
+
+		for(PxI32 a = 0; a < remainder; ++a, index++)
+		{
+			const PxI32 i = index;
+			{
+				PX_PROFILE_ZONE("Articulations.integrate", mContextID);
+
+				ArticulationPImpl::updateBodies(articulationListStart[i], mDt);
+			}
+
+			++numIntegrated;
+		}
+		if(bodyRemainder == 0)	// the whole chunk was articulations: claim another one; otherwise the leftover slots are spent on bodies below
+		{
+			index = physx::shdfnd::atomicAdd(bodyIntegrationListIndex, unrollCount) - unrollCount;
+			bodyRemainder = unrollCount;
+		}
+	}	
+
+	index -= numArtics;	// from here on index is relative to the rigid body arrays
+
+	const PxI32 unrollPlusArtics = unrollCount + numArtics;	// subtracted from the counter value to obtain a body-relative chunk start
+
+	PxSolverBody* PX_RESTRICT solverBodies = params.bodyListStart;
+	PxSolverBodyData* PX_RESTRICT solverBodyData = params.bodyDataList + params.solverBodyOffset+1;	// body data is read starting at bodyDataList[solverBodyOffset + 1]
+	// Phase 2: rigid bodies.
+	while(index < numBodies)
+	{
+		const PxI32 remainder = PxMin(numBodies - index, bodyRemainder);
+		bodyRemainder -= remainder;
+		for(PxI32 a = 0; a < remainder; ++a, index++)
+		{
+			const PxI32 prefetch = PxMin(index+4, numBodies - 1);	// look-ahead index, clamped to the last body
+			Ps::prefetchLine(bodyArray[prefetch]);
+			Ps::prefetchLine(bodyArray[prefetch],128);
+			Ps::prefetchLine(&solverBodies[index],128);
+			Ps::prefetchLine(&motionVelocityArray[index],128);
+			Ps::prefetchLine(&bodyArray[index+32]);	// prefetches the pointer array itself; the address is only prefetched, never dereferenced here
+			Ps::prefetchLine(&rigidBodies[prefetch]);
+			
+			PxSolverBodyData& data = solverBodyData[index];
+
+			integrateCore(motionVelocityArray[index].linear, motionVelocityArray[index].angular,
+				solverBodies[index], data, mDt);
+			// Keep the previous pose in mLastTransform, then write the pose and velocities from the solver data back to the body core.
+			PxsRigidBody& rBody = *rigidBodies[index];
+			PxsBodyCore& core = rBody.getCore();
+			rBody.mLastTransform = core.body2World;
+			core.body2World = data.body2World;
+			core.linearVelocity = data.linearVelocity;
+			core.angularVelocity = data.angularVelocity;
+
+			bool hasStaticTouch = islandSim.getIslandStaticTouchCount(IG::NodeIndex(data.nodeIndex)) != 0;
+			sleepCheck(rigidBodies[index], mDt, mInvDt, mEnableStabilization, mUseAdaptiveForce, motionVelocityArray[index], hasStaticTouch);
+
+			++numIntegrated;
+		}
+
+		{	// claim the next chunk and convert its start to a body-relative index
+			index = physx::shdfnd::atomicAdd(bodyIntegrationListIndex, unrollCount) - unrollPlusArtics;
+			bodyRemainder = unrollCount;
+		}
+	}
+
+	Ps::memoryBarrier();
+	physx::shdfnd::atomicAdd(&params.numObjectsIntegrated, numIntegrated);	// add this call's contribution to the shared total
+}
+
+// PxConstraintAllocator that serves constraint data from a constraint block stream and friction data from a
+// friction patch stream pair, adding every requested constraint byte count to a caller-owned counter.
+class BlockAllocator : public PxConstraintAllocator
+{
+	PxsConstraintBlockManager& mConstraintBlockManager;
+	PxcConstraintBlockStream& mConstraintBlockStream;
+	FrictionPatchStreamPair& mFrictionPatchStreamPair;
+	PxU32& mTotalConstraintByteSize;	// caller-owned running total of requested constraint bytes
+
+public:
+	BlockAllocator(PxsConstraintBlockManager& blockManager, PxcConstraintBlockStream& blockStream,
+		FrictionPatchStreamPair& frictionStreams, PxU32& byteCounter)
+		: mConstraintBlockManager(blockManager)
+		, mConstraintBlockStream(blockStream)
+		, mFrictionPatchStreamPair(frictionStreams)
+		, mTotalConstraintByteSize(byteCounter)
+	{}
+
+	// Adds 'size' to the byte counter (whatever the reservation returns) and reserves 'size' bytes from the block stream.
+	virtual PxU8* reserveConstraintData(const PxU32 size)
+	{
+		mTotalConstraintByteSize += size;
+		return mConstraintBlockStream.reserve(size, mConstraintBlockManager);
+	}
+
+	// Reserves 'size' bytes from the friction patch stream pair; the byte counter is not touched.
+	virtual PxU8* reserveFrictionData(const PxU32 size) { return mFrictionPatchStreamPair.reserve<PxU8>(size); }
+
+	// Returns the cookie unchanged.
+	virtual PxU8* findInputPatches(PxU8* frictionCookie) { return frictionCookie; }
+
+	PX_NOCOPY(BlockAllocator)
+
+};
+
+
+
+static PxU32 createFinalizeContacts_Parallel(PxSolverBodyData* solverBodyData, ThreadContext& mThreadContext, DynamicsContext& context,
+									  PxU32 startIndex, PxU32 endIndex, PxsContactManagerOutputIterator& outputs)
+{	// Prepares solver constraints for the batch headers [startIndex, endIndex) of mThreadContext; returns the number of axis constraints counted.
+	const PxFrictionType::Enum frictionType = context.getFrictionType();
+	const PxReal bounceThreshold = context.getBounceThreshold();
+	const PxReal frictionOffsetThreshold = context.getFrictionOffsetThreshold();
+	const PxReal dt = context.getDt();
+	const PxReal invDt = context.getInvDt();
+	// mThreadContext (a parameter despite the member-style name) supplies the ordered descriptors, the batch headers and the block manager.
+	PxSolverConstraintDesc* contactDescPtr = mThreadContext.orderedContactConstraints;
+
+	PxConstraintBatchHeader* headers = mThreadContext.contactConstraintBatchHeaders;
+	
+	PxI32 axisConstraintCount = 0;
+	ThreadContext* threadContext = context.getThreadContext();	// separate context acquired for this call; handed back with putThreadContext() at the end
+	threadContext->mConstraintBlockStream.reset(); //ensure there's no left-over memory that belonged to another island
+
+	PxTransform idt(PxIdentity);	// used as body frame 1 when a contact has no second rigid core
+	// Allocations go through the acquired context's streams; the byte total accumulates in threadContext->mConstraintSize.
+	BlockAllocator blockAllocator(mThreadContext.mConstraintBlockManager, threadContext->mConstraintBlockStream, threadContext->mFrictionPatchStreamPair, threadContext->mConstraintSize);
+
+	const PxReal ccdMaxSeparation = context.getCCDSeparationThreshold();
+
+	for(PxU32 a = startIndex; a < endIndex; ++a)
+	{
+		// The type of the header's first descriptor selects the preparation path for the whole batch.
+		PxConstraintBatchHeader& header = headers[a];
+
+		if(contactDescPtr[header.mStartIndex].constraintLengthOver16 == DY_SC_TYPE_RB_CONTACT)
+		{
+			SolverConstraintPrepState::Enum state = SolverConstraintPrepState::eUNBATCHABLE;
+
+			PxSolverContactDesc blockDescs[4];	// sized for the largest stride handled here (4)
+			PxsContactManagerOutput* cmOutputs[4];
+			PxsContactManager* cms[4];
+			for (PxU32 i = 0; i < header.mStride; ++i)	// fill one contact prep descriptor per constraint in the batch
+			{
+				PxSolverConstraintDesc& desc = contactDescPtr[header.mStartIndex + i];
+				PxSolverContactDesc& blockDesc = blockDescs[i];
+				PxsContactManager* cm = reinterpret_cast<PxsContactManager*>(desc.constraint);
+
+				cms[i] = cm;
+
+				PxcNpWorkUnit& unit = cm->getWorkUnit();
+
+				cmOutputs[i] = &outputs.getContactManager(unit.mNpIndex);
+				// A link index other than 0xffff selects solverBodyData[0] instead of the body's own data slot.
+				PxSolverBodyData& data0 = desc.linkIndexA != 0xffff ? solverBodyData[0] : solverBodyData[desc.bodyADataIndex];
+				PxSolverBodyData& data1 = desc.linkIndexB != 0xffff ? solverBodyData[0] : solverBodyData[desc.bodyBDataIndex];
+
+				blockDesc.data0 = &data0;
+				blockDesc.data1 = &data1;
+				// Union of both cores' flags; consulted below for eENABLE_SPECULATIVE_CCD.
+				PxU8 flags = unit.rigidCore0->mFlags;
+				if (unit.rigidCore1)
+					flags |= PxU8(unit.rigidCore1->mFlags);
+
+				blockDesc.bodyFrame0 = unit.rigidCore0->body2World;
+				blockDesc.bodyFrame1 = unit.rigidCore1 ? unit.rigidCore1->body2World : idt;
+				blockDesc.shapeInteraction = cm->getShapeInteraction();
+				blockDesc.contactForces = cmOutputs[i]->contactForces;
+				blockDesc.desc = &desc;
+				blockDesc.body0 = desc.bodyA;
+				blockDesc.body1 = desc.bodyB;
+				blockDesc.hasForceThresholds = !!(unit.flags & PxcNpWorkUnitFlag::eFORCE_THRESHOLD);
+				blockDesc.disableStrongFriction = !!(unit.flags & PxcNpWorkUnitFlag::eDISABLE_STRONG_FRICTION);
+				blockDesc.bodyState0 = (unit.flags & PxcNpWorkUnitFlag::eARTICULATION_BODY0) ? PxSolverContactDesc::eARTICULATION : PxSolverContactDesc::eDYNAMIC_BODY;
+				blockDesc.bodyState1 = (unit.flags & PxcNpWorkUnitFlag::eARTICULATION_BODY1) ? PxSolverContactDesc::eARTICULATION : (unit.flags & PxcNpWorkUnitFlag::eHAS_KINEMATIC_ACTOR) ? PxSolverContactDesc::eKINEMATIC_BODY :
+					((unit.flags & PxcNpWorkUnitFlag::eDYNAMIC_BODY1) ? PxSolverContactDesc::eDYNAMIC_BODY : PxSolverContactDesc::eSTATIC_BODY);
+				//blockDesc.flags = unit.flags;
+				// Dominance is mapped to an inverse-mass scale of exactly 1 or 0, applied to both linear and angular terms.
+				PxReal dominance0 = unit.dominance0 ? 1.f : 0.f;
+				PxReal dominance1 = unit.dominance1 ? 1.f : 0.f;
+
+				blockDesc.mInvMassScales.linear0 = blockDesc.mInvMassScales.angular0 = dominance0;
+				blockDesc.mInvMassScales.linear1 = blockDesc.mInvMassScales.angular1 = dominance1;
+				blockDesc.restDistance = unit.restDistance;
+				blockDesc.frictionPtr = unit.frictionDataPtr;
+				blockDesc.frictionCount = unit.frictionPatchCount;
+				blockDesc.maxCCDSeparation = (flags & PxRigidBodyFlag::eENABLE_SPECULATIVE_CCD) ? ccdMaxSeparation : PX_MAX_F32;
+
+			}
+			// Only full batches of four go to the 4-wide prep method; any other stride leaves state at eUNBATCHABLE.
+			if(header.mStride == 4)
+			{
+				//KS - todo - plumb in axisConstraintCount into this method to keep track of the number of axes
+				state = createFinalizeMethods4[frictionType](cmOutputs, *threadContext,
+					 blockDescs,
+					 invDt,
+					 bounceThreshold,
+					 frictionOffsetThreshold,
+					 context.getCorrelationDistance(),
+					 blockAllocator);
+
+			}
+			if(SolverConstraintPrepState::eSUCCESS != state)	// not batched (or batching did not succeed): prepare each constraint individually
+			{
+				for(PxU32 i = 0; i < header.mStride; ++i)
+				{
+					PxSolverConstraintDesc& desc = contactDescPtr[header.mStartIndex+i];
+					PxsContactManager* cm = reinterpret_cast<PxsContactManager*>(desc.constraint);
+					PxcNpWorkUnit& n = cm->getWorkUnit();
+
+					PxsContactManagerOutput& output = outputs.getContactManager(n.mNpIndex);
+					
+					createFinalizeMethods[frictionType](blockDescs[i], output, *threadContext,
+						invDt, bounceThreshold, frictionOffsetThreshold, context.getCorrelationDistance(), blockAllocator);
+			
+					getContactManagerConstraintDesc(output,*cm,desc);
+				}
+			}
+			// Copy the friction data pointer/count from the prep descriptors back into the work units and tally the axis constraints.
+			for (PxU32 i = 0; i < header.mStride; ++i)
+			{
+				PxsContactManager* cm = cms[i];
+
+				PxcNpWorkUnit& unit = cm->getWorkUnit();
+				unit.frictionDataPtr = blockDescs[i].frictionPtr;
+				unit.frictionPatchCount = blockDescs[i].frictionCount;
+				axisConstraintCount += blockDescs[i].axisConstraintCount;
+
+			}
+		}
+		else if(contactDescPtr[header.mStartIndex].constraintLengthOver16 == DY_SC_TYPE_RB_1D)
+		{
+			// 1D constraint batch: gather shader and prep descriptors, then set the constraints up (batched when DY_BATCH_1D is enabled).
+			SolverConstraintShaderPrepDesc shaderDescs[4];
+			PxSolverConstraintPrepDesc descs[4];
+
+			PxTransform id(PxIdentity);	// pose used when a constraint has no body0/body1
+
+			for (PxU32 i = 0; i < header.mStride; ++i)
+			{
+				PxSolverConstraintDesc& desc = contactDescPtr[header.mStartIndex + i];
+				const Constraint* constraint = reinterpret_cast<const Constraint*>(desc.constraint);
+
+				SolverConstraintShaderPrepDesc& shaderPrepDesc = shaderDescs[i];
+				PxSolverConstraintPrepDesc& prepDesc = descs[i];
+
+				const PxConstraintSolverPrep solverPrep = constraint->solverPrep;
+				const void* constantBlock = constraint->constantBlock;
+				const PxU32 constantBlockByteSize = constraint->constantBlockSize;
+				const PxTransform& pose0 = (constraint->body0 ? constraint->body0->getPose() : id);
+				const PxTransform& pose1 = (constraint->body1 ? constraint->body1->getPose() : id);
+				const PxSolverBody* sbody0 = desc.bodyA;
+				const PxSolverBody* sbody1 = desc.bodyB;
+				PxSolverBodyData* sbodyData0 = &solverBodyData[desc.linkIndexA != PxSolverConstraintDesc::NO_LINK ? 0 : desc.bodyADataIndex];	// descriptors with a link index use slot 0
+				PxSolverBodyData* sbodyData1 = &solverBodyData[desc.linkIndexB != PxSolverConstraintDesc::NO_LINK ? 0 : desc.bodyBDataIndex];
+
+				shaderPrepDesc.constantBlock = constantBlock;
+				shaderPrepDesc.constantBlockByteSize = constantBlockByteSize;
+				shaderPrepDesc.constraint = constraint;
+				shaderPrepDesc.solverPrep = solverPrep;
+
+				prepDesc.desc = &desc;
+				prepDesc.bodyFrame0 = pose0;
+				prepDesc.bodyFrame1 = pose1;
+				prepDesc.data0 = sbodyData0;
+				prepDesc.data1 = sbodyData1;
+				prepDesc.body0 = sbody0;
+				prepDesc.body1 = sbody1;
+				prepDesc.linBreakForce = constraint->linBreakForce;
+				prepDesc.angBreakForce = constraint->angBreakForce;
+				prepDesc.writeback = &context.getConstraintWriteBackPool()[constraint->index];
+				prepDesc.disablePreprocessing = !!(constraint->flags & PxConstraintFlag::eDISABLE_PREPROCESSING);				
+				prepDesc.improvedSlerp = !!(constraint->flags & PxConstraintFlag::eIMPROVED_SLERP);
+				prepDesc.driveLimitsAreForces = !!(constraint->flags & PxConstraintFlag::eDRIVE_LIMITS_ARE_FORCES);
+				prepDesc.minResponseThreshold = constraint->minResponseThreshold;
+			}
+
+#if DY_BATCH_1D
+			SolverConstraintPrepState::Enum state = SolverConstraintPrepState::eUNBATCHABLE;
+			if(header.mStride == 4)
+			{
+				PxU32 totalRows;
+				state = setupSolverConstraint4
+					(shaderDescs, descs, dt, invDt, totalRows,
+					blockAllocator);
+				// NOTE(review): totalRows is added even when state is not eSUCCESS; assumes setupSolverConstraint4 always writes it - confirm.
+				axisConstraintCount += totalRows;
+			}
+			if(state != SolverConstraintPrepState::eSUCCESS)
+#endif
+			{	// per-constraint setup: always when DY_BATCH_1D is off, otherwise only when the 4-wide setup did not succeed
+				for(PxU32 i = 0; i < header.mStride; ++i)
+				{
+					axisConstraintCount += SetupSolverConstraint(shaderDescs[i], descs[i], blockAllocator, dt, invDt);
+				}
+			}
+		}
+	}
+	// The count goes into the acquired context's stats rather than the shared mThreadContext.
+	threadContext->getSimStats().numAxisSolverConstraints += axisConstraintCount;
+
+	context.putThreadContext(threadContext);
+	return PxU32(axisConstraintCount); //Can't write to mThreadContext as it's shared!!!!
+}
+
+// Task that runs createFinalizeContacts_Parallel() over the batch headers [mStartIndex, mEndIndex).
+class PxsCreateFinalizeContactsTask : public Cm::Task
+{
+	PxsCreateFinalizeContactsTask& operator=(const PxsCreateFinalizeContactsTask&);	// private and only declared here
+
+public:
+	PxsCreateFinalizeContactsTask(const PxU32 constraintCount, PxSolverConstraintDesc* descs, PxSolverBodyData* bodyData,
+		ThreadContext& islandThreadContext, DynamicsContext& dynamicsContext, PxU32 firstHeader, PxU32 lastHeader,
+		PxsContactManagerOutputIterator& outputIterator)
+		: mNumConstraints(constraintCount)
+		, mDescArray(descs)
+		, mSolverBodyData(bodyData)
+		, mThreadContext(islandThreadContext)
+		, mDynamicsContext(dynamicsContext)
+		, mOutputs(outputIterator)
+		, mStartIndex(firstHeader)
+		, mEndIndex(lastHeader)
+	{
+	}
+
+	virtual void runInternal() { createFinalizeContacts_Parallel(mSolverBodyData, mThreadContext, mDynamicsContext, mStartIndex, mEndIndex, mOutputs); }
+	virtual const char* getName() const { return "PxsDynamics.createFinalizeContacts"; }
+
+	const PxU32 mNumConstraints;				// stored; not read by this class
+	PxSolverConstraintDesc* mDescArray;			// stored; not read by this class
+	PxSolverBodyData* mSolverBodyData;
+	ThreadContext& mThreadContext;
+	DynamicsContext& mDynamicsContext;
+	PxsContactManagerOutputIterator& mOutputs;
+	PxU32 mStartIndex;							// first batch header processed (inclusive)
+	PxU32 mEndIndex;							// last batch header processed (exclusive)
+};
+
+void PxsSolverCreateFinalizeConstraintsTask::runInternal()
+{	// Groups the island's ordered constraint descriptors into batch headers, then spawns tasks that prepare those headers.
+	ThreadContext& mThreadContext = *mIslandContext.mThreadContext;
+
+
+	
+	PxU32 descCount = mThreadContext.mNumDifferentBodyConstraints;	// descriptors walked by the batching loop below
+	PxU32 selfConstraintDescCount = mThreadContext.contactDescArraySize - mThreadContext.mNumDifferentBodyConstraints;	// the rest; each gets its own stride-1 header further down
+	// On entry each element is used as the exclusive end index of a partition in the ordered descriptors; the loop overwrites elements with per-partition header counts.
+	Ps::Array<PxU32>& accumulatedConstraintsPerPartition = mThreadContext.mConstraintsPerPartition;
+
+	PxU32 numHeaders = 0;
+	PxU32 currentPartition = 0;
+	PxU32 maxJ = descCount == 0 ? 0 : accumulatedConstraintsPerPartition[0];	// end index of the current partition
+
+	const PxU32 maxBatchPartition = 0xFFFFFFFF;	// partitions at or beyond this index would not be batched; with this value the limit is effectively off
+
+	const PxU32 maxBatchSize = mEnhancedDeterminism ? 1u : 4u;	// enhanced determinism forces one descriptor per header
+
+	PxU32 headersPerPartition = 0;
+	for(PxU32 a = 0; a < descCount;)
+	{
+		
+
+		PxU32 loopMax = PxMin(maxJ - a, maxBatchSize);	// descriptors left in the current partition, capped at the batch size
+		PxU16 j = 0;
+		if(loopMax > 0)
+		{
+			PxConstraintBatchHeader& header = mThreadContext.contactConstraintBatchHeaders[numHeaders++];
+			
+			j=1;
+			PxSolverConstraintDesc& desc = mThreadContext.orderedContactConstraints[a];
+			if(!isArticulationConstraint(desc) && (desc.constraintLengthOver16 == DY_SC_TYPE_RB_CONTACT || 
+				desc.constraintLengthOver16 == DY_SC_TYPE_RB_1D) && currentPartition < maxBatchPartition)
+			{	// extend the batch while the following descriptors have the same type and are not articulation constraints
+				for(; j < loopMax && desc.constraintLengthOver16 == mThreadContext.orderedContactConstraints[a+j].constraintLengthOver16 && 
+					!isArticulationConstraint(mThreadContext.orderedContactConstraints[a+j]); ++j);
+			}
+			header.mStartIndex = a;
+			header.mStride = j;
+			headersPerPartition++;
+		}
+		if(maxJ == (a + j) && maxJ != descCount)	// this partition is exhausted and more descriptors remain
+		{
+			//Go to next partition!
+			accumulatedConstraintsPerPartition[currentPartition] = headersPerPartition;
+			headersPerPartition = 0;
+			currentPartition++;
+			maxJ = accumulatedConstraintsPerPartition[currentPartition];	// read the next partition's end index before that element is overwritten
+		}
+		a+= j;
+	}
+	if(descCount)
+		accumulatedConstraintsPerPartition[currentPartition] = headersPerPartition;	// header count of the final partition
+
+	
+
+	accumulatedConstraintsPerPartition.forceSize_Unsafe(mThreadContext.mMaxPartitions);
+
+	PxU32 numDifferentBodyBatchHeaders = numHeaders;
+	// The remaining descriptors (indices descCount and up) are never batched: one header each.
+	for(PxU32 a = 0; a < selfConstraintDescCount; ++a)
+	{
+		PxConstraintBatchHeader& header = mThreadContext.contactConstraintBatchHeaders[numHeaders++];
+		header.mStartIndex = a + descCount;
+		header.mStride = 1;
+	}
+
+	PxU32 numSelfConstraintBatchHeaders = numHeaders - numDifferentBodyBatchHeaders;
+
+	mThreadContext.numDifferentBodyBatchHeaders = numDifferentBodyBatchHeaders;
+	mThreadContext.numSelfConstraintBatchHeaders = numSelfConstraintBatchHeaders;
+	mThreadContext.numContactConstraintBatches = numHeaders;
+
+	PX_UNUSED(descCount);
+
+	{
+		PxSolverConstraintDesc* descBegin = mThreadContext.orderedContactConstraints;
+
+		const PxU32 numThreads = getTaskManager()->getCpuDispatcher()->getWorkerCount();
+				
+		//Choose an appropriate number of constraint prep tasks. This must be proportionate to the number of constraints to prep and the number
+		//of worker threads available.
+		const PxU32 TaskBlockSize = 16;	// headers per task when the coarse split yields fewer tasks than worker threads
+		const PxU32 TaskBlockLargeSize = 64;	// headers per task for the initial, coarse split
+		const PxU32 BlockAllocationSize = 64;	// task objects allocated from the task pool at a time
+
+		PxU32 numTasks = (numHeaders+TaskBlockLargeSize-1)/TaskBlockLargeSize;
+
+		if(numTasks)
+		{
+
+			if(numTasks < numThreads)
+				numTasks = PxMax(1u, (numHeaders+TaskBlockSize-1)/TaskBlockSize);
+
+			const PxU32 constraintsPerTask = (numHeaders + numTasks-1)/numTasks;	// headers per task, rounded up
+
+			for(PxU32 i = 0; i < numTasks; i+=BlockAllocationSize)
+			{
+				PxU32 blockSize = PxMin(numTasks - i, BlockAllocationSize);
+
+				PxsCreateFinalizeContactsTask* tasks = reinterpret_cast<PxsCreateFinalizeContactsTask*>(mContext.getTaskPool().allocate(sizeof(PxsCreateFinalizeContactsTask)*blockSize));
+
+				for(PxU32 a = 0; a < blockSize; ++a)
+				{
+					PxU32 startIndex = (a + i) * constraintsPerTask;
+					PxU32 endIndex = PxMin(startIndex + constraintsPerTask, numHeaders);	// the last task may cover fewer headers
+					PxsCreateFinalizeContactsTask* pTask = PX_PLACEMENT_NEW(&tasks[a], PxsCreateFinalizeContactsTask( descCount, descBegin, mContext.mSolverBodyDataPool.begin(), mThreadContext, mContext, startIndex, endIndex, mOutputs));
+
+					pTask->setContinuation(mCont);
+					pTask->removeReference();
+				}
+			}
+		}
+	}
+}
+
+}
+}
+
+
diff --git a/PhysX_3.4/Source/LowLevelDynamics/src/DyDynamics.h b/PhysX_3.4/Source/LowLevelDynamics/src/DyDynamics.h
new file mode 100644
index 00000000..9fb1d94d
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelDynamics/src/DyDynamics.h
@@ -0,0 +1,483 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+#ifndef DY_DYNAMICS_H
+#define DY_DYNAMICS_H
+
+#include "PxvConfig.h"
+#include "CmSpatialVector.h"
+#include "CmTask.h"
+#include "CmPool.h"
+#include "PxcThreadCoherentCache.h"
+#include "DyThreadContext.h"
+#include "PxcConstraintBlockStream.h"
+#include "DySolverBody.h"
+#include "DyContext.h"
+#include "PxsIslandManagerTypes.h"
+#include "PxvNphaseImplementationContext.h"
+#include "solver/PxSolverDefs.h"
+
+namespace physx
+{
+
+namespace Cm
+{
+	class FlushPool;
+}
+
+namespace IG
+{
+	class SimpleIslandManager;
+	struct Edge;
+}
+
+class PxsRigidBody;
+
+class PxsStreamedThresholdTable;
+
+struct PxsBodyCore;
+struct PxsIslandObjects;
+class PxsIslandIndices;
+struct PxsIndexedInteraction;
+class PxsIslandManager;
+struct PxsIndexedConstraint;
+struct PxsIndexedContactManager;
+class PxsHeapMemoryAllocator;
+class PxsMemoryManager;
+class PxsDefaultMemoryManager;
+struct PxSolverConstraintDesc;
+
+namespace Cm
+{
+	class Bitmap;
+	class SpatialVector;
+}
+
+namespace Dy
+{
+	class SolverCore;
+	struct SolverIslandParams;
+	struct ArticulationSolverDesc;
+	class Articulation;
+	class DynamicsContext;
+
+
+
+
+#define SOLVER_PARALLEL_METHOD_ARGS									\
+	DynamicsContext&	context,									\
+	SolverIslandParams& params,										\
+	IG::IslandSim& islandSim
+// ^ Parameter list (context, params, islandSim) shared by the free solver functions declared below.
+//typedef	void (*PxsSolveParallelMethod)(SOLVER_PARALLEL_METHOD_ARGS);
+//extern PxsSolveParallelMethod solveParallel[3];
+// (The two lines above are a disabled function-pointer table variant.) "Couloum" below is the identifier's actual spelling.
+void solveParallel(SOLVER_PARALLEL_METHOD_ARGS);
+void solveParallelCouloumFriction(SOLVER_PARALLEL_METHOD_ARGS);
+
+
+struct SolverIslandObjects;
+
+/**
+\brief Solver body pool (array) that enforces 128-byte alignment for base address of array.
+\note This reduces cache misses on platforms with 128-byte-size cache lines by aligning the start of the array to the beginning of a cache line.
+*/
+class SolverBodyPool : public Ps::Array<PxSolverBody, Ps::AlignedAllocator<128, Ps::ReflectionAllocator<PxSolverBody> > > 
+{ 
+	PX_NOCOPY(SolverBodyPool)	// NOTE(review): presumably disables copying of the pool (macro defined elsewhere) - confirm
+public:
+	SolverBodyPool() {}			// empty constructor; this class declares no data members beyond the Ps::Array base
+};
+
+/**
+\brief Solver body data pool (array) that enforces 128-byte alignment for base address of array.
+\note This reduces cache misses on platforms with 128-byte-size cache lines by aligning the start of the array to the beginning of a cache line.
+*/
+class SolverBodyDataPool : public Ps::Array<PxSolverBodyData, Ps::AlignedAllocator<128, Ps::ReflectionAllocator<PxSolverBodyData> > >
+{
+	PX_NOCOPY(SolverBodyDataPool)	// NOTE(review): presumably disables copying of the pool (macro defined elsewhere) - confirm
+public:
+	SolverBodyDataPool() {}			// empty constructor; this class declares no data members beyond the Ps::Array base
+};
+
+class SolverConstraintDescPool : public Ps::Array<PxSolverConstraintDesc, Ps::AlignedAllocator<128, Ps::ReflectionAllocator<PxSolverConstraintDesc> > >
+{	// Array of PxSolverConstraintDesc using the same 128-byte Ps::AlignedAllocator as the solver body pools declared in this header.
+	PX_NOCOPY(SolverConstraintDescPool)	// NOTE(review): presumably disables copying of the pool (macro defined elsewhere) - confirm
+public:
+	SolverConstraintDescPool() { }		// empty constructor; this class declares no data members beyond the Ps::Array base
+};
+
+/**
+\brief Encapsulates an island's context
+*/
+
+struct IslandContext
+{
+	//The thread context for this island (set in the island start task, released in the island end task)
+	ThreadContext* mThreadContext;
+	PxsIslandIndices		mCounts;	// NOTE(review): presumably the island's per-type index/count record - confirm against PxsIslandIndices
+};
+
+
+/**
+\brief Encapsulates the data used by the constraint solver.
+*/
+
+#if PX_VC 
+    #pragma warning(push)
+	#pragma warning( disable : 4324 ) // Padding was added at the end of a structure because of a __declspec(align) value.
+#endif
+
+
+class DynamicsContext : public Context
+{
+	PX_NOCOPY(DynamicsContext)
+public:
+	
+	/**
+	\brief Creates a DynamicsContext associated with a PxsContext
+	\return A pointer to the newly-created DynamicsContext.
+	*/
+	static DynamicsContext*	create(	PxcNpMemBlockPool* memBlockPool,
+									PxcScratchAllocator& scratchAllocator,
+									Cm::FlushPool& taskPool,
+									PxvSimStats& simStats,
+									PxTaskManager* taskManager,
+									Ps::VirtualAllocatorCallback* allocator,
+									PxsMaterialManager* materialManager,
+									IG::IslandSim* accurateIslandSim,
+									PxU64 contextID,
+									const bool enableStabilization,
+									const bool useEnhancedDeterminism,
+									const bool useAdaptiveForce
+									);
+	
+	/**
+	\brief Destroys this DynamicsContext
+	*/
+	void						destroy();
+
+	/**
+	\brief Returns the static world solver body
+	\return The static world solver body.
+	*/
+	PX_FORCE_INLINE PxSolverBody&		getWorldSolverBody()					{ return mWorldSolverBody;  }
+	/** \brief Returns the Cm::FlushPool referenced by mTaskPool. */
+	PX_FORCE_INLINE Cm::FlushPool&			getTaskPool()						{ return mTaskPool;			}
+	/** \brief Returns *mThresholdStream (dereferenced without a null check; the member is not declared in this class). */
+	PX_FORCE_INLINE ThresholdStream&		getThresholdStream()					{ return *mThresholdStream;	}
+	/** \brief Returns mSimStats (the member is not declared in this class). */
+	PX_FORCE_INLINE PxvSimStats&			getSimStats()							{ return mSimStats;			}
+
+#if PX_ENABLE_SIM_STATS
+	void									addThreadStats(const ThreadContext::ThreadSimStats& stats);
+#endif
+
+	/**
+	\brief The entry point for the constraint solver. 
+	\param[in]	dt	The simulation time-step
+	\param[in]	continuation The continuation task for the solver
+
+	This method is called after the island generation has completed. Its main responsibilities are:
+	(1) Reserving the solver body pools
+	(2) Initializing the static and kinematic solver bodies, which are shared resources between islands.
+	(3) Construct the solver task chains for each island
+
+	Each island is solved as an independent solver task chain in parallel.
+
+	*/
+	// NOTE(review): only dt and continuation are documented above; the remaining update() parameters are undocumented here.
+	virtual void						update(IG::SimpleIslandManager& simpleIslandManager, PxBaseTask* continuation, PxBaseTask* lostTouchTask,
+		PxsContactManager** foundPatchManagers, PxU32 nbFoundPatchManagers, PxsContactManager** lostPatchManagers, PxU32 nbLostPatchManagers,
+		PxU32 maxPatchesPerCM, PxsContactManagerOutputIterator& iter, PxsContactManagerOutput* gpuOutputs, const PxReal dt, const PxVec3& gravity, const PxU32 bitMapWordCounts);
+	// No-op in this class: the body is empty and all parameters are unnamed.
+	virtual void						processLostPatches(IG::SimpleIslandManager& /*simpleIslandManager*/, PxsContactManager** /*lostPatchManagers*/, PxU32 /*nbLostPatchManagers*/, PxsContactManagerOutputIterator& /*iterator*/){}
+
+	virtual void						updateBodyCore(PxBaseTask* continuation);
+	/** \brief Stores the given simulation controller pointer in mSimulationController. */
+	virtual void						setSimulationController(PxsSimulationController* simulationController ){ mSimulationController = simulationController; }
+	/**
+	\brief This method combines the results of several islands, e.g. constructing scene-level simulation statistics and merging together threshold streams for contact notification.
+	*/
+	virtual void							mergeResults();
+	// No-op in this class: the body is empty, so the three reference arguments are left untouched.
+	virtual void							getDataStreamBase(void*& /*contactStreamBase*/, void*& /*patchStreamBase*/, void*& /*forceAndIndicesStreamBase*/){}
+
+	/**
+	\brief Allocates and returns a thread context object.
+	\return A thread context.
+	*/
+	PX_FORCE_INLINE ThreadContext*					getThreadContext()
+	{
+		return mThreadContextPool.get();
+	}
+
+	/**
+	\brief Returns a thread context to the thread context pool.
+	\param[in] context The thread context to return to the thread context pool. 
+	*/
+	void								putThreadContext(ThreadContext* context)
+	{
+		mThreadContextPool.put(context);
+	}
+
+
+	PX_FORCE_INLINE	PxU32					getKinematicCount()		const	{ return mKinematicCount;	}
+	PX_FORCE_INLINE	PxU64					getContextId()			const	{ return mContextID;		}
+
+protected:
+
+	/**
+	\brief Constructor for DynamicsContext
+	*/
+										DynamicsContext(PxcNpMemBlockPool* memBlockPool,
+														PxcScratchAllocator& scratchAllocator,
+														Cm::FlushPool& taskPool,
+														PxvSimStats& simStats,
+														PxTaskManager* taskManager,
+														Ps::VirtualAllocatorCallback* allocator,
+														PxsMaterialManager* materialManager,
+														IG::IslandSim* accurateIslandSim,
+														PxU64 contextID,
+														const bool enableStabilization,
+														const bool useEnhancedDeterminism,
+														const bool useAdaptiveForce
+														);
+	/**
+	\brief Destructor for DynamicsContext
+	*/
+	virtual								~DynamicsContext();
+
+
+	// Solver helper-methods
+	/**
+	\brief Computes the unconstrained velocity for a given PxsRigidBody
+	\param[in] atom The PxsRigidBody
+	*/
+	void								computeUnconstrainedVelocity(PxsRigidBody* atom)	const;
+
+	/**
+	\brief fills in a PxSolverConstraintDesc from an indexed interaction
+	\param[in,out] desc The PxSolverConstraintDesc
+	\param[in] constraint The PxsIndexedInteraction
+	*/
+	void								setDescFromIndices(PxSolverConstraintDesc& desc, 
+													  const PxsIndexedInteraction& constraint, const PxU32 solverBodyOffset);
+
+
+	void								setDescFromIndices(PxSolverConstraintDesc& desc, IG::EdgeIndex edgeIndex,
+											const IG::SimpleIslandManager& islandManager, PxU32* bodyRemapTable, const PxU32 solverBodyOffset);
+
+	/**
+	\brief Compute the unconstrained velocity for set of bodies in parallel. This function may spawn additional tasks.
+	\param[in] dt The timestep
+	\param[in] bodyArray The array of body cores
+	\param[in] originalBodyArray The array of PxsRigidBody
+	\param[in] nodeIndexArray The array of island node index
+	\param[in] bodyCount The number of bodies
+	\param[out] solverBodyPool The pool of solver bodies. These are synced with the corresponding body in bodyArray.
+	\param[out] solverBodyDataPool The pool of solver body data. These are synced with the corresponding body in bodyArray
+	\param[out] motionVelocityArray The motion velocities for the bodies
+	\param[out] maxSolverPositionIterations The maximum number of position iterations requested by any body in the island
+	\param[out] maxSolverVelocityIterations The maximum number of velocity iterations requested by any body in the island
+	\param[out] integrateTask The continuation task for any tasks spawned by this function.
+	*/
+	void								preIntegrationParallel(
+											   const PxF32 dt,
+											   PxsBodyCore*const* bodyArray,					// INOUT: core body attributes
+											   PxsRigidBody*const* originalBodyArray,			// IN: original body atom names (LEGACY - DON'T deref the ptrs!!)
+											   PxU32 const* nodeIndexArray,						// IN: island node index
+											   PxU32 bodyCount,									// IN: body count
+											   PxSolverBody* solverBodyPool,					// IN: solver atom pool (space preallocated)
+											   PxSolverBodyData* solverBodyDataPool,
+											   Cm::SpatialVector* motionVelocityArray,			// OUT: motion velocities
+											   PxU32& maxSolverPositionIterations,
+											   PxU32& maxSolverVelocityIterations,
+											   PxBaseTask& integrateTask
+											   );
+
+	/**
+	\brief Solves an island in parallel.
+
+	\param[in] params Solver parameter structure
+	*/
+
+	void								solveParallel(SolverIslandParams& params, IG::IslandSim& islandSim);
+
+	
+
+	void								integrateCoreParallel(SolverIslandParams& params, IG::IslandSim& islandSim);
+
+
+
+
+	/**
+	\brief Resets the thread contexts
+	*/
+	void									resetThreadContexts();
+
+	/**
+	\brief Returns the scratch memory allocator.
+	\return The scratch memory allocator.
+	*/
+	PX_FORCE_INLINE PxcScratchAllocator&	getScratchAllocator() { return mScratchAllocator; }
+
+	//Data
+
+	/**
+	\brief Body to represent the world static body.
+	*/
+	PX_ALIGN(16, PxSolverBody				mWorldSolverBody);
+	/**
+	\brief Body data to represent the world static body.
+	*/
+	PX_ALIGN(16, PxSolverBodyData			mWorldSolverBodyData);
+
+	/**
+	\brief A thread context pool
+	*/
+	PxcThreadCoherentCache<ThreadContext, PxcNpMemBlockPool> mThreadContextPool;
+
+	/**
+	\brief Solver constraint desc array
+	*/
+	SolverConstraintDescPool	mSolverConstraintDescPool;
+
+	/**
+	\brief Ordered solver constraint desc array (after partitioning)
+	*/
+	SolverConstraintDescPool	mOrderedSolverConstraintDescPool;
+
+	/**
+	\brief A temporary array of constraint descs used for partitioning
+	*/
+	SolverConstraintDescPool	mTempSolverConstraintDescPool;
+
+	/**
+	\brief An array of contact constraint batch headers
+	*/
+	Ps::Array<PxConstraintBatchHeader> mContactConstraintBatchHeaders;
+
+	/**
+	\brief Array of motion velocities for all bodies in the scene.
+	*/
+	Ps::Array<Cm::SpatialVector> mMotionVelocityArray;
+
+	/**
+	\brief Array of body core pointers for all bodies in the scene.
+	*/
+	Ps::Array<PxsBodyCore*>	mBodyCoreArray;
+
+	/**
+	\brief Array of rigid body pointers for all bodies in the scene.
+	*/
+	Ps::Array<PxsRigidBody*> mRigidBodyArray;
+
+	/**
+	\brief Array of articulation pointers for all articulations in the scene.
+	*/
+	Ps::Array<Articulation*> mArticulationArray;
+
+	/**
+	\brief Global pool for solver bodies. Kinematic bodies are at the start, and then dynamic bodies
+	*/
+	SolverBodyPool			mSolverBodyPool;
+	/**
+	\brief Global pool for solver body data. Kinematic bodies are at the start, and then dynamic bodies
+	*/
+	SolverBodyDataPool		mSolverBodyDataPool;
+
+
+	ThresholdStream*		mExceededForceThresholdStream[2]; //stores the previous and the current exceeded-force threshold streams	
+
+	Ps::Array<PxU32>		mExceededForceThresholdStreamMask;
+
+	/**
+	\brief Interface to the solver core.
+	\note We currently only support PxsSolverCoreSIMD. Other cores may be added in future releases.
+	*/
+	SolverCore*				mSolverCore[PxFrictionType::eFRICTION_COUNT];
+
+	Ps::Array<PxU32>		mSolverBodyRemapTable;				//Remaps from the "active island" index to the index within a solver island
+
+	Ps::Array<PxU32>		mNodeIndexArray;					//island node index
+
+	Ps::Array<PxsIndexedContactManager> mContactList;
+	
+	/**
+	\brief The total number of kinematic bodies in the scene
+	*/
+	PxU32						mKinematicCount;
+
+	/**
+	\brief Atomic counter for the number of threshold stream elements.
+	*/
+	PxI32						mThresholdStreamOut;
+
+	
+	/** \brief Material manager pointer. NOTE(review): presumably the materialManager passed to create() - confirm in the .cpp. */
+	PxsMaterialManager*			mMaterialManager;
+	/** \brief Contact manager output iterator. NOTE(review): presumably captured from update()'s iter argument - confirm. */
+	PxsContactManagerOutputIterator mOutputIterator;
+	
+private:
+	// Private members: only this class and the friends listed at the end of the class can access them.
+	PxcScratchAllocator&						mScratchAllocator;
+	Cm::FlushPool&								mTaskPool;
+	PxTaskManager*								mTaskManager;
+	PxU32										mCurrentIndex; // index of the current exceeded force threshold stream (see mExceededForceThresholdStream)
+
+	PxU64										mContextID;
+
+	protected:
+	// Classes (and the free solveParallel function) granted access to this class's protected and private members.
+	friend class PxsSolverStartTask;
+	friend class PxsSolverAticulationsTask;
+	friend class PxsSolverSetupConstraintsTask;
+	friend class PxsSolverCreateFinalizeConstraintsTask;	
+	friend class PxsSolverConstraintPartitionTask;
+	friend class PxsSolverSetupSolveTask;
+	friend class PxsSolverIntegrateTask;
+	friend class PxsSolverEndTask;
+	friend class PxsSolverConstraintPostProcessTask;
+	friend class PxsForceThresholdTask;
+	friend class SolverArticulationUpdateTask;
+
+	friend void solveParallel(SOLVER_PARALLEL_METHOD_ARGS);
+};
+
+#if PX_VC 
+    #pragma warning(pop)
+#endif
+
+}
+}
+
+#endif //DY_DYNAMICS_H
diff --git a/PhysX_3.4/Source/LowLevelDynamics/src/DyFrictionCorrelation.cpp b/PhysX_3.4/Source/LowLevelDynamics/src/DyFrictionCorrelation.cpp
new file mode 100644
index 00000000..ba7c2b1d
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelDynamics/src/DyFrictionCorrelation.cpp
@@ -0,0 +1,276 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+#include "PxvConfig.h"
+#include "DyCorrelationBuffer.h"
+#include "PxsMaterialManager.h"
+#include "PsUtilities.h"
+
+using namespace physx;
+using namespace Gu;
+
+namespace physx
+{
+
+namespace Dy
+{
+
+namespace 
+{
+// Resets 'patch' to a single-contact patch that starts at contact 'index', with next = 0 and the given material values.
+PX_FORCE_INLINE void initContactPatch(CorrelationBuffer::ContactPatchData& patch, PxU16 index, PxReal restitution, PxReal staticFriction, PxReal dynamicFriction, PxU8 flags)
+{
+	patch.restitution = restitution;
+	patch.staticFriction = staticFriction;
+	patch.dynamicFriction = dynamicFriction;
+	patch.flags = flags;
+	patch.next = 0;
+	patch.count = 1;
+	patch.start = index;
+}
+
+// Resets 'p' to an unbroken friction patch with no anchors; the world normal is stored rotated by each pose's rotateInv().
+PX_FORCE_INLINE void initFrictionPatch(FrictionPatch& p, const PxVec3& worldNormal, const PxTransform& body0Pose, const PxTransform& body1Pose, PxReal restitution, PxReal staticFriction, PxReal dynamicFriction, PxU8 materialFlags)
+{
+	p.broken = 0;
+	p.materialFlags = materialFlags;
+	p.anchorCount = 0;
+	p.restitution = restitution;
+	p.staticFriction = staticFriction;
+	p.dynamicFriction = dynamicFriction;
+	p.body0Normal = body0Pose.rotateInv(worldNormal);
+	p.body1Normal = body1Pose.rotateInv(worldNormal);
+}
+}
+
+
+bool createContactPatches(CorrelationBuffer& fb, const Gu::ContactPoint* cb, PxU32 contactCount, PxReal normalTolerance)
+{
+	// Appends contact patches for cb[0..contactCount) to fb.contactPatches. Consecutive contacts share a patch while
+	// their restitution, static friction and dynamic friction equal those of the patch's first contact and the dot
+	// product of their normal with that first contact's normal is >= normalTolerance.
+	// Returns false if the patch array already holds Gu::ContactBuffer::MAX_CONTACTS entries on entry, or when a
+	// further patch would be needed beyond that limit (fb.contactPatchCount is then left unchanged); true otherwise.
+	// PT: this rewritten version below doesn't have LHS
+	PxU32 patchCount = fb.contactPatchCount;
+	if(patchCount == Gu::ContactBuffer::MAX_CONTACTS)
+		return false;
+	if(contactCount>0)
+	{
+		const Gu::ContactPoint* PX_RESTRICT contacts = cb;
+		// Open the first patch on contact 0.
+		PxU32 firstInPatch = 0;
+		PxU8 runLength = 1;
+		CorrelationBuffer::ContactPatchData* openPatch = &fb.contactPatches[patchCount++];
+		initContactPatch(*openPatch, Ps::to16(0), contacts[0].restitution, contacts[0].staticFriction,
+			contacts[0].dynamicFriction, PxU8(contacts[0].materialFlags));
+
+		for(PxU32 i = 1; i<contactCount; i++)
+		{
+			const Gu::ContactPoint& cur = contacts[i];
+			const Gu::ContactPoint& first = contacts[firstInPatch];
+
+			const bool samePatch = cur.staticFriction == first.staticFriction
+				&& cur.dynamicFriction == first.dynamicFriction
+				&& cur.restitution == first.restitution
+				&& cur.normal.dot(first.normal)>=normalTolerance;
+			if(samePatch)
+			{
+				runLength++;
+				continue;
+			}
+
+			// Contact i starts a new patch; fail if there is no slot left for it.
+			if(patchCount == Gu::ContactBuffer::MAX_CONTACTS)
+				return false;
+			openPatch->count = runLength;	// close the patch that was open so far
+			firstInPatch = i;
+			runLength = 1;
+			openPatch = &fb.contactPatches[patchCount++];
+			initContactPatch(*openPatch, Ps::to16(i), cur.restitution, cur.staticFriction, cur.dynamicFriction, PxU8(cur.materialFlags));
+		}
+		if(runLength!=1)
+			openPatch->count = runLength;
+	}
+	fb.contactPatchCount = patchCount;
+	return true;
+}
+
+// Links each contact patch in [startContactPatchIndex, fb.contactPatchCount) to the first friction patch at index >=
+// startFrictionPatchIndex whose restitution/friction values match and whose world normal is within normalTolerance,
+// creating a new friction patch when none matches. The contact patch becomes the new head of that patch's correlation
+// list. Returns true if a new friction patch was needed after CorrelationBuffer::MAX_FRICTION_PATCHES was reached.
+bool correlatePatches(CorrelationBuffer& fb, const Gu::ContactPoint* cb, const PxTransform& bodyFrame0, const PxTransform& bodyFrame1,
+					  PxReal normalTolerance, PxU32 startContactPatchIndex, PxU32 startFrictionPatchIndex)
+{
+	PxU32 numFrictionPatches = fb.frictionPatchCount;
+	bool overflow = false;
+	for(PxU32 i=startContactPatchIndex;i<fb.contactPatchCount;i++)
+	{
+		CorrelationBuffer::ContactPatchData& contactPatch = fb.contactPatches[i];
+		const PxVec3 patchNormal = cb[contactPatch.start].normal;
+		PxU32 j = startFrictionPatchIndex;	// linear search for the first compatible friction patch
+		while(j<numFrictionPatches)
+		{
+			const FrictionPatch& candidate = fb.frictionPatches[j];
+			const bool incompatible = (patchNormal.dot(fb.frictionPatchWorldNormal[j]) < normalTolerance)
+				|| candidate.restitution != contactPatch.restitution
+				|| candidate.staticFriction != contactPatch.staticFriction
+				|| candidate.dynamicFriction != contactPatch.dynamicFriction;
+			if(!incompatible)
+				break;
+			j++;
+		}
+		if(j!=numFrictionPatches)
+		{
+			fb.frictionPatchContactCounts[j] += contactPatch.count;
+			contactPatch.next = Ps::to16(fb.correlationListHeads[j]);	// chain in front of the current list head
+		}
+		else
+		{
+			overflow |= j==CorrelationBuffer::MAX_FRICTION_PATCHES;
+			if(overflow)
+				continue;	// no room for a new friction patch: this contact patch stays uncorrelated
+			initFrictionPatch(fb.frictionPatches[j], patchNormal, bodyFrame0, bodyFrame1, contactPatch.restitution, contactPatch.staticFriction, contactPatch.dynamicFriction, contactPatch.flags);
+			fb.frictionPatchWorldNormal[j] = patchNormal;
+			fb.frictionPatchContactCounts[j] = contactPatch.count;
+			fb.contactID[j][0] = 0xffff;
+			fb.contactID[j][1] = 0xffff;
+			contactPatch.next = CorrelationBuffer::LIST_END;
+			numFrictionPatches++;
+		}
+		fb.correlationListHeads[j] = i;
+	}
+	fb.frictionPatchCount = numFrictionPatches;
+	return overflow;
+}
+
+// run over the friction patches, trying to find two anchors per patch. If we already have
+// anchors that are close, we keep them, which gives us persistent spring behavior
+
+void growPatches(CorrelationBuffer& fb, const ContactPoint* cb, const PxTransform& bodyFrame0, const PxTransform& bodyFrame1,
+				 PxReal	/*correlationDistance (unused)*/, PxU32 frictionPatchStartIndex, PxReal frictionOffsetThreshold)
+{
+	// For every friction patch from frictionPatchStartIndex on that has fewer than two anchors and a non-empty
+	// correlation list, walk the contacts of its contact patches and choose up to two anchors among the contacts
+	// whose separation is below frictionOffsetThreshold. Newly added anchors are written to fp.body0Anchors and
+	// fp.body1Anchors via bodyFrame0/bodyFrame1.transformInv(); the chosen contact indices go to fb.contactID.
+	// Squared minimum distance a contact must have from the first anchor to be accepted as the second one.
+	const PxReal minAnchorSpacingSq = 0.025f * 0.025f;
+	for(PxU32 i=frictionPatchStartIndex;i<fb.frictionPatchCount;i++)
+	{
+		FrictionPatch& fp = fb.frictionPatches[i];
+
+		// Skip patches that already have both anchors or that have no contact patches linked to them.
+		if(fp.anchorCount==2 || fb.correlationListHeads[i]==CorrelationBuffer::LIST_END)
+			continue;
+
+		PxVec3 worldAnchors[2];
+		PxU16 anchorCount = 0;
+		PxReal pointDistSq = 0.0f;
+		// if we have an anchor already, keep it
+		if(fp.anchorCount == 1)
+			worldAnchors[anchorCount++] = bodyFrame0.transform(fp.body0Anchors[0]);
+
+		// Visit every contact of every contact patch on this friction patch's correlation list.
+		PxU32 patch = fb.correlationListHeads[i];
+		while(patch!=CorrelationBuffer::LIST_END)
+		{
+			CorrelationBuffer::ContactPatchData& cp = fb.contactPatches[patch];
+			for(PxU16 j=0;j<cp.count;j++)
+			{
+				const ContactPoint& contact = cb[cp.start+j];
+				if(!(contact.separation < frictionOffsetThreshold))
+					continue;	// not an anchor candidate
+
+				const PxVec3& worldPoint = contact.point;
+				const PxU16 contactIndex = PxU16(cp.start+j);
+
+				if(anchorCount == 0)
+				{
+					// First candidate becomes anchor 0.
+					fb.contactID[i][0] = contactIndex;
+					worldAnchors[0] = worldPoint;
+					anchorCount++;
+				}
+				else if(anchorCount == 1)
+				{
+					// Accept a second anchor only if it is far enough from the first.
+					pointDistSq = (worldPoint-worldAnchors[0]).magnitudeSquared();
+					if(pointDistSq > minAnchorSpacingSq)
+					{
+						fb.contactID[i][1] = contactIndex;
+						worldAnchors[1] = worldPoint;
+						anchorCount++;
+					}
+				}
+				else
+				{
+					// Two anchors already: swap this point in for one of them if that spreads the pair further apart.
+					const PxReal dist0 = (worldPoint-worldAnchors[0]).magnitudeSquared();
+					const PxReal dist1 = (worldPoint-worldAnchors[1]).magnitudeSquared();
+					if(dist0 > dist1)
+					{
+						if(dist0 > pointDistSq)
+						{
+							fb.contactID[i][1] = contactIndex;
+							worldAnchors[1] = worldPoint;
+							pointDistSq = dist0;
+						}
+					}
+					else if(dist1 > pointDistSq)
+					{
+						fb.contactID[i][0] = contactIndex;
+						worldAnchors[0] = worldPoint;
+						pointDistSq = dist1;
+					}
+				}
+			}
+			patch = cp.next;
+		}
+
+		// add the new anchor(s) to the patch
+		for(PxU32 a = fp.anchorCount; a < anchorCount; a++)
+		{
+			fp.body0Anchors[a] = bodyFrame0.transformInv(worldAnchors[a]);
+			fp.body1Anchors[a] = bodyFrame1.transformInv(worldAnchors[a]);
+		}
+		// the block contact solver always reads at least one anchor per patch for performance reasons, even if there
+		// are no valid patches, so anchor 0 must be initialized in the unexpected case that no anchor was found
+		if(anchorCount==0)
+			fp.body0Anchors[0] = fp.body1Anchors[0] = PxVec3(0);
+		fp.anchorCount = anchorCount;
+	}
+}
+
+}
+
+}
+
diff --git a/PhysX_3.4/Source/LowLevelDynamics/src/DyFrictionPatch.h b/PhysX_3.4/Source/LowLevelDynamics/src/DyFrictionPatch.h
new file mode 100644
index 00000000..507e7f12
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelDynamics/src/DyFrictionPatch.h
@@ -0,0 +1,81 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+
+#ifndef PXC_FRICTIONPATCH_H
+#define PXC_FRICTIONPATCH_H
+
+#include "foundation/PxSimpleTypes.h"
+#include "foundation/PxVec3.h"
+#include "PxvConfig.h"
+
+namespace physx
+{
+
+namespace Dy
+{
+
+struct FrictionPatch
+{
+	PxU8				broken;				// PT: must be first byte of struct, see "frictionBrokenWritebackByte"
+	PxU8				materialFlags;
+	PxU16				anchorCount;		// number of anchors in use in body0Anchors/body1Anchors
+	PxReal				restitution;
+	PxReal				staticFriction;
+	PxReal				dynamicFriction;
+	PxVec3				body0Normal;
+	PxVec3				body1Normal;
+	PxVec3				body0Anchors[2];
+	PxVec3				body1Anchors[2];
+	// Copies every field from 'other', in declaration order. Unlike a conventional assignment operator it returns void.
+	PX_FORCE_INLINE	void	operator = (const FrictionPatch& other)
+	{
+		broken = other.broken;
+		materialFlags = other.materialFlags;
+		anchorCount = other.anchorCount;
+		restitution = other.restitution;
+		staticFriction = other.staticFriction;
+		dynamicFriction = other.dynamicFriction;
+		body0Normal = other.body0Normal;
+		body1Normal = other.body1Normal;
+		body0Anchors[0] = other.body0Anchors[0];
+		body1Anchors[0] = other.body1Anchors[0];
+		body0Anchors[1] = other.body0Anchors[1];
+		body1Anchors[1] = other.body1Anchors[1];
+	}
+};
+
+//PX_COMPILE_TIME_ASSERT(sizeof(FrictionPatch)==80);
+
+}
+
+}
+
+#endif
diff --git a/PhysX_3.4/Source/LowLevelDynamics/src/DyFrictionPatchStreamPair.h b/PhysX_3.4/Source/LowLevelDynamics/src/DyFrictionPatchStreamPair.h
new file mode 100644
index 00000000..8219918f
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelDynamics/src/DyFrictionPatchStreamPair.h
@@ -0,0 +1,128 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+
+#ifndef PXC_FRICTIONPATCHPOOL_H
+#define PXC_FRICTIONPATCHPOOL_H
+
+#include "foundation/PxSimpleTypes.h"
+#include "PxvConfig.h"
+#include "PsMutex.h"
+#include "PsArray.h"
+
+// Each narrow phase thread has an input stream of friction patches from the
+// previous frame and an output stream of friction patches which will be
+// saved for next frame. The patches persist for exactly one frame at which
+// point they get thrown away.
+
+
+// There is a stream pair per thread. A contact callback reserves space
+// for its friction patches and gets a cookie in return that it can stash
+// for next frame. Cookies are valid for one frame only.
+//
+// note that all friction patches reserved are guaranteed to be contiguous;
+// this might turn out to be a bit inefficient if we often have a large
+// number of friction patches
+
+#include "PxcNpMemBlockPool.h"
+
+namespace physx
+{
+
+class FrictionPatchStreamPair	// hands out friction-patch memory carved from blocks obtained from a PxcNpMemBlockPool
+{
+public:
+	FrictionPatchStreamPair(PxcNpMemBlockPool& blockPool);
+
+	// Returns NULL when no block could be acquired, or the sentinel (FrictionPatch*)(-1) when size exceeds PxcNpMemBlock::SIZE.
+	template<class FrictionPatch>
+	FrictionPatch*		reserve(const PxU32 size);
+	// Reinterprets ptr as a FrictionPatch pointer; no lookup or validation is performed.
+	template<class FrictionPatch>
+	const FrictionPatch* findInputPatches(const PxU8* ptr) const;
+	void					reset();		// forgets the current block; the next reserve() acquires a new one
+
+	PxcNpMemBlockPool& getBlockPool() { return mBlockPool;}
+private:
+	PxcNpMemBlockPool&	mBlockPool;		// source of friction blocks
+	PxcNpMemBlock*		mBlock;			// block currently being filled, NULL if none
+	PxU32				mUsed;			// offset into mBlock->data that has already been handed out
+
+	FrictionPatchStreamPair& operator=(const FrictionPatchStreamPair&);	// private, no definition in this header
+};
+
+PX_FORCE_INLINE FrictionPatchStreamPair::FrictionPatchStreamPair(PxcNpMemBlockPool& blockPool):
+  mBlockPool(blockPool), mBlock(NULL), mUsed(0)	// starts without a block; the first reserve() acquires one from blockPool
+{
+}
+
+PX_FORCE_INLINE void FrictionPatchStreamPair::reset()	// back to the freshly constructed state
+{
+	mUsed = 0;
+	mBlock = NULL;		// only the pointer is dropped here; nothing is handed back to mBlockPool by this call
+}
+
+// Hands out 'size' units of friction memory. Fails with NULL (no block available) or with (FrictionPatch*)(-1) (size can never fit).
+template <class FrictionPatch>
+FrictionPatch* FrictionPatchStreamPair::reserve(const PxU32 size)
+{
+	// A request larger than one block can never succeed; callers must test for this sentinel as well as for NULL.
+	if(size>PxcNpMemBlock::SIZE)
+	{
+		return reinterpret_cast<FrictionPatch*>(-1);
+	}
+
+	FrictionPatch* ptr = NULL;
+
+	// A reservation never straddles two blocks: switch to a new block when the current one cannot hold it.
+	if(mBlock == NULL || mUsed + size > PxcNpMemBlock::SIZE)
+	{
+		mBlock = mBlockPool.acquireFrictionBlock();
+		mUsed = 0;
+	}
+
+	if(mBlock)	// still NULL here when acquireFrictionBlock() failed; ptr then stays NULL
+	{
+		ptr = reinterpret_cast<FrictionPatch*>(mBlock->data+mUsed);
+		mUsed += size;
+	}
+
+	return ptr;
+}
+
+template <class FrictionPatch>
+const FrictionPatch* FrictionPatchStreamPair::findInputPatches(const PxU8* ptr) const
+{
+	return reinterpret_cast<const FrictionPatch*>(ptr);	// pure reinterpretation: ptr is neither checked nor looked up
+}
+
+}
+
+#endif
diff --git a/PhysX_3.4/Source/LowLevelDynamics/src/DyRigidBodyToSolverBody.cpp b/PhysX_3.4/Source/LowLevelDynamics/src/DyRigidBodyToSolverBody.cpp
new file mode 100644
index 00000000..c6c66e8a
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelDynamics/src/DyRigidBodyToSolverBody.cpp
@@ -0,0 +1,107 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+#include "CmUtils.h"
+#include "DySolverBody.h"
+#include "PxsRigidBody.h"
+#include "PxvDynamics.h"
+
+namespace physx
+{
+
+namespace Dy
+{
+//Per-component square root of v; a component that compares equal to 0 produces 0 without calling PxSqrt.
+//This is a bit of a hack, but it allows kinematic objects' velocities to be represented in the new format.
+PX_FORCE_INLINE PxVec3 computeSafeSqrtInertia(const PxVec3& v)
+{
+	return PxVec3(v.x != 0.f ? PxSqrt(v.x) : 0.f, v.y != 0.f ? PxSqrt(v.y) : 0.f, v.z != 0.f ? PxSqrt(v.z) : 0.f);
+}
+
+void copyToSolverBodyData(const PxVec3& linearVelocity, const PxVec3& angularVelocity, const PxReal invMass, const PxVec3& invInertia, const PxTransform& globalPose,
+	const PxReal maxDepenetrationVelocity, const PxReal maxContactImpulse, const PxU32 nodeIndex, const PxReal reportThreshold, PxSolverBodyData& data, PxU32 lockFlags)
+{
+	data.nodeIndex = nodeIndex;
+	// Everything in this function writes into 'data'; the other arguments are only read.
+	PxVec3 safeSqrtInvInertia = computeSafeSqrtInertia(invInertia);
+	// The square-rooted inverse inertia is combined with the pose rotation (presumably rotating it to world space - see Cm::transformInertiaTensor).
+	PxMat33 rotation(globalPose.q);
+
+	Cm::transformInertiaTensor(safeSqrtInvInertia, rotation, data.sqrtInvInertia);
+
+	// Copy simple properties
+	data.linearVelocity = linearVelocity;
+	data.angularVelocity = angularVelocity;
+	// Zero the velocity component of every locked axis (PxRigidDynamicLockFlag bits in lockFlags).
+	if (lockFlags)
+	{
+		if (lockFlags & PxRigidDynamicLockFlag::eLOCK_LINEAR_X)
+			data.linearVelocity.x = 0.f;
+		if (lockFlags & PxRigidDynamicLockFlag::eLOCK_LINEAR_Y)
+			data.linearVelocity.y = 0.f;
+		if (lockFlags & PxRigidDynamicLockFlag::eLOCK_LINEAR_Z)
+			data.linearVelocity.z = 0.f;
+
+		//KS - technically, we can zero the inertia columns and produce stiffer constraints. However, this can cause numerical issues with the 
+		//joint solver, which is fixed by disabling joint preprocessing and setting minResponseThreshold to some reasonable value > 0. However, until
+		//this is handled automatically, it's probably better not to zero these inertia rows
+		if (lockFlags & PxRigidDynamicLockFlag::eLOCK_ANGULAR_X)
+		{
+			data.angularVelocity.x = 0.f;
+			//data.sqrtInvInertia.column0 = PxVec3(0.f);
+		}
+		if (lockFlags & PxRigidDynamicLockFlag::eLOCK_ANGULAR_Y)
+		{
+			data.angularVelocity.y = 0.f;
+			//data.sqrtInvInertia.column1 = PxVec3(0.f);
+		}
+		if (lockFlags & PxRigidDynamicLockFlag::eLOCK_ANGULAR_Z)
+		{
+			data.angularVelocity.z = 0.f;
+			//data.sqrtInvInertia.column2 = PxVec3(0.f);
+		}
+	}
+	// NOTE(review): these asserts validate the caller's inputs, not the (possibly zeroed) copies stored in data.
+
+	PX_ASSERT(linearVelocity.isFinite());
+	PX_ASSERT(angularVelocity.isFinite());
+
+	data.invMass = invMass;
+	data.penBiasClamp = maxDepenetrationVelocity;
+	data.maxContactImpulse = maxContactImpulse;
+	data.body2World = globalPose;
+	data.lockFlags = lockFlags;
+
+	data.reportThreshold = reportThreshold;
+}
+
+}
+
+}
diff --git a/PhysX_3.4/Source/LowLevelDynamics/src/DySolverBody.h b/PhysX_3.4/Source/LowLevelDynamics/src/DySolverBody.h
new file mode 100644
index 00000000..566f1ca1
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelDynamics/src/DySolverBody.h
@@ -0,0 +1,60 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+#ifndef DY_SOLVERATOM_H
+#define DY_SOLVERATOM_H
+
+#include "foundation/PxVec3.h"
+#include "foundation/PxTransform.h"
+#include "foundation/PxMat33.h"
+#include "CmPhysXCommon.h"
+#include "CmSpatialVector.h"
+#include "solver/PxSolverDefs.h"
+
+namespace physx
+{
+
+class PxsRigidBody;
+struct PxsBodyCore;
+
+namespace Dy
+{
+
+//void copyToSolverBodyData(PxSolverBodyData& data, const PxsBodyCore& core, const PxU32 nodeIndex);
+
+// Fills solverBodyData from the given body state; lockFlags holds PxRigidDynamicLockFlag bits. Defined in DyRigidBodyToSolverBody.cpp.
+void copyToSolverBodyData(const PxVec3& linearVelocity, const PxVec3& angularVelocity, const PxReal invMass, const PxVec3& invInertia, const PxTransform& globalPose,
+	const PxReal maxDepenetrationVelocity, const PxReal maxContactImpulse, const PxU32 nodeIndex, const PxReal reportThreshold, PxSolverBodyData& solverBodyData, PxU32 lockFlags);
+
+}
+
+}
+
+#endif //DY_SOLVERATOM_H
diff --git a/PhysX_3.4/Source/LowLevelDynamics/src/DySolverConstraint1D.h b/PhysX_3.4/Source/LowLevelDynamics/src/DySolverConstraint1D.h
new file mode 100644
index 00000000..4291530b
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelDynamics/src/DySolverConstraint1D.h
@@ -0,0 +1,203 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+#ifndef DY_SOLVER_CONSTRAINT_1D_H
+#define DY_SOLVER_CONSTRAINT_1D_H
+
+#include "foundation/PxVec3.h"
+#include "PxvConfig.h"
+#include "DyArticulationUtils.h"
+#include "DySolverConstraintTypes.h"
+#include "DySolverBody.h"
+#include "PxConstraintDesc.h"
+#include "DySolverConstraintDesc.h"
+
+namespace physx
+{
+
+namespace Dy
+{
+
+// dsequeira: we should probably fork these structures for constraints and extended constraints,
+// since there's a few things that are used for one but not the other
+
+struct SolverConstraint1DHeader	// header placed ahead of 'count' 1D constraint rows; its size is asserted to be 48 bytes
+{
+	PxU8	type;			// enum SolverConstraintType - must be first byte
+	PxU8	count;			// count of following 1D constraints
+	PxU8	dominance;		// set to 0 by init()
+	PxU8	breakable;		// indicate whether this constraint is breakable or not						
+	// None of the next five members is written by init(); the code that builds the constraint must fill them in.
+	PxReal	linBreakImpulse;
+	PxReal	angBreakImpulse;
+	PxReal	invMass0D0;					// presumably body 0 inverse mass scaled by its dominance - TODO confirm
+	PxVec3	body0WorldOffset;
+	PxReal	invMass1D1;					// presumably body 1 inverse mass scaled by its dominance - TODO confirm
+	PxReal	linearInvMassScale0;		// only used by articulations
+	PxReal	angularInvMassScale0;		// only used by articulations
+	PxReal	linearInvMassScale1;		// only used by articulations
+	PxReal	angularInvMassScale1;		// only used by articulations
+};
+
+PX_COMPILE_TIME_ASSERT(sizeof(SolverConstraint1DHeader) == 48);
+
+PX_ALIGN_PREFIX(16)
+struct SolverConstraint1D 	// one 1D constraint row; 16-byte aligned and asserted to be 96 bytes, laid out as six 16-byte groups
+{
+public:
+	PxVec3		lin0;					//!< linear velocity projection (body 0)	
+	PxReal		constant;				//!< constraint constant term
+
+	PxVec3		lin1;					//!< linear velocity projection (body 1)
+	PxReal		unbiasedConstant;		//!< constraint constant term without bias
+
+	PxVec3		ang0;					//!< angular velocity projection (body 0)
+	PxReal		velMultiplier;			//!< constraint velocity multiplier
+
+	PxVec3		ang1;					//!< angular velocity projection (body 1)
+	PxReal		impulseMultiplier;		//!< constraint impulse multiplier
+
+	PxVec3		ang0Writeback;			//!< unscaled angular velocity projection (body 0)
+	PxU32		pad;					//!< filler completing the 16-byte group started by ang0Writeback
+
+	PxReal		minImpulse;				//!< Lower bound on impulse magnitude	 
+	PxReal		maxImpulse;				//!< Upper bound on impulse magnitude
+	PxReal		appliedForce;			//!< applied force to correct velocity+bias
+	PxU32		flags;					//!< cleared to 0 by init(); bit meanings are not defined in this header
+} PX_ALIGN_SUFFIX(16); 	
+
+PX_COMPILE_TIME_ASSERT(sizeof(SolverConstraint1D) == 96);
+
+
+struct SolverConstraint1DExt : public SolverConstraint1D	// row variant with two extra spatial vectors (asserted 160 bytes in total)
+{
+public:
+	Cm::SpatialVectorV deltaVA;	// presumably the velocity change of body A per unit impulse on this row - TODO confirm
+	Cm::SpatialVectorV deltaVB;	// presumably the same for body B - TODO confirm
+};
+
+PX_COMPILE_TIME_ASSERT(sizeof(SolverConstraint1DExt) == 160);
+
+
+PX_FORCE_INLINE void init(SolverConstraint1DHeader& h, 
+						  PxU8 count, 
+						  bool isExtended,
+						  const PxConstraintInvMassScale& ims)	// breakable, break impulses, invMass* and body0WorldOffset are left untouched
+{
+	h.linearInvMassScale0	= ims.linear0;
+	h.linearInvMassScale1	= -ims.linear1;		// body 1's scales are stored negated
+	h.angularInvMassScale0	= ims.angular0;
+	h.angularInvMassScale1	= -ims.angular1;
+	h.dominance		= 0;
+	h.count			= count;
+	h.type			= isExtended ? PxU8(DY_SC_TYPE_EXT_1D) : PxU8(DY_SC_TYPE_RB_1D);	// extended rows vs plain rigid-body rows
+}
+
+PX_FORCE_INLINE void init(SolverConstraint1D& c,
+						  const PxVec3& _linear0, const PxVec3& _linear1, 
+						  const PxVec3& _angular0, const PxVec3& _angular1,
+						  PxReal _minImpulse, PxReal _maxImpulse)
+{
+	PX_ASSERT(_linear0.isFinite() && _linear1.isFinite());
+	// Sets the axes and impulse bounds and clears flags/appliedForce; constant, multipliers and ang0Writeback are not written here.
+	c.appliedForce			= 0;
+	c.flags					= 0;
+	c.minImpulse			= _minImpulse;
+	c.maxImpulse			= _maxImpulse;
+	c.ang0					= _angular0;
+	c.ang1					= _angular1;
+	c.lin0					= _linear0;
+	c.lin1					= _linear1;
+}
+
+PX_FORCE_INLINE bool needsNormalVel(const Px1DConstraint &c)
+{
+	const bool accelSpring = (c.flags & Px1DConstraintFlag::eSPRING) != 0 && (c.flags & Px1DConstraintFlag::eACCELERATION_SPRING) != 0;
+	return (c.flags & Px1DConstraintFlag::eRESTITUTION) != 0 || accelSpring;	// restitution row, or spring row flagged as acceleration spring
+}
+
+PX_FORCE_INLINE void setSolverConstants(PxReal& constant,				// out: constant term, including the error/bias contribution
+										PxReal& unbiasedConstant,		// out: equals constant except for non-spring, non-bounce rows (see below)
+										PxReal& velMultiplier,			// out
+										PxReal& impulseMultiplier,		// out: 1 for non-spring rows
+										const Px1DConstraint& c,
+										PxReal normalVel,				// only read by the restitution test
+										PxReal unitResponse,			// must be finite (asserted)
+										PxReal minRowResponse,			// responses at or below this are treated as zero response
+										PxReal erp,						// scales c.geometricError
+										PxReal dt,						// used by spring rows only
+										PxReal recipdt)					// used by non-spring rows only
+{
+	PX_ASSERT(PxIsFinite(unitResponse));
+	PxReal recipResponse = unitResponse <= minRowResponse ? 0 : 1.0f/unitResponse;
+	PxReal geomError = c.geometricError * erp;
+	// Spring rows: a and b fold stiffness, damping and the time step; the two variants differ in where unitResponse enters.
+	if(c.flags & Px1DConstraintFlag::eSPRING)
+	{
+		PxReal a = dt * dt * c.mods.spring.stiffness + dt * c.mods.spring.damping;
+		PxReal b = dt * (c.mods.spring.damping * c.velocityTarget - c.mods.spring.stiffness * geomError);
+
+		if(c.flags & Px1DConstraintFlag::eACCELERATION_SPRING)
+		{	
+			PxReal x = 1.0f/(1.0f+a);
+			constant = unbiasedConstant = x * recipResponse * b;
+			velMultiplier = -x * recipResponse * a;
+			impulseMultiplier = 1.0f-x;
+		}
+		else
+		{
+			PxReal x = 1.0f/(1.0f+a*unitResponse);
+			constant = unbiasedConstant = x * b;
+			velMultiplier = -x*a;
+			impulseMultiplier = 1.0f-x;
+		}
+	}
+	else
+	{
+		velMultiplier = -recipResponse;
+		impulseMultiplier = 1.0f;
+		// Bounce only when the restitution flag is set and -normalVel exceeds the bounce velocity threshold.
+		if(c.flags & Px1DConstraintFlag::eRESTITUTION && -normalVel>c.mods.bounce.velocityThreshold)
+		{
+			unbiasedConstant = constant = recipResponse * c.mods.bounce.restitution*-normalVel;
+		}
+		else
+		{
+			// see usage of 'for internal use' in preprocessRows()
+			constant = recipResponse * (c.velocityTarget - geomError*recipdt);
+			unbiasedConstant = recipResponse * (c.velocityTarget - c.forInternalUse*recipdt);
+		}
+	}
+}
+
+}
+}
+
+#endif //DY_SOLVER_CONSTRAINT_1D_H
diff --git a/PhysX_3.4/Source/LowLevelDynamics/src/DySolverConstraint1D4.h b/PhysX_3.4/Source/LowLevelDynamics/src/DySolverConstraint1D4.h
new file mode 100644
index 00000000..833f7934
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelDynamics/src/DySolverConstraint1D4.h
@@ -0,0 +1,106 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+#ifndef DY_SOLVERCONSTRAINT1D4_H
+#define DY_SOLVERCONSTRAINT1D4_H
+
+#include "foundation/PxVec3.h"
+#include "PxvConfig.h"
+#include "DyArticulationUtils.h"
+#include "DySolverConstraint1D.h"
+
+namespace physx
+{
+
+namespace Dy
+{
+
+struct SolverConstraint1DHeader4	// header for a batch of four constraints processed together; Vec4V members hold one value per constraint
+{
+	PxU8	type;			// enum SolverConstraintType - must be first byte
+	PxU8	pad0[3];	
+	//These counts are the max of the 4 sets of data.
+	//When certain pairs have fewer constraints than others, they are padded with 0s so that no work is performed but 
+	//calculations are still shared (after all, they're computationally free because we're doing 4 things at a time in SIMD)
+	PxU32	count;
+	PxU8	count0, count1, count2, count3;		// presumably the individual row counts of the four constraints - TODO confirm
+	PxU8	break0, break1, break2, break3;		// presumably per-constraint breakable flags - TODO confirm
+
+	Vec4V	linBreakImpulse;
+	Vec4V	angBreakImpulse;
+	Vec4V	invMass0D0;
+	Vec4V	invMass1D1;
+	Vec4V	angD0;
+	Vec4V	angD1;
+	// NOTE(review): "Work" here looks like a misspelling of "World" (cf. SolverConstraint1DHeader::body0WorldOffset) - confirm before renaming.
+	Vec4V	body0WorkOffsetX;
+	Vec4V	body0WorkOffsetY;
+	Vec4V	body0WorkOffsetZ;
+};
+
+struct SolverConstraint1DBase4 	// one row for four constraints at once, stored component-wise; asserted to be 272 bytes
+{
+public:
+	Vec4V		lin0X;				// X/Y/Z members hold one vector component for each of the four constraints
+	Vec4V		lin0Y;
+	Vec4V		lin0Z;
+	Vec4V		ang0X;
+	Vec4V		ang0Y;
+	Vec4V		ang0Z;
+	Vec4V		ang0WritebackX;
+	Vec4V		ang0WritebackY;
+	Vec4V		ang0WritebackZ;
+	Vec4V		constant;			// the scalar members mirror the same-named fields of SolverConstraint1D
+	Vec4V		unbiasedConstant;
+	Vec4V		velMultiplier;
+	Vec4V		impulseMultiplier;
+	Vec4V		minImpulse;
+	Vec4V		maxImpulse;
+	Vec4V		appliedForce;
+	PxU32		flags[4];			// one flags word per constraint
+};
+
+PX_COMPILE_TIME_ASSERT(sizeof(SolverConstraint1DBase4) == 272);
+
+struct SolverConstraint1DDynamic4 : public SolverConstraint1DBase4	// adds the body-1 axes to the base row (asserted 368 bytes)
+{
+	Vec4V		lin1X;		// body-1 linear axis, one component per member, four constraints per Vec4V
+	Vec4V		lin1Y;
+	Vec4V		lin1Z;
+	Vec4V		ang1X;		// body-1 angular axis, same layout
+	Vec4V		ang1Y;
+	Vec4V		ang1Z;
+};
+PX_COMPILE_TIME_ASSERT(sizeof(SolverConstraint1DDynamic4) == 368);
+
+}
+
+}
+
+#endif //DY_SOLVERCONSTRAINT1D4_H
diff --git a/PhysX_3.4/Source/LowLevelDynamics/src/DySolverConstraintDesc.h b/PhysX_3.4/Source/LowLevelDynamics/src/DySolverConstraintDesc.h
new file mode 100644
index 00000000..e74b0374
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelDynamics/src/DySolverConstraintDesc.h
@@ -0,0 +1,141 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+#ifndef DY_SOLVERCONSTRAINTDESC_H
+#define DY_SOLVERCONSTRAINTDESC_H
+
+#include "PxvConfig.h"
+#include "DySolverConstraintTypes.h"
+#include "PsUtilities.h"
+#include "PxConstraintDesc.h"
+#include "solver/PxSolverDefs.h"
+
+namespace physx
+{
+
+struct PxcNpWorkUnit;
+
+struct PxsContactManagerOutput;
+
+namespace Cm
+{
+	class SpatialVector;
+}
+
+struct PxSolverBody;
+struct PxSolverBodyData;
+
+namespace Dy
+{
+
+struct FsData;
+
+
+
+
+// dsequeira: moved this articulation stuff here to sever a build dep on Articulation.h through DyThreadContext.h and onward
+
+struct SelfConstraintBlock	// articulation-related record (see the dsequeira note above); field meanings below are inferred from names
+{
+	PxU32	startId;				// presumably index of the first self constraint - TODO confirm
+	PxU32	numSelfConstraints;	
+	PxU16	fsDataLength;		
+	PxU16	requiredSolverProgress;
+	uintptr_t eaFsData;				// address held as an integer rather than a typed pointer; presumably refers to an FsData - TODO confirm
+};
+
+//This class rolls together multiple contact managers into a single contact manager.
+struct CompoundContactManager
+{
+	PxU32 mStartIndex;				// NOTE(review): mStartIndex/mStride presumably locate the merged pairs in some array - confirm at the use site
+	PxU16 mStride;
+	PxU16 mReducedContactCount;
+
+	PxcNpWorkUnit* unit;			//This is a work unit but the contact buffer has been adjusted to contain all the contacts for all the subsequent pairs
+	PxsContactManagerOutput* cmOutput;
+	PxU8* originalContactPatches;	//This is the original contact buffer that we replaced with a combined buffer	
+	PxU8* originalContactPoints;	// presumably the original contact point buffer, saved alongside originalContactPatches - TODO confirm
+	PxU8 originalContactCount;		// the three "original" bytes presumably hold the values that were overwritten by the merge - TODO confirm
+	PxU8 originalPatchCount;
+	PxU8 originalStatusFlags;
+	PxReal* originalForceBuffer;	//This is the original force buffer that we replaced with a combined force buffer
+	PxU16* forceBufferList;			//This is a list of indices from the reduced force buffer to the original force buffers - we need this to fix up the write-backs from the solver	
+};
+
+struct SolverConstraintPrepState	// struct used only as a scope for Enum, so values are spelled SolverConstraintPrepState::eXXX
+{
+enum Enum 
+{
+	eOUT_OF_MEMORY,		// value 0
+	eUNBATCHABLE,		// value 1; presumably "cannot be processed as part of a batch" - TODO confirm against the prep code
+	eSUCCESS			// value 2
+};
+};
+
+PX_FORCE_INLINE bool isArticulationConstraint(const PxSolverConstraintDesc& desc)
+{
+	// Only a descriptor with NO_LINK on both sides is not an articulation constraint.
+	return !(desc.linkIndexA == PxSolverConstraintDesc::NO_LINK && desc.linkIndexB == PxSolverConstraintDesc::NO_LINK);
+}
+
+
+// Stores constraintLength in desc divided by 16; it must be a multiple of 16 and at most PX_MAX_U16 * 16.
+PX_FORCE_INLINE void setConstraintLength(PxSolverConstraintDesc& desc, const PxU32 constraintLength)
+{
+	PX_ASSERT((constraintLength % 16) == 0 && constraintLength <= PX_MAX_U16 * 16);
+	desc.constraintLengthOver16 = Ps::to16(constraintLength / 16);
+}
+
+// Stores writeBackLength in desc divided by 4; it must be a multiple of 4 and at most PX_MAX_U16 * 4.
+PX_FORCE_INLINE void setWritebackLength(PxSolverConstraintDesc& desc, const PxU32 writeBackLength)
+{
+	PX_ASSERT((writeBackLength % 4) == 0 && writeBackLength <= PX_MAX_U16 * 4);
+	desc.writeBackLengthOver4 = Ps::to16(writeBackLength / 4);
+}
+
+PX_FORCE_INLINE PxU32 getConstraintLength(const PxSolverConstraintDesc& desc)
+{
+	return PxU32(desc.constraintLengthOver16) * 16u;	// undoes the division by 16 done in setConstraintLength
+}
+
+PX_FORCE_INLINE PxU32 getWritebackLength(const PxSolverConstraintDesc& desc)
+{
+	return PxU32(desc.writeBackLengthOver4) * 4u;	// undoes the division by 4 done in setWritebackLength
+}
+
+PX_COMPILE_TIME_ASSERT(0 == (0x0f & sizeof(PxSolverConstraintDesc)));
+
+#define MAX_PERMITTED_SOLVER_PROGRESS 0xFFFF
+
+}
+
+}
+
+#endif //DY_SOLVERCONSTRAINTDESC_H
diff --git a/PhysX_3.4/Source/LowLevelDynamics/src/DySolverConstraintExtShared.h b/PhysX_3.4/Source/LowLevelDynamics/src/DySolverConstraintExtShared.h
new file mode 100644
index 00000000..2c2f59f9
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelDynamics/src/DySolverConstraintExtShared.h
@@ -0,0 +1,116 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+#ifndef DY_SOLVER_CONSTRAINT_EXT_SHARED_H
+#define DY_SOLVER_CONSTRAINT_EXT_SHARED_H
+
+#include "foundation/PxPreprocessor.h"
+#include "PsVecMath.h"
+#include "DyArticulationContactPrep.h"
+#include "DySolverConstraintDesc.h"
+#include "DySolverConstraint1D.h"
+#include "DySolverContact.h"
+#include "DySolverContactPF.h"
+#include "DyArticulationHelper.h"
+#include "PxcNpWorkUnit.h"
+#include "PxsMaterialManager.h"
+#include "PxsMaterialCombiner.h"
+
+namespace physx
+{
+namespace Dy
+{
+	PX_FORCE_INLINE void setupExtSolverContact(const SolverExtBody& b0, const SolverExtBody& b1, 
+		const PxF32 d0, const PxF32 d1, const PxF32 angD0, const PxF32 angD1, const PxTransform& bodyFrame0, const PxTransform& bodyFrame1,
+		const Vec3VArg normal, const FloatVArg invDt, const FloatVArg invDtp8, const FloatVArg restDistance, const FloatVArg maxPenBias,  const FloatVArg restitution,
+		const FloatVArg bounceThreshold, const Gu::ContactPoint& contact, SolverContactPointExt& solverContact, const FloatVArg ccdMaxSeparation)
+	{
+		const FloatV zero = FZero();
+		const FloatV separation = FLoad(contact.separation);
+		// Fills solverContact for one contact point between b0 and b1. penetration is the separation measured relative to restDistance.
+		const FloatV penetration = FSub(separation, restDistance);
+		// Offsets from each body frame origin to the contact point, and their cross products with the contact normal.
+		const PxVec3 ra = contact.point - bodyFrame0.p;
+		const PxVec3 rb = contact.point - bodyFrame1.p;
+
+		const PxVec3 raXn = ra.cross(contact.normal);
+		const PxVec3 rbXn = rb.cross(contact.normal);
+		// deltaV0/deltaV1 are left uninitialized here; they are passed to getImpulseResponse below and read afterwards.
+		Cm::SpatialVector deltaV0, deltaV1;
+
+		const Cm::SpatialVector resp0 = createImpulseResponseVector(contact.normal, raXn, b0);
+		const Cm::SpatialVector resp1 = createImpulseResponseVector(-contact.normal, -rbXn, b1);
+
+		const FloatV unitResponse = FLoad(getImpulseResponse(b0, resp0, deltaV0, d0, angD0,
+															 b1, resp1, deltaV1, d1, angD1));
+
+		const FloatV vel0 = FLoad(b0.projectVelocity(contact.normal, raXn));
+		const FloatV vel1 = FLoad(b1.projectVelocity(contact.normal, rbXn));
+		const FloatV vrel = FSub(vel0, vel1);
+		// A unit response of exactly zero yields a zero multiplier instead of a reciprocal.
+		FloatV velMultiplier = FSel(FIsEq(unitResponse, zero), zero, FRecip(unitResponse));
+		FloatV scaledBias = FMul(velMultiplier, FMax(maxPenBias, FMul(penetration, invDtp8)));
+		const FloatV penetrationInvDt = FMul(penetration, invDt);
+		// Bounce test: restitution > 0, vrel below bounceThreshold, and -vrel greater than penetration*invDt.
+		const BoolV isGreater2 = BAnd(BAnd(FIsGrtr(restitution, zero), FIsGrtr(bounceThreshold, vrel)), FIsGrtr(FNeg(vrel), penetrationInvDt));
+
+		const BoolV ccdSeparationCondition = FIsGrtrOrEq(ccdMaxSeparation, penetration);
+		// The bias is dropped when the contact bounces and penetration <= ccdMaxSeparation.
+		scaledBias = FSel(BAnd(ccdSeparationCondition, isGreater2), zero, scaledBias);
+
+		FloatV targetVelocity = FSel(isGreater2, FMul(FNeg(vrel), restitution), zero);
+
+		//Get the rigid body's current velocity and embed into the constraint target velocities
+		if(b0.mLinkIndex == PxSolverConstraintDesc::NO_LINK)
+			targetVelocity = FSub(targetVelocity, vel0);
+		else if(b1.mLinkIndex == PxSolverConstraintDesc::NO_LINK)
+			targetVelocity = FAdd(targetVelocity, vel1);
+
+		targetVelocity = FAdd(targetVelocity, V3Dot(V3LoadA(contact.targetVel), normal));
+		// unbiasedErr uses no bias when bouncing; otherwise only the positive part of scaledBias is subtracted.
+		const FloatV biasedErr = FScaleAdd(targetVelocity, velMultiplier, FNeg(scaledBias));
+		const FloatV unbiasedErr = FScaleAdd(targetVelocity, velMultiplier, FSel(isGreater2, zero, FNeg(FMax(scaledBias, zero))));
+
+
+		FStore(velMultiplier, &solverContact.velMultiplier);
+		FStore(biasedErr, &solverContact.biasedErr);
+		FStore(unbiasedErr, &solverContact.unbiasedErr);
+		solverContact.maxImpulse = contact.maxImpulse;
+		// Angular parts come from the response vectors (body 1's negated); the deltaV terms are those filled in by getImpulseResponse.
+		solverContact.raXn = V3LoadA(resp0.angular);
+		solverContact.rbXn = V3Neg(V3LoadA(resp1.angular));
+		solverContact.linDeltaVA = V3LoadA(deltaV0.linear);
+		solverContact.angDeltaVA = V3LoadA(deltaV0.angular);
+		solverContact.linDeltaVB = V3LoadA(deltaV1.linear);
+		solverContact.angDeltaVB = V3LoadA(deltaV1.angular);
+	}
+}
+}
+
+#endif //DY_SOLVER_CONSTRAINT_EXT_SHARED_H
diff --git a/PhysX_3.4/Source/LowLevelDynamics/src/DySolverConstraintTypes.h b/PhysX_3.4/Source/LowLevelDynamics/src/DySolverConstraintTypes.h
new file mode 100644
index 00000000..2b13c190
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelDynamics/src/DySolverConstraintTypes.h
@@ -0,0 +1,67 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+#ifndef DY_SOLVERCONSTRAINTTYPES_H
+#define DY_SOLVERCONSTRAINTTYPES_H
+
+#include "foundation/PxSimpleTypes.h"
+#include "PxvConfig.h"
+
+namespace physx
+{
+
+// Tag identifying the kind of a solver constraint. Constraint headers carry it
+// in their `type` field, which the solver compares against these values (for
+// example to pick the Ext row layout for DY_SC_TYPE_EXT_1D / DY_SC_TYPE_EXT_CONTACT).
+// Enumerators are numbered consecutively from 0.
+enum SolverConstraintType
+{
+	DY_SC_TYPE_NONE = 0,
+
+	// RB-only contact
+	DY_SC_TYPE_RB_CONTACT,
+
+	// RB-only 1D constraint
+	DY_SC_TYPE_RB_1D,
+
+	// contact involving articulations
+	DY_SC_TYPE_EXT_CONTACT,
+
+	// 1D constraint involving articulations
+	DY_SC_TYPE_EXT_1D,
+
+	// RB-only contact where body b is static
+	DY_SC_TYPE_STATIC_CONTACT,
+
+	// RB-only contact with no friction patch
+	DY_SC_TYPE_NOFRICTION_RB_CONTACT,
+
+	DY_SC_TYPE_BLOCK_RB_CONTACT,
+	DY_SC_TYPE_BLOCK_STATIC_RB_CONTACT,
+	DY_SC_TYPE_BLOCK_1D,
+
+	DY_SC_TYPE_FRICTION,
+	DY_SC_TYPE_STATIC_FRICTION,
+	DY_SC_TYPE_EXT_FRICTION,
+	DY_SC_TYPE_BLOCK_FRICTION,
+	DY_SC_TYPE_BLOCK_STATIC_FRICTION,
+
+	// Count of the number of different constraint types in the solver.
+	// Must stay last.
+	DY_SC_CONSTRAINT_TYPE_COUNT
+};
+
+// Bit flags stored in a solver constraint row's `flags` field.
+enum SolverConstraintFlags
+{
+	// Bit 1 (value 2). writeBack1D only accumulates the applied force of rows
+	// that have this bit set.
+	DY_SC_FLAG_OUTPUT_FORCE = 1 << 1
+};
+
+}
+
+#endif //DY_SOLVERCONSTRAINTTYPES_H
diff --git a/PhysX_3.4/Source/LowLevelDynamics/src/DySolverConstraints.cpp b/PhysX_3.4/Source/LowLevelDynamics/src/DySolverConstraints.cpp
new file mode 100644
index 00000000..ea935ce9
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelDynamics/src/DySolverConstraints.cpp
@@ -0,0 +1,1121 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+#include "foundation/PxPreprocessor.h"
+#include "PsVecMath.h"
+
+#ifdef PX_SUPPORT_SIMD
+
+#include "CmPhysXCommon.h"
+#include "DySolverBody.h"
+#include "DySolverContact.h"
+#include "DySolverConstraint1D.h"
+#include "DySolverConstraintDesc.h"
+#include "DyThresholdTable.h"
+#include "DySolverContext.h"
+#include "PsUtilities.h"
+#include "DyConstraint.h"
+#include "PsAtomic.h"
+#include "DySolverConstraintsShared.h"
+
+namespace physx
+{
+
+namespace Dy
+{
+
+//Port of scalar implementation to SIMD maths with some interleaving of instructions
+//
+// Runs one solver pass over a batch of rigid-body 1D constraint rows.
+//
+// desc.constraint starts with a SolverConstraint1DHeader that is immediately
+// followed by header->count SolverConstraint1D rows. For every row the applied
+// impulse is recomputed from the current velocities, clamped to
+// [minImpulse, maxImpulse] and stored back into the row; the impulse change is
+// then applied to local copies of both bodies' linear velocity / angular state.
+// Those copies are written back to desc.bodyA and desc.bodyB after the last row.
+//
+// \param desc   constraint descriptor (bodyA, bodyB and the constraint stream)
+// \param cache  unused here
+void solve1D(const PxSolverConstraintDesc& desc, SolverContext& cache)
+{
+	PX_UNUSED(cache);
+	PxSolverBody& b0 = *desc.bodyA;
+	PxSolverBody& b1 = *desc.bodyB;
+
+	PxU8* PX_RESTRICT bPtr = desc.constraint;
+	//PxU32 length = desc.constraintLength;
+
+	// Header first, rows directly behind it.
+	const SolverConstraint1DHeader* PX_RESTRICT  header = reinterpret_cast<const SolverConstraint1DHeader*>(bPtr);
+	SolverConstraint1D* PX_RESTRICT base = reinterpret_cast<SolverConstraint1D*>(bPtr + sizeof(SolverConstraint1DHeader));
+
+	// Work on local copies of the body state; stored back once at the end.
+	Vec3V linVel0 = V3LoadA(b0.linearVelocity);
+	Vec3V linVel1 = V3LoadA(b1.linearVelocity);
+	Vec3V angState0 = V3LoadA(b0.angularState);
+	Vec3V angState1 = V3LoadA(b1.angularState);
+
+	// Per-batch scale factors shared by every row.
+	const FloatV invMass0 = FLoad(header->invMass0D0);
+	const FloatV invMass1 = FLoad(header->invMass1D1);
+	const FloatV invInertiaScale0 = FLoad(header->angularInvMassScale0);
+	const FloatV invInertiaScale1 = FLoad(header->angularInvMassScale1);
+
+
+	for(PxU32 i=0; i<header->count;++i, base++)
+	{
+		// Start fetching the next row while this one is processed.
+		Ps::prefetchLine(base+1);
+		SolverConstraint1D& c = *base;
+
+		const Vec3V clinVel0 = V3LoadA(c.lin0);
+		const Vec3V clinVel1 = V3LoadA(c.lin1);
+		const Vec3V cangVel0 = V3LoadA(c.ang0);
+		const Vec3V cangVel1 = V3LoadA(c.ang1);
+
+		const FloatV constant = FLoad(c.constant);
+		const FloatV vMul = FLoad(c.velMultiplier);
+		const FloatV iMul = FLoad(c.impulseMultiplier);
+		const FloatV appliedForce = FLoad(c.appliedForce);
+		//const FloatV targetVel = FLoad(c.targetVelocity);
+		
+		const FloatV maxImpulse = FLoad(c.maxImpulse);
+		const FloatV minImpulse = FLoad(c.minImpulse);
+
+		// Component-wise products; summed below to form the relative velocity
+		// of body 0 against body 1 along this row.
+		const Vec3V v0 = V3MulAdd(linVel0, clinVel0, V3Mul(angState0, cangVel0));
+		const Vec3V v1 = V3MulAdd(linVel1, clinVel1, V3Mul(angState1, cangVel1));
+
+		const FloatV normalVel = V3SumElems(V3Sub(v0, v1));
+		// Presumably iMul*appliedForce + vMul*normalVel + constant, assuming
+		// FScaleAdd(a, b, c) evaluates a*b + c - confirm against PsVecMath.
+		const FloatV unclampedForce = FScaleAdd(iMul, appliedForce, FScaleAdd(vMul, normalVel, constant));
+		const FloatV clampedForce = FMin(maxImpulse, (FMax(minImpulse, unclampedForce)));
+		// Only the change relative to the previously applied impulse moves the bodies.
+		const FloatV deltaF = FSub(clampedForce, appliedForce);
+			
+		FStore(clampedForce, &c.appliedForce);
+		linVel0 = V3ScaleAdd(clinVel0, FMul(deltaF, invMass0), linVel0);			
+		linVel1 = V3NegScaleSub(clinVel1, FMul(deltaF, invMass1), linVel1);
+		angState0 = V3ScaleAdd(cangVel0, FMul(deltaF, invInertiaScale0), angState0);
+		//This should be negScaleSub but invInertiaScale1 is negated already
+		angState1 = V3ScaleAdd(cangVel1, FMul(deltaF, invInertiaScale1), angState1);
+
+	}
+
+	// Publish the updated state of both bodies.
+	V3StoreA(linVel0, b0.linearVelocity);
+	V3StoreA(angState0, b0.angularState);
+	V3StoreA(linVel1, b1.linearVelocity);
+	V3StoreA(angState1, b1.angularState);
+	
+	PX_ASSERT(b0.linearVelocity.isFinite());
+	PX_ASSERT(b0.angularState.isFinite());
+	PX_ASSERT(b1.linearVelocity.isFinite());
+	PX_ASSERT(b1.angularState.isFinite());
+}
+
+// Concludes a batch of 1D constraint rows: every row's `constant` is replaced
+// by its `unbiasedConstant`. Rows are stepped with the SolverConstraint1DExt
+// size when the header type is DY_SC_TYPE_EXT_1D and with the
+// SolverConstraint1D size otherwise.
+void conclude1D(const PxSolverConstraintDesc& desc, SolverContext& /*cache*/)
+{
+	PxU8* const stream = desc.constraint;
+	SolverConstraint1DHeader* hdr = reinterpret_cast<SolverConstraint1DHeader*>(stream);
+
+	// Byte distance between consecutive rows.
+	PxU32 rowSize;
+	if(hdr->type == DY_SC_TYPE_EXT_1D)
+		rowSize = sizeof(SolverConstraint1DExt);
+	else
+		rowSize = sizeof(SolverConstraint1D);
+
+	PxU8* rowPtr = stream + sizeof(SolverConstraint1DHeader);
+	for(PxU32 row = 0; row < hdr->count; ++row, rowPtr += rowSize)
+	{
+		SolverConstraint1D* c = reinterpret_cast<SolverConstraint1D*>(rowPtr);
+		c->constant = c->unbiasedConstant;
+	}
+
+	// The walk must end exactly at the end of the constraint stream.
+	PX_ASSERT(desc.constraint + getConstraintLength(desc) == rowPtr);
+}
+
+// ==============================================================
+
+// Runs one solver pass over every contact patch of a rigid-body pair.
+//
+// desc.constraint is walked patch by patch until getConstraintLength(desc)
+// bytes are consumed. Each patch is laid out as:
+//   SolverContactHeader
+//   numNormalConstr   x SolverContactPoint
+//   PxF32 force buffer, padded to a multiple of 4 entries
+//   numFrictionConstr x SolverContactFriction
+//
+// Normal constraints are delegated to solveDynamicContacts. Friction rows are
+// solved here, and only when cache.doFriction is set: each row's total impulse
+// is limited by the friction coefficients times the value returned by
+// solveDynamicContacts, and hdr->broken records whether any row was clamped.
+// Body velocities are kept in locals and stored back to desc.bodyA / desc.bodyB
+// at the end.
+void solveContact(const PxSolverConstraintDesc& desc, SolverContext& cache)
+{
+	PxSolverBody& b0 = *desc.bodyA;
+	PxSolverBody& b1 = *desc.bodyB;
+
+	Vec3V linVel0 = V3LoadA(b0.linearVelocity);
+	Vec3V linVel1 = V3LoadA(b1.linearVelocity);
+	Vec3V angState0 = V3LoadA(b0.angularState);
+	Vec3V angState1 = V3LoadA(b1.angularState);
+
+	const PxU8* PX_RESTRICT last = desc.constraint + getConstraintLength(desc);
+
+	//hopefully pointer aliasing doesn't bite.
+	PxU8* PX_RESTRICT currPtr = desc.constraint;
+
+	while(currPtr < last)
+	{
+		SolverContactHeader* PX_RESTRICT hdr = reinterpret_cast<SolverContactHeader*>(currPtr);
+		currPtr += sizeof(SolverContactHeader);
+
+		const PxU32 numNormalConstr = hdr->numNormalConstr;
+		const PxU32	numFrictionConstr = hdr->numFrictionConstr;
+
+		SolverContactPoint* PX_RESTRICT contacts = reinterpret_cast<SolverContactPoint*>(currPtr);
+		Ps::prefetchLine(contacts);
+		currPtr += numNormalConstr * sizeof(SolverContactPoint);
+
+		// One float per contact point, rounded up to a multiple of 4 entries.
+		PxF32* forceBuffer = reinterpret_cast<PxF32*>(currPtr);
+		currPtr += sizeof(PxF32) * ((numNormalConstr + 3) & (~3));
+
+		SolverContactFriction* PX_RESTRICT frictions = reinterpret_cast<SolverContactFriction*>(currPtr);
+		currPtr += numFrictionConstr * sizeof(SolverContactFriction);
+
+		const FloatV invMassA = FLoad(hdr->invMass0);
+		const FloatV invMassB = FLoad(hdr->invMass1);
+
+		const FloatV angDom0 = FLoad(hdr->angDom0);
+		const FloatV angDom1 = FLoad(hdr->angDom1);
+
+		const Vec3V contactNormal = hdr->normal;
+
+		// NOTE(review): the velocity locals are presumably taken by reference and
+		// updated in place by solveDynamicContacts - confirm in its declaration.
+		const FloatV accumulatedNormalImpulse = solveDynamicContacts(contacts, numNormalConstr, contactNormal, invMassA, invMassB, 
+			angDom0, angDom1, linVel0, angState0, linVel1, angState1, forceBuffer); 
+
+		if(cache.doFriction && numFrictionConstr)
+		{
+			// Friction limits scale with the accumulated normal impulse of this patch.
+			const FloatV staticFrictionCof = hdr->getStaticFriction();
+			const FloatV dynamicFrictionCof = hdr->getDynamicFriction();
+			const FloatV maxFrictionImpulse = FMul(staticFrictionCof, accumulatedNormalImpulse);
+			const FloatV maxDynFrictionImpulse = FMul(dynamicFrictionCof, accumulatedNormalImpulse);
+			const FloatV negMaxDynFrictionImpulse = FNeg(maxDynFrictionImpulse);
+
+			// Becomes true as soon as one friction row exceeds the static limit.
+			BoolV broken = BFFFF();
+
+			if(cache.writeBackIteration)
+				Ps::prefetchLine(hdr->frictionBrokenWritebackByte);
+
+			for(PxU32 i=0;i<numFrictionConstr;i++)
+			{
+				SolverContactFriction& f = frictions[i];
+				Ps::prefetchLine(&frictions[i],128);
+
+
+				// Each Vec4V packs a direction in XYZ and a scalar in W (see member names).
+				const Vec4V normalXYZ_appliedForceW = f.normalXYZ_appliedForceW;
+				const Vec4V raXnXYZ_velMultiplierW = f.raXnXYZ_velMultiplierW;
+				const Vec4V rbXnXYZ_biasW = f.rbXnXYZ_biasW;
+
+				const Vec3V normal = Vec3V_From_Vec4V(normalXYZ_appliedForceW);
+				const Vec3V raXn = Vec3V_From_Vec4V(raXnXYZ_velMultiplierW);
+				const Vec3V rbXn = Vec3V_From_Vec4V(rbXnXYZ_biasW);
+
+				const FloatV appliedForce = V4GetW(normalXYZ_appliedForceW);
+				const FloatV bias = V4GetW(rbXnXYZ_biasW);
+				const FloatV velMultiplier = V4GetW(raXnXYZ_velMultiplierW);
+				
+				const FloatV targetVel = FLoad(f.targetVel);
+
+				const Vec3V delLinVel0 = V3Scale(normal, invMassA);
+				const Vec3V delLinVel1 = V3Scale(normal, invMassB);
+
+				// Relative velocity of body 0 against body 1 along this friction row.
+				const Vec3V v0 = V3MulAdd(linVel0, normal, V3Mul(angState0, raXn));
+				const Vec3V v1 = V3MulAdd(linVel1, normal, V3Mul(angState1, rbXn));
+				const FloatV normalVel = V3SumElems(V3Sub(v0, v1));
+
+
+
+				// appliedForce -bias * velMultiplier - a hoisted part of the total impulse computation
+				const FloatV tmp1 = FNegScaleSub(FSub(bias, targetVel),velMultiplier,appliedForce);				
+
+				// Algorithm:
+				// if abs(appliedForce + deltaF) > maxFrictionImpulse
+				//    clamp newAppliedForce + deltaF to [-maxDynFrictionImpulse, maxDynFrictionImpulse]
+				//      (i.e. clamp deltaF to [-maxDynFrictionImpulse-appliedForce, maxDynFrictionImpulse-appliedForce]
+				//    set broken flag to true || broken flag
+
+				// FloatV deltaF = FMul(FAdd(bias, normalVel), minusVelMultiplier);
+				// FloatV potentialSumF = FAdd(appliedForce, deltaF);
+
+				const FloatV totalImpulse = FNegScaleSub(normalVel, velMultiplier, tmp1);
+
+				// On XBox this clamping code uses the vector simple pipe rather than vector float,
+				// which eliminates a lot of stall cycles
+
+				// Exceeding the static limit switches the row to the dynamic limit.
+				const BoolV clamp = FIsGrtr(FAbs(totalImpulse), maxFrictionImpulse);
+				
+				const FloatV totalClamped = FMin(maxDynFrictionImpulse, FMax(negMaxDynFrictionImpulse, totalImpulse));
+
+				const FloatV newAppliedForce = FSel(clamp, totalClamped,totalImpulse);
+
+				broken = BOr(broken, clamp);
+
+				FloatV deltaF = FSub(newAppliedForce, appliedForce);
+
+				// we could get rid of the stall here by calculating and clamping delta separately, but
+				// the complexity isn't really worth it.
+
+				// Body 0 receives +deltaF, body 1 the opposite impulse.
+				linVel0 = V3ScaleAdd(delLinVel0, deltaF, linVel0);
+				linVel1 = V3NegScaleSub(delLinVel1, deltaF, linVel1);
+				angState0 = V3ScaleAdd(raXn, FMul(deltaF, angDom0), angState0);
+				angState1 = V3NegScaleSub(rbXn, FMul(deltaF, angDom1), angState1);
+
+				f.setAppliedForce(newAppliedForce);
+
+				
+			}
+			Store_From_BoolV(broken, &hdr->broken);
+		}
+
+	}
+
+	// NOTE(review): these asserts run before the stores below, so they check the
+	// values the bodies held on entry rather than the freshly solved ones.
+	PX_ASSERT(b0.linearVelocity.isFinite());
+	PX_ASSERT(b0.angularState.isFinite());
+	PX_ASSERT(b1.linearVelocity.isFinite());
+	PX_ASSERT(b1.angularState.isFinite());
+
+	// Write back
+	// NOTE(review): the loads above use V3LoadA while these stores use V3StoreU;
+	// solveContact_BStatic stores with V3StoreA.
+	V3StoreU(linVel0, b0.linearVelocity);
+	V3StoreU(linVel1, b1.linearVelocity);
+	V3StoreU(angState0, b0.angularState);
+	V3StoreU(angState1, b1.angularState);
+
+	PX_ASSERT(b0.linearVelocity.isFinite());
+	PX_ASSERT(b0.angularState.isFinite());
+	PX_ASSERT(b1.linearVelocity.isFinite());
+	PX_ASSERT(b1.angularState.isFinite());
+
+	// Every patch must have been consumed exactly.
+	PX_ASSERT(currPtr == last);
+}
+
+// Variant of the contact solve that only reads and updates body A
+// (desc.bodyA); body B's velocity is never loaded or written.
+//
+// The constraint stream layout is the one used by the dynamic contact solve:
+//   SolverContactHeader
+//   numNormalConstr   x SolverContactPoint
+//   PxF32 force buffer, padded to a multiple of 4 entries
+//   numFrictionConstr x SolverContactFriction
+//
+// Normal constraints are delegated to solveStaticContacts. Friction rows are
+// solved here when cache.doFriction is set, and hdr->broken records whether any
+// friction row exceeded the static friction limit.
+void solveContact_BStatic(const PxSolverConstraintDesc& desc, SolverContext& cache)
+{
+	PxSolverBody& b0 = *desc.bodyA;
+	//PxSolverBody& b1 = *desc.bodyB;
+
+	Vec3V linVel0 = V3LoadA(b0.linearVelocity);
+	Vec3V angState0 = V3LoadA(b0.angularState);
+
+	const PxU8* PX_RESTRICT last = desc.constraint + getConstraintLength(desc);
+
+	//hopefully pointer aliasing doesn't bite.
+	PxU8* PX_RESTRICT currPtr = desc.constraint;
+
+	while(currPtr < last)
+	{
+		SolverContactHeader* PX_RESTRICT hdr = reinterpret_cast<SolverContactHeader*>(currPtr);
+		currPtr += sizeof(SolverContactHeader);
+
+		const PxU32 numNormalConstr = hdr->numNormalConstr;
+		const PxU32	numFrictionConstr = hdr->numFrictionConstr;
+
+		SolverContactPoint* PX_RESTRICT contacts = reinterpret_cast<SolverContactPoint*>(currPtr);
+		//Ps::prefetchLine(contacts);
+		currPtr += numNormalConstr * sizeof(SolverContactPoint);
+
+		// One float per contact point, rounded up to a multiple of 4 entries.
+		PxF32* forceBuffer = reinterpret_cast<PxF32*>(currPtr);
+		currPtr += sizeof(PxF32) * ((numNormalConstr + 3) & (~3));
+
+		SolverContactFriction* PX_RESTRICT frictions = reinterpret_cast<SolverContactFriction*>(currPtr);
+		currPtr += numFrictionConstr * sizeof(SolverContactFriction);
+
+		
+
+		const FloatV invMassA = FLoad(hdr->invMass0);
+
+		const Vec3V contactNormal = hdr->normal;
+		const FloatV angDom0 = FLoad(hdr->angDom0);
+
+
+		// NOTE(review): linVel0 / angState0 are presumably taken by reference and
+		// updated in place by solveStaticContacts - confirm in its declaration.
+		const FloatV accumulatedNormalImpulse = solveStaticContacts(contacts, numNormalConstr, contactNormal,
+			invMassA, angDom0, linVel0, angState0, forceBuffer);
+
+		if(cache.doFriction && numFrictionConstr)
+		{
+			// Friction limits scale with the accumulated normal impulse of this patch.
+			const FloatV maxFrictionImpulse = FMul(hdr->getStaticFriction(), accumulatedNormalImpulse);
+			const FloatV maxDynFrictionImpulse = FMul(hdr->getDynamicFriction(), accumulatedNormalImpulse);
+
+			// Becomes true as soon as one friction row exceeds the static limit.
+			BoolV broken = BFFFF();
+			if(cache.writeBackIteration)
+				Ps::prefetchLine(hdr->frictionBrokenWritebackByte);
+
+			for(PxU32 i=0;i<numFrictionConstr;i++)
+			{
+				SolverContactFriction& f = frictions[i];
+				Ps::prefetchLine(&frictions[i],128);
+				
+
+				// Each Vec4V packs a direction in XYZ and a scalar in W (see member names).
+				// Only the W component (bias) of rbXnXYZ_biasW is used in this variant.
+				const Vec4V normalXYZ_appliedForceW = f.normalXYZ_appliedForceW;
+				const Vec4V raXnXYZ_velMultiplierW = f.raXnXYZ_velMultiplierW;
+				const Vec4V rbXnXYZ_biasW = f.rbXnXYZ_biasW;
+
+				const Vec3V normal = Vec3V_From_Vec4V(normalXYZ_appliedForceW);
+				const Vec3V raXn = Vec3V_From_Vec4V(raXnXYZ_velMultiplierW);
+
+				const FloatV appliedForce = V4GetW(normalXYZ_appliedForceW);
+				const FloatV bias = V4GetW(rbXnXYZ_biasW);
+				const FloatV velMultiplier = V4GetW(raXnXYZ_velMultiplierW);
+
+				const FloatV targetVel = FLoad(f.targetVel);
+	
+				const FloatV negMaxDynFrictionImpulse = FNeg(maxDynFrictionImpulse);
+
+				const Vec3V delLinVel0 = V3Scale(normal, invMassA);
+				//const FloatV negMaxFrictionImpulse = FNeg(maxFrictionImpulse);
+
+				// Velocity of body 0 along this friction row; body B contributes nothing.
+				const Vec3V v0 = V3MulAdd(linVel0, normal, V3Mul(angState0, raXn));
+				const FloatV normalVel = V3SumElems(v0);
+
+
+				// appliedForce -bias * velMultiplier - a hoisted part of the total impulse computation
+				const FloatV tmp1 = FNegScaleSub(FSub(bias, targetVel),velMultiplier,appliedForce); 
+
+				// Algorithm:
+				// if abs(appliedForce + deltaF) > maxFrictionImpulse
+				//    clamp newAppliedForce + deltaF to [-maxDynFrictionImpulse, maxDynFrictionImpulse]
+				//      (i.e. clamp deltaF to [-maxDynFrictionImpulse-appliedForce, maxDynFrictionImpulse-appliedForce]
+				//    set broken flag to true || broken flag
+
+				// FloatV deltaF = FMul(FAdd(bias, normalVel), minusVelMultiplier);
+				// FloatV potentialSumF = FAdd(appliedForce, deltaF);
+
+				const FloatV totalImpulse = FNegScaleSub(normalVel, velMultiplier, tmp1);
+
+				// On XBox this clamping code uses the vector simple pipe rather than vector float,
+				// which eliminates a lot of stall cycles
+
+				// Exceeding the static limit switches the row to the dynamic limit.
+				const BoolV clamp = FIsGrtr(FAbs(totalImpulse), maxFrictionImpulse);
+				
+				const FloatV totalClamped = FMin(maxDynFrictionImpulse, FMax(negMaxDynFrictionImpulse, totalImpulse));
+				
+				broken = BOr(broken, clamp);
+
+				const FloatV newAppliedForce = FSel(clamp, totalClamped,totalImpulse);
+
+				FloatV deltaF = FSub(newAppliedForce, appliedForce);
+
+				// we could get rid of the stall here by calculating and clamping delta separately, but
+				// the complexity isn't really worth it.
+
+				linVel0 = V3ScaleAdd(delLinVel0, deltaF, linVel0);
+				angState0 = V3ScaleAdd(raXn, FMul(deltaF, angDom0), angState0);
+
+				f.setAppliedForce(newAppliedForce);
+
+			}
+			Store_From_BoolV(broken, &hdr->broken);
+		}
+
+	}
+
+	// NOTE(review): these asserts run before the stores below, so they check the
+	// values body A held on entry rather than the freshly solved ones.
+	PX_ASSERT(b0.linearVelocity.isFinite());
+	PX_ASSERT(b0.angularState.isFinite());
+
+	// Write back
+	V3StoreA(linVel0, b0.linearVelocity);
+	V3StoreA(angState0, b0.angularState);
+
+	PX_ASSERT(b0.linearVelocity.isFinite());
+	PX_ASSERT(b0.angularState.isFinite());
+
+	// Every patch must have been consumed exactly.
+	PX_ASSERT(currPtr == last);
+}
+
+
+// Concludes every contact patch in desc.constraint: each contact point's
+// biasedErr is overwritten with its unbiasedErr and each friction row's bias is
+// set to zero. Patches of type DY_SC_TYPE_EXT_CONTACT are stepped with the Ext
+// point / friction sizes, all others with the plain sizes.
+void concludeContact(const PxSolverConstraintDesc& desc, SolverContext& /*cache*/)
+{
+	const FloatV noBias = FZero();
+
+	PxU8* PX_RESTRICT cursor = desc.constraint;
+	PxU8* PX_RESTRICT streamEnd = desc.constraint + getConstraintLength(desc);
+
+	while(cursor < streamEnd)
+	{
+		const SolverContactHeader* PX_RESTRICT patch = reinterpret_cast<const SolverContactHeader*>(cursor);
+		cursor += sizeof(SolverContactHeader);
+
+		const PxU32 pointCount = patch->numNormalConstr;
+		const PxU32 frictionCount = patch->numFrictionConstr;
+
+		// Pull in the cache lines holding the rows that follow the header.
+		Ps::prefetchLine(cursor, 128);
+		Ps::prefetchLine(cursor, 256);
+		Ps::prefetchLine(cursor, 384);
+
+		// Row sizes depend on whether the patch uses the Ext layout.
+		const bool isExt = patch->type == DY_SC_TYPE_EXT_CONTACT;
+		const PxU32 pointSize = isExt ? sizeof(SolverContactPointExt) : sizeof(SolverContactPoint);
+		const PxU32 frictionSize = isExt ? sizeof(SolverContactFrictionExt) : sizeof(SolverContactFriction);
+
+		for(PxU32 remaining = pointCount; remaining != 0; --remaining)
+		{
+			SolverContactPoint& point = *reinterpret_cast<SolverContactPoint*>(cursor);
+			point.biasedErr = point.unbiasedErr;
+			cursor += pointSize;
+		}
+
+		// Skip the force buffer (one PxF32 per point, padded to a multiple of 4).
+		cursor += sizeof(PxF32) * ((pointCount + 3) & (~3));
+
+		for(PxU32 remaining = frictionCount; remaining != 0; --remaining)
+		{
+			reinterpret_cast<SolverContactFriction*>(cursor)->setBias(noBias);
+			cursor += frictionSize;
+		}
+	}
+
+	PX_ASSERT(cursor == streamEnd);
+}
+
+// Copies the per-point applied forces of every contact patch to desc.writeBack
+// (when it is non-null), raises the friction-broken byte of patches whose
+// `broken` flag is set, and - if the conditions at the end of the function
+// hold - appends one ThresholdStreamElement carrying the summed normal force
+// to cache.mThresholdStream.
+//
+// \param desc  constraint descriptor whose contact stream is walked
+// \param cache receives the threshold stream element
+// \param bd0   body data of body A (reportThreshold, nodeIndex)
+// \param bd1   body data of body B (reportThreshold, nodeIndex)
+void writeBackContact(const PxSolverConstraintDesc& desc, SolverContext& cache,
+					  PxSolverBodyData& bd0, PxSolverBodyData& bd1)
+{
+	PxU8* PX_RESTRICT cursor = desc.constraint;
+	PxU8* PX_RESTRICT streamEnd = desc.constraint + getConstraintLength(desc);
+	PxReal* PX_RESTRICT forceOut = reinterpret_cast<PxReal*>(desc.writeBack);
+
+	// Only accumulated while forces are being written out.
+	PxReal totalNormalForce = 0;
+	// Overwritten per patch, so the last patch's flag is the one tested below.
+	bool hasForceThreshold = false;
+
+	while(cursor < streamEnd)
+	{
+		const SolverContactHeader* PX_RESTRICT patch = reinterpret_cast<const SolverContactHeader*>(cursor);
+		cursor += sizeof(SolverContactHeader);
+
+		hasForceThreshold = (patch->flags & SolverContactHeader::eHAS_FORCE_THRESHOLDS) != 0;
+
+		const PxU32 pointCount = patch->numNormalConstr;
+		const PxU32 frictionCount = patch->numFrictionConstr;
+
+		Ps::prefetchLine(cursor, 256);
+		Ps::prefetchLine(cursor, 384);
+
+		// Row sizes depend on whether the patch uses the Ext layout.
+		const bool isExt = patch->type == DY_SC_TYPE_EXT_CONTACT;
+		const PxU32 pointSize = isExt ? sizeof(SolverContactPointExt) : sizeof(SolverContactPoint);
+		const PxU32 frictionSize = isExt ? sizeof(SolverContactFrictionExt) : sizeof(SolverContactFriction);
+
+		// The force buffer sits right behind the contact points.
+		cursor += pointSize * pointCount;
+		const PxF32* appliedForces = reinterpret_cast<const PxF32*>(cursor);
+		cursor += sizeof(PxF32) * ((pointCount + 3) & (~3));
+
+		if(forceOut != NULL)
+		{
+			const PxF32* const forcesEnd = appliedForces + pointCount;
+			for(const PxF32* src = appliedForces; src != forcesEnd; ++src)
+			{
+				const PxReal force = *src;
+				*forceOut++ = force;
+				totalNormalForce += force;
+			}
+		}
+
+		if(patch->broken && patch->frictionBrokenWritebackByte != NULL)
+			*patch->frictionBrokenWritebackByte = 1;
+
+		cursor += frictionSize * frictionCount;
+	}
+	PX_ASSERT(cursor == streamEnd);
+
+	// A threshold element is emitted only when the flag is set, neither side is
+	// an articulation link, some force was accumulated and at least one body has
+	// a report threshold below PX_MAX_REAL.
+	if(!hasForceThreshold)
+		return;
+	if(desc.linkIndexA != PxSolverConstraintDesc::NO_LINK || desc.linkIndexB != PxSolverConstraintDesc::NO_LINK)
+		return;
+	if(totalNormalForce == 0)
+		return;
+	if(!(bd0.reportThreshold < PX_MAX_REAL || bd1.reportThreshold < PX_MAX_REAL))
+		return;
+
+	ThresholdStreamElement elt;
+	elt.normalForce = totalNormalForce;
+	elt.threshold = PxMin<float>(bd0.reportThreshold, bd1.reportThreshold);
+	elt.nodeIndexA = bd0.nodeIndex;
+	elt.nodeIndexB = bd1.nodeIndex;
+	elt.shapeInteraction  = reinterpret_cast<const SolverContactHeader*>(desc.constraint)->shapeInteraction;
+	Ps::order(elt.nodeIndexA, elt.nodeIndexB);
+	PX_ASSERT(elt.nodeIndexA < elt.nodeIndexB);
+	PX_ASSERT(cache.mThresholdStreamIndex<cache.mThresholdStreamLength);
+	cache.mThresholdStream[cache.mThresholdStreamIndex++] = elt;
+}
+
+// adjust from CoM to joint
+
+// Writes the summed impulse of a 1D constraint batch to desc.writeBack
+// (interpreted as a ConstraintWriteback); does nothing when it is null.
+// Only rows flagged DY_SC_FLAG_OUTPUT_FORCE contribute. The angular sum is
+// reduced by header->body0WorldOffset x linear before being stored, and
+// `broken` is set when the header is breakable and either magnitude exceeds
+// its break impulse.
+void writeBack1D(const PxSolverConstraintDesc& desc, SolverContext&, PxSolverBodyData&, PxSolverBodyData&)
+{
+	ConstraintWriteback* out = reinterpret_cast<ConstraintWriteback*>(desc.writeBack);
+	if(!out)
+		return;
+
+	SolverConstraint1DHeader* hdr = reinterpret_cast<SolverConstraint1DHeader*>(desc.constraint);
+
+	// Byte distance between consecutive rows.
+	PxU32 rowSize;
+	if(hdr->type == DY_SC_TYPE_EXT_1D)
+		rowSize = sizeof(SolverConstraint1DExt);
+	else
+		rowSize = sizeof(SolverConstraint1D);
+
+	PxVec3 linImpulse(0), angImpulse(0);
+
+	PxU8* rowPtr = desc.constraint + sizeof(SolverConstraint1DHeader);
+	for(PxU32 row = 0; row < hdr->count; ++row, rowPtr += rowSize)
+	{
+		const SolverConstraint1D* c = reinterpret_cast<SolverConstraint1D*>(rowPtr);
+		if(!(c->flags & DY_SC_FLAG_OUTPUT_FORCE))
+			continue;
+
+		linImpulse += c->lin0 * c->appliedForce;
+		angImpulse += c->ang0Writeback * c->appliedForce;
+	}
+
+	angImpulse -= hdr->body0WorldOffset.cross(linImpulse);
+	out->linearImpulse = linImpulse;
+	out->angularImpulse = angImpulse;
+
+	if(hdr->breakable)
+		out->broken = PxU32(linImpulse.magnitude()>hdr->linBreakImpulse || angImpulse.magnitude()>hdr->angBreakImpulse);
+	else
+		out->broken = 0;
+
+	// The walk must end exactly at the end of the constraint stream.
+	PX_ASSERT(desc.constraint + getConstraintLength(desc) == rowPtr);
+}
+
+
+// Solves a batch of 1D constraints in order with solve1D. While constraint
+// a-1 is solved, the first cache lines of constraint a's data are prefetched.
+//
+// \param desc             array of constraintCount descriptors
+// \param constraintCount  number of descriptors; 0 is a no-op
+// \param cache            solver context forwarded to solve1D
+void solve1DBlock (const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache)
+{
+	// Empty batch: without this check the tail call below would index
+	// desc[constraintCount-1] with a wrapped-around unsigned index.
+	if(constraintCount == 0)
+		return;
+
+	for(PxU32 a = 1; a < constraintCount; ++a)
+	{
+		Ps::prefetchLine(desc[a].constraint);
+		Ps::prefetchLine(desc[a].constraint, 128);
+		Ps::prefetchLine(desc[a].constraint, 256);
+		solve1D(desc[a-1], cache);
+	}
+	// The loop always lags one element behind; finish the last one here.
+	solve1D(desc[constraintCount-1], cache);
+}
+
+// Solves and then concludes each 1D constraint of a batch, in order
+// (solve1D followed by conclude1D per descriptor). While constraint a-1 is
+// processed, the first cache lines of constraint a's data are prefetched.
+//
+// \param desc             array of constraintCount descriptors
+// \param constraintCount  number of descriptors; 0 is a no-op
+// \param cache            solver context forwarded to the per-constraint calls
+void solve1DConcludeBlock (const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache)
+{
+	// Empty batch: without this check the tail calls below would index
+	// desc[constraintCount-1] with a wrapped-around unsigned index.
+	if(constraintCount == 0)
+		return;
+
+	for(PxU32 a = 1; a < constraintCount; ++a)
+	{
+		Ps::prefetchLine(desc[a].constraint);
+		Ps::prefetchLine(desc[a].constraint, 128);
+		Ps::prefetchLine(desc[a].constraint, 256);
+		solve1D(desc[a-1], cache);
+		conclude1D(desc[a-1], cache);
+	}
+	// The loop always lags one element behind; finish the last one here.
+	solve1D(desc[constraintCount-1], cache);
+	conclude1D(desc[constraintCount-1], cache);
+}
+
+// Solves each 1D constraint of a batch and immediately writes its result back
+// (solve1D followed by writeBack1D per descriptor). The body data passed to
+// writeBack1D is looked up in cache.solverBodyArray through the descriptor's
+// bodyADataIndex / bodyBDataIndex. While constraint a-1 is processed, the
+// first cache lines of constraint a's data are prefetched.
+//
+// \param desc             array of constraintCount descriptors
+// \param constraintCount  number of descriptors; 0 is a no-op
+// \param cache            solver context (also provides solverBodyArray)
+void solve1DBlockWriteBack (const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache)
+{
+	// Empty batch: without this check the tail code below would index
+	// desc[constraintCount-1] with a wrapped-around unsigned index.
+	if(constraintCount == 0)
+		return;
+
+	for(PxU32 a = 1; a < constraintCount; ++a)
+	{
+		Ps::prefetchLine(desc[a].constraint);
+		Ps::prefetchLine(desc[a].constraint, 128);
+		Ps::prefetchLine(desc[a].constraint, 256);
+		PxSolverBodyData& bd0 = cache.solverBodyArray[desc[a-1].bodyADataIndex];
+		PxSolverBodyData& bd1 = cache.solverBodyArray[desc[a-1].bodyBDataIndex];
+		solve1D(desc[a-1], cache);
+		writeBack1D(desc[a-1], cache, bd0, bd1);
+	}
+	// The loop always lags one element behind; finish the last one here.
+	PxSolverBodyData& bd0 = cache.solverBodyArray[desc[constraintCount-1].bodyADataIndex];
+	PxSolverBodyData& bd1 = cache.solverBodyArray[desc[constraintCount-1].bodyBDataIndex];
+	solve1D(desc[constraintCount-1], cache);
+	writeBack1D(desc[constraintCount-1], cache, bd0, bd1);
+}
+
+// Calls writeBack1D for each 1D constraint of a batch, in order, without
+// solving. The body data is looked up in cache.solverBodyArray through the
+// descriptor's bodyADataIndex / bodyBDataIndex. While constraint a-1 is
+// processed, the first cache lines of constraint a's data are prefetched.
+//
+// \param desc             array of constraintCount descriptors
+// \param constraintCount  number of descriptors; 0 is a no-op
+// \param cache            solver context (also provides solverBodyArray)
+void writeBack1DBlock (const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache)
+{
+	// Empty batch: without this check the tail code below would index
+	// desc[constraintCount-1] with a wrapped-around unsigned index.
+	if(constraintCount == 0)
+		return;
+
+	for(PxU32 a = 1; a < constraintCount; ++a)
+	{
+		Ps::prefetchLine(desc[a].constraint);
+		Ps::prefetchLine(desc[a].constraint, 128);
+		Ps::prefetchLine(desc[a].constraint, 256);
+		PxSolverBodyData& bd0 = cache.solverBodyArray[desc[a-1].bodyADataIndex];
+		PxSolverBodyData& bd1 = cache.solverBodyArray[desc[a-1].bodyBDataIndex];
+		writeBack1D(desc[a-1], cache, bd0, bd1);
+	}
+	// The loop always lags one element behind; finish the last one here.
+	PxSolverBodyData& bd0 = cache.solverBodyArray[desc[constraintCount-1].bodyADataIndex];
+	PxSolverBodyData& bd1 = cache.solverBodyArray[desc[constraintCount-1].bodyBDataIndex];
+	writeBack1D(desc[constraintCount-1], cache, bd0, bd1);
+}
+
+// Solves a batch of contact constraints in order with solveContact. While
+// constraint a-1 is solved, the first cache lines of constraint a's data are
+// prefetched.
+//
+// \param desc             array of constraintCount descriptors
+// \param constraintCount  number of descriptors; 0 is a no-op
+// \param cache            solver context forwarded to solveContact
+void solveContactBlock(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache)
+{
+	// Empty batch: without this check the tail call below would index
+	// desc[constraintCount-1] with a wrapped-around unsigned index.
+	if(constraintCount == 0)
+		return;
+
+	for(PxU32 a = 1; a < constraintCount; ++a)
+	{
+		Ps::prefetchLine(desc[a].constraint);
+		Ps::prefetchLine(desc[a].constraint, 128);
+		Ps::prefetchLine(desc[a].constraint, 256);
+		solveContact(desc[a-1], cache);
+	}
+	// The loop always lags one element behind; finish the last one here.
+	solveContact(desc[constraintCount-1], cache);
+}
+
+// Solves and then concludes each contact constraint of a batch, in order
+// (solveContact followed by concludeContact per descriptor). While constraint
+// a-1 is processed, the first cache lines of constraint a's data are prefetched.
+//
+// \param desc             array of constraintCount descriptors
+// \param constraintCount  number of descriptors; 0 is a no-op
+// \param cache            solver context forwarded to the per-constraint calls
+void solveContactConcludeBlock(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache)
+{
+	// Empty batch: without this check the tail calls below would index
+	// desc[constraintCount-1] with a wrapped-around unsigned index.
+	if(constraintCount == 0)
+		return;
+
+	for(PxU32 a = 1; a < constraintCount; ++a)
+	{
+		Ps::prefetchLine(desc[a].constraint);
+		Ps::prefetchLine(desc[a].constraint, 128);
+		Ps::prefetchLine(desc[a].constraint, 256);
+		solveContact(desc[a-1], cache);
+		concludeContact(desc[a-1], cache);
+	}
+	// The loop always lags one element behind; finish the last one here.
+	solveContact(desc[constraintCount-1], cache);
+	concludeContact(desc[constraintCount-1], cache);
+}
+
+// Solves each contact constraint of a batch and immediately writes its result
+// back (solveContact followed by writeBackContact per descriptor). The body
+// data is looked up in cache.solverBodyArray through the descriptor's
+// bodyADataIndex / bodyBDataIndex.
+//
+// After the batch, if cache.mThresholdStreamIndex exceeds
+// cache.mThresholdStreamLength - 4, the entries collected in
+// cache.mThresholdStream are copied into cache.mSharedThresholdStream and the
+// local index is reset to 0.
+//
+// \param desc             array of constraintCount descriptors
+// \param constraintCount  number of descriptors; 0 is a no-op
+// \param cache            solver context (body array and threshold streams)
+void solveContactBlockWriteBack(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache)
+{
+	// Empty batch: without this check the tail code below would index
+	// desc[constraintCount-1] with a wrapped-around unsigned index. Nothing is
+	// appended to the threshold stream either, so the flush check is skipped too.
+	if(constraintCount == 0)
+		return;
+
+	for(PxU32 a = 1; a < constraintCount; ++a)
+	{
+		Ps::prefetchLine(desc[a].constraint);
+		Ps::prefetchLine(desc[a].constraint, 128);
+		Ps::prefetchLine(desc[a].constraint, 256);
+		PxSolverBodyData& bd0 = cache.solverBodyArray[desc[a-1].bodyADataIndex];
+		PxSolverBodyData& bd1 = cache.solverBodyArray[desc[a-1].bodyBDataIndex];
+		solveContact(desc[a-1], cache);
+		writeBackContact(desc[a-1], cache, bd0, bd1);
+	}
+	// The loop always lags one element behind; finish the last one here.
+	PxSolverBodyData& bd0 = cache.solverBodyArray[desc[constraintCount-1].bodyADataIndex];
+	PxSolverBodyData& bd1 = cache.solverBodyArray[desc[constraintCount-1].bodyBDataIndex];
+	solveContact(desc[constraintCount-1], cache);
+	writeBackContact(desc[constraintCount-1], cache, bd0, bd1);
+
+	if(cache.mThresholdStreamIndex > (cache.mThresholdStreamLength - 4))
+	{
+		//Write back to global buffer
+		// The atomicAdd result minus the added count is used as the first
+		// destination slot, i.e. the result is treated as the post-add value.
+		PxI32 threshIndex = physx::shdfnd::atomicAdd(cache.mSharedOutThresholdPairs, PxI32(cache.mThresholdStreamIndex)) - PxI32(cache.mThresholdStreamIndex);
+		for(PxU32 a = 0; a < cache.mThresholdStreamIndex; ++a)
+		{
+			cache.mSharedThresholdStream[a + threshIndex] = cache.mThresholdStream[a];
+		}
+		cache.mThresholdStreamIndex = 0;
+	}
+}
+
+// Solves a batch of contact constraints in order with solveContact_BStatic.
+// While constraint a-1 is solved, the first cache lines of constraint a's data
+// are prefetched.
+//
+// \param desc             array of constraintCount descriptors
+// \param constraintCount  number of descriptors; 0 is a no-op
+// \param cache            solver context forwarded to solveContact_BStatic
+void solveContact_BStaticBlock(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache)
+{
+	// Empty batch: without this check the tail call below would index
+	// desc[constraintCount-1] with a wrapped-around unsigned index.
+	if(constraintCount == 0)
+		return;
+
+	for(PxU32 a = 1; a < constraintCount; ++a)
+	{
+		Ps::prefetchLine(desc[a].constraint);
+		Ps::prefetchLine(desc[a].constraint, 128);
+		Ps::prefetchLine(desc[a].constraint, 256);
+		solveContact_BStatic(desc[a-1], cache);
+	}
+	// The loop always lags one element behind; finish the last one here.
+	solveContact_BStatic(desc[constraintCount-1], cache);
+}
+
+// Solves and then concludes each contact constraint of a batch, in order
+// (solveContact_BStatic followed by concludeContact per descriptor). While
+// constraint a-1 is processed, the first cache lines of constraint a's data
+// are prefetched.
+//
+// \param desc             array of constraintCount descriptors
+// \param constraintCount  number of descriptors; 0 is a no-op
+// \param cache            solver context forwarded to the per-constraint calls
+void solveContact_BStaticConcludeBlock(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache)
+{
+	// Empty batch: without this check the tail calls below would index
+	// desc[constraintCount-1] with a wrapped-around unsigned index.
+	if(constraintCount == 0)
+		return;
+
+	for(PxU32 a = 1; a < constraintCount; ++a)
+	{
+		Ps::prefetchLine(desc[a].constraint);
+		Ps::prefetchLine(desc[a].constraint, 128);
+		Ps::prefetchLine(desc[a].constraint, 256);
+		solveContact_BStatic(desc[a-1], cache);
+		concludeContact(desc[a-1], cache);
+	}
+	// The loop always lags one element behind; finish the last one here.
+	solveContact_BStatic(desc[constraintCount-1], cache);
+	concludeContact(desc[constraintCount-1], cache);
+}
+
+// Solves each contact constraint of a batch and immediately writes its result
+// back (solveContact_BStatic followed by writeBackContact per descriptor). The
+// body data is looked up in cache.solverBodyArray through the descriptor's
+// bodyADataIndex / bodyBDataIndex.
+//
+// After the batch, if cache.mThresholdStreamIndex exceeds
+// cache.mThresholdStreamLength - 4, the entries collected in
+// cache.mThresholdStream are copied into cache.mSharedThresholdStream and the
+// local index is reset to 0.
+//
+// \param desc             array of constraintCount descriptors
+// \param constraintCount  number of descriptors; 0 is a no-op
+// \param cache            solver context (body array and threshold streams)
+void solveContact_BStaticBlockWriteBack(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache)
+{
+	// Empty batch: without this check the tail code below would index
+	// desc[constraintCount-1] with a wrapped-around unsigned index. Nothing is
+	// appended to the threshold stream either, so the flush check is skipped too.
+	if(constraintCount == 0)
+		return;
+
+	for(PxU32 a = 1; a < constraintCount; ++a)
+	{
+		Ps::prefetchLine(desc[a].constraint);
+		Ps::prefetchLine(desc[a].constraint, 128);
+		Ps::prefetchLine(desc[a].constraint, 256);
+		PxSolverBodyData& bd0 = cache.solverBodyArray[desc[a-1].bodyADataIndex];
+		PxSolverBodyData& bd1 = cache.solverBodyArray[desc[a-1].bodyBDataIndex];
+		solveContact_BStatic(desc[a-1], cache);
+		writeBackContact(desc[a-1], cache, bd0, bd1);
+	}
+	// The loop always lags one element behind; finish the last one here.
+	PxSolverBodyData& bd0 = cache.solverBodyArray[desc[constraintCount-1].bodyADataIndex];
+	PxSolverBodyData& bd1 = cache.solverBodyArray[desc[constraintCount-1].bodyBDataIndex];
+	solveContact_BStatic(desc[constraintCount-1], cache);
+	writeBackContact(desc[constraintCount-1], cache, bd0, bd1);
+
+	if(cache.mThresholdStreamIndex > (cache.mThresholdStreamLength - 4))
+	{
+		//Not enough space to write 4 more thresholds back!
+		//Write back to global buffer
+		// The atomicAdd result minus the added count is used as the first
+		// destination slot, i.e. the result is treated as the post-add value.
+		PxI32 threshIndex = physx::shdfnd::atomicAdd(cache.mSharedOutThresholdPairs, PxI32(cache.mThresholdStreamIndex)) - PxI32(cache.mThresholdStreamIndex);
+		for(PxU32 a = 0; a < cache.mThresholdStreamIndex; ++a)
+		{
+			cache.mSharedThresholdStream[a + threshIndex] = cache.mThresholdStream[a];
+		}
+		cache.mThresholdStreamIndex = 0;
+	}
+}
+
+//Port of scalar implementation to SIMD maths with some interleaving of instructions
+void solveExt1D(const PxSolverConstraintDesc& desc, SolverContext& /*cache*/)
+{	// Runs one solver pass over every 1D row in desc.constraint; each side is either a plain solver body or an articulation link.
+	PxU8* PX_RESTRICT bPtr = desc.constraint;
+	//PxU32 length = desc.constraintLength;
+
+	const SolverConstraint1DHeader* PX_RESTRICT  header = reinterpret_cast<const SolverConstraint1DHeader*>(bPtr);	// the constraint data starts with the header...
+	SolverConstraint1DExt* PX_RESTRICT base = reinterpret_cast<SolverConstraint1DExt*>(bPtr + sizeof(SolverConstraint1DHeader));	// ...followed directly by header->count rows
+
+	Vec3V linVel0, angVel0, linVel1, angVel1;
+	if(desc.linkIndexA == PxSolverConstraintDesc::NO_LINK)	// side A is a solver body: read its velocity directly
+	{
+		linVel0 = V3LoadA(desc.bodyA->linearVelocity);
+		angVel0 = V3LoadA(desc.bodyA->angularState);
+	}
+	else	// side A is an articulation link: ask the articulation for the link velocity
+	{
+		Cm::SpatialVectorV v = PxcFsGetVelocity(*desc.articulationA, desc.linkIndexA);
+		linVel0 = v.linear;
+		angVel0 = v.angular;
+	}
+
+	if(desc.linkIndexB == PxSolverConstraintDesc::NO_LINK)	// same choice for side B
+	{
+		linVel1 = V3LoadA(desc.bodyB->linearVelocity);
+		angVel1 = V3LoadA(desc.bodyB->angularState);
+	}
+	else
+	{
+		Cm::SpatialVectorV v = PxcFsGetVelocity(*desc.articulationB, desc.linkIndexB);
+		linVel1 = v.linear;
+		angVel1 = v.angular;
+	}
+
+	Vec3V li0 = V3Zero(), li1 = V3Zero(), ai0 = V3Zero(), ai1 = V3Zero();	// impulses summed over all rows; only consumed by the articulation branches at the end
+
+	for(PxU32 i=0; i<header->count;++i, base++)
+	{
+		Ps::prefetchLine(base+1);	// start fetching the next row
+		// Each load below pulls a packed group of values; the variable names spell out what the extra lanes carry.
+		const Vec4V lin0XYZ_constantW						= V4LoadA(&base->lin0.x);	
+		const Vec4V lin1XYZ_unbiasedConstantW				= V4LoadA(&base->lin1.x);
+		const Vec4V ang0XYZ_velMultiplierW					= V4LoadA(&base->ang0.x);
+		const Vec4V ang1XYZ_impulseMultiplierW				= V4LoadA(&base->ang1.x);	
+		const Vec4V minImpulseX_maxImpulseY_appliedForceZ	= V4LoadA(&base->minImpulse);
+
+		const Vec3V lin0 = Vec3V_From_Vec4V(lin0XYZ_constantW);				FloatV constant = V4GetW(lin0XYZ_constantW);
+		const Vec3V lin1 = Vec3V_From_Vec4V(lin1XYZ_unbiasedConstantW);
+		const Vec3V ang0 = Vec3V_From_Vec4V(ang0XYZ_velMultiplierW);		FloatV vMul = V4GetW(ang0XYZ_velMultiplierW);
+		const Vec3V ang1 = Vec3V_From_Vec4V(ang1XYZ_impulseMultiplierW);	FloatV iMul = V4GetW(ang1XYZ_impulseMultiplierW);
+
+		const FloatV minImpulse		= V4GetX(minImpulseX_maxImpulseY_appliedForceZ);
+		const FloatV maxImpulse		= V4GetY(minImpulseX_maxImpulseY_appliedForceZ);
+		const FloatV appliedForce	= V4GetZ(minImpulseX_maxImpulseY_appliedForceZ);
+		// Relative velocity along the row: side 0's lin/ang projection minus side 1's (assuming V3MulAdd(a,b,c) = a*b+c per component).
+		const Vec3V v0 = V3MulAdd(linVel0, lin0, V3Mul(angVel0, ang0));
+		const Vec3V v1 = V3MulAdd(linVel1, lin1, V3Mul(angVel1, ang1));
+		const FloatV normalVel = V3SumElems(V3Sub(v0, v1));
+		// New total impulse = iMul*appliedForce + vMul*normalVel + constant (assuming FScaleAdd(a,b,c) = a*b+c), clamped to [minImpulse, maxImpulse].
+		const FloatV unclampedForce = FScaleAdd(iMul, appliedForce, FScaleAdd(vMul, normalVel, constant));
+		const FloatV clampedForce = FMin(maxImpulse, (FMax(minImpulse, unclampedForce)));
+		const FloatV deltaF = FSub(clampedForce, appliedForce);
+		// Persist the clamped total on the row; deltaF is the part added by this pass.
+		FStore(clampedForce, &base->appliedForce);
+		li0 = V3ScaleAdd(lin0, deltaF, li0);	ai0 = V3ScaleAdd(ang0, deltaF, ai0);
+		li1 = V3ScaleAdd(lin1, deltaF, li1);	ai1 = V3ScaleAdd(ang1, deltaF, ai1);
+		// Update the working velocities immediately so the following rows see this row's effect.
+		linVel0 = V3ScaleAdd(base->deltaVA.linear, deltaF, linVel0); 		angVel0 = V3ScaleAdd(base->deltaVA.angular, deltaF, angVel0);
+		linVel1 = V3ScaleAdd(base->deltaVB.linear, deltaF, linVel1); 		angVel1 = V3ScaleAdd(base->deltaVB.angular, deltaF, angVel1);
+	}
+	// Commit: solver bodies get the updated velocities stored; articulation links get the summed impulse scaled by the header's inverse-mass scales.
+	if(desc.linkIndexA == PxSolverConstraintDesc::NO_LINK)
+	{
+		V3StoreA(linVel0, desc.bodyA->linearVelocity);
+		V3StoreA(angVel0, desc.bodyA->angularState);
+	}
+	else
+		PxcFsApplyImpulse(*desc.articulationA, desc.linkIndexA, V3Scale(li0, FLoad(header->linearInvMassScale0)),
+																V3Scale(ai0, FLoad(header->angularInvMassScale0)));
+
+	if(desc.linkIndexB == PxSolverConstraintDesc::NO_LINK)
+	{
+		V3StoreA(linVel1, desc.bodyB->linearVelocity);
+		V3StoreA(angVel1, desc.bodyB->angularState);
+	}
+	else
+		PxcFsApplyImpulse(*desc.articulationB, desc.linkIndexB, V3Scale(li1, FLoad(header->linearInvMassScale1)), 
+																V3Scale(ai1, FLoad(header->angularInvMassScale1)));
+}
+
+void solveExtContact(const PxSolverConstraintDesc& desc, SolverContext& cache)
+{	// Solves every header group in desc.constraint (normal rows, then friction rows) for a pair where either side may be an articulation link.
+	Vec3V linVel0, angVel0, linVel1, angVel1;
+	// Starting velocities come straight from the solver body, or from the articulation when the side is a link.
+	if(desc.linkIndexA == PxSolverConstraintDesc::NO_LINK)
+	{
+		linVel0 = V3LoadA(desc.bodyA->linearVelocity);
+		angVel0 = V3LoadA(desc.bodyA->angularState);
+	}
+	else
+	{
+		Cm::SpatialVectorV v = PxcFsGetVelocity(*desc.articulationA, desc.linkIndexA);
+		linVel0 = v.linear;
+		angVel0 = v.angular;
+	}
+
+	if(desc.linkIndexB == PxSolverConstraintDesc::NO_LINK)
+	{
+		linVel1 = V3LoadA(desc.bodyB->linearVelocity);
+		angVel1 = V3LoadA(desc.bodyB->angularState);
+	}
+	else
+	{
+		Cm::SpatialVectorV v = PxcFsGetVelocity(*desc.articulationB, desc.linkIndexB);
+		linVel1 = v.linear;
+		angVel1 = v.angular;
+	}
+	// The constraint length is stored in 16-byte units.
+	const PxU8* PX_RESTRICT last = desc.constraint + desc.constraintLengthOver16*16;
+
+	//hopefully pointer aliasing doesn't bite.
+	PxU8* PX_RESTRICT currPtr = desc.constraint;
+	// Impulse totals across all header groups; only applied on the articulation paths at the end.
+	Vec3V linImpulse0 = V3Zero(), linImpulse1 = V3Zero(), angImpulse0 = V3Zero(), angImpulse1 = V3Zero();
+
+	while(currPtr < last)
+	{
+		SolverContactHeader* PX_RESTRICT hdr = reinterpret_cast<SolverContactHeader*>(currPtr);
+		currPtr += sizeof(SolverContactHeader);
+		// After the header: numNormalConstr contact points, an applied-force float buffer padded to a multiple of 4 entries, then numFrictionConstr friction rows.
+		const PxU32 numNormalConstr = hdr->numNormalConstr;
+		const PxU32	numFrictionConstr = hdr->numFrictionConstr;
+
+		SolverContactPointExt* PX_RESTRICT contacts = reinterpret_cast<SolverContactPointExt*>(currPtr);
+		Ps::prefetchLine(contacts);
+		currPtr += numNormalConstr * sizeof(SolverContactPointExt);
+
+		PxF32* appliedForceBuffer = reinterpret_cast<PxF32*>(currPtr);
+		currPtr += sizeof(PxF32) * ((numNormalConstr + 3) & (~3));
+
+		SolverContactFrictionExt* PX_RESTRICT frictions = reinterpret_cast<SolverContactFrictionExt*>(currPtr);
+		currPtr += numFrictionConstr * sizeof(SolverContactFrictionExt);
+
+		
+
+		Vec3V li0 = V3Zero(), li1 = V3Zero(), ai0 = V3Zero(), ai1 = V3Zero();	// per-header impulse sums, folded into the totals after the friction pass
+
+		const Vec3V contactNormal = hdr->normal;
+		// Normal rows first; the accumulated normal impulse they return bounds the friction impulses below.
+		const FloatV accumulatedNormalImpulse = solveExtContacts(contacts, numNormalConstr, contactNormal, linVel0, angVel0, linVel1, 
+			angVel1, li0, ai0, li1, ai1, appliedForceBuffer);
+
+
+		if(cache.doFriction && numFrictionConstr)
+		{
+			Ps::prefetchLine(frictions);
+			const FloatV maxFrictionImpulse = FMul(hdr->getStaticFriction(), accumulatedNormalImpulse);
+			const FloatV maxDynFrictionImpulse = FMul(hdr->getDynamicFriction(), accumulatedNormalImpulse);
+			// Collects, over all friction rows of this header, whether any row exceeded the static limit.
+			BoolV broken = BFFFF();
+
+			for(PxU32 i=0;i<numFrictionConstr;i++)
+			{
+				SolverContactFrictionExt& f = frictions[i];
+				Ps::prefetchLine(&frictions[i+1]);	// next row (one past the end on the final iteration; it is only prefetched, never dereferenced here)
+
+				const Vec4V normalXYZ_appliedForceW = f.normalXYZ_appliedForceW;
+				const Vec4V raXnXYZ_velMultiplierW = f.raXnXYZ_velMultiplierW;
+				const Vec4V rbXnXYZ_biasW = f.rbXnXYZ_biasW;
+
+				const Vec3V normal = Vec3V_From_Vec4V(normalXYZ_appliedForceW);
+				/*const Vec3V normal0 = V3Scale(normal, sqrtInvMass0);
+				const Vec3V normal1 = V3Scale(normal, sqrtInvMass1);*/
+				const Vec3V raXn = Vec3V_From_Vec4V(raXnXYZ_velMultiplierW);
+				const Vec3V rbXn = Vec3V_From_Vec4V(rbXnXYZ_biasW);
+
+				const FloatV appliedForce = V4GetW(normalXYZ_appliedForceW);
+				const FloatV bias = V4GetW(rbXnXYZ_biasW);
+				const FloatV velMultiplier = V4GetW(raXnXYZ_velMultiplierW);
+
+				const FloatV targetVel = FLoad(f.targetVel);
+
+				const FloatV negMaxDynFrictionImpulse = FNeg(maxDynFrictionImpulse);
+				const FloatV negMaxFrictionImpulse = FNeg(maxFrictionImpulse);
+				// Relative velocity of the two sides along this friction row.
+				const Vec3V v0 = V3MulAdd(linVel0, normal, V3Mul(angVel0, raXn));
+				const Vec3V v1 = V3MulAdd(linVel1, normal, V3Mul(angVel1, rbXn));
+				const FloatV normalVel = V3SumElems(V3Sub(v0, v1));
+
+				// appliedForce - (bias - targetVel) * velMultiplier - a hoisted part of the total impulse computation
+				const FloatV tmp1 = FNegScaleSub(FSub(bias, targetVel),velMultiplier,appliedForce); 
+
+				// Algorithm:
+				// if abs(appliedForce + deltaF) > maxFrictionImpulse
+				//    clamp newAppliedForce + deltaF to [-maxDynFrictionImpulse, maxDynFrictionImpulse]
+				//      (i.e. clamp deltaF to [-maxDynFrictionImpulse-appliedForce, maxDynFrictionImpulse-appliedForce]
+				//    set broken flag to true || broken flag
+
+				// FloatV deltaF = FMul(FAdd(bias, normalVel), minusVelMultiplier);
+				// FloatV potentialSumF = FAdd(appliedForce, deltaF);
+
+				const FloatV totalImpulse = FNegScaleSub(normalVel, velMultiplier, tmp1);
+
+				// On XBox this clamping code uses the vector simple pipe rather than vector float,
+				// which eliminates a lot of stall cycles
+				// Outside the static limit (either sign) the impulse is clamped against the dynamic limit instead.
+				const BoolV clampLow = FIsGrtr(negMaxFrictionImpulse, totalImpulse);
+				const BoolV clampHigh = FIsGrtr(totalImpulse, maxFrictionImpulse);
+
+				const FloatV totalClampedLow = FMax(negMaxDynFrictionImpulse, totalImpulse);
+				const FloatV totalClampedHigh = FMin(maxDynFrictionImpulse, totalImpulse);
+
+				const FloatV newAppliedForce = FSel(clampLow, totalClampedLow,
+															  FSel(clampHigh, totalClampedHigh, totalImpulse));
+
+				broken = BOr(broken, BOr(clampLow, clampHigh));
+
+				FloatV deltaF = FSub(newAppliedForce, appliedForce);
+				// Apply this row's delta to the working velocities through the row's linDeltaV*/angDeltaV* vectors.
+				linVel0 = V3ScaleAdd(f.linDeltaVA, deltaF, linVel0);	
+				angVel0 = V3ScaleAdd(f.angDeltaVA, deltaF, angVel0);
+				linVel1 = V3ScaleAdd(f.linDeltaVB, deltaF, linVel1);	
+				angVel1 = V3ScaleAdd(f.angDeltaVB, deltaF, angVel1);
+
+				li0 = V3ScaleAdd(normal, deltaF, li0);	ai0 = V3ScaleAdd(raXn, deltaF, ai0);
+				li1 = V3ScaleAdd(normal, deltaF, li1);	ai1 = V3ScaleAdd(rbXn, deltaF, ai1);
+
+				f.setAppliedForce(newAppliedForce);
+			}
+			Store_From_BoolV(broken, &hdr->broken);	// record on the header whether any friction row was clamped
+		}
+		// Fold this header's impulses into the totals, scaled by the header's dominance/angDom factors (side 1 goes through V3NegScaleSub, presumably c - a*b).
+		linImpulse0 = V3ScaleAdd(li0, hdr->getDominance0(), linImpulse0);		
+		angImpulse0 = V3ScaleAdd(ai0, FLoad(hdr->angDom0), angImpulse0);
+		linImpulse1 = V3NegScaleSub(li1, hdr->getDominance1(), linImpulse1);	
+		angImpulse1 = V3NegScaleSub(ai1, FLoad(hdr->angDom1), angImpulse1);
+	}
+	// Commit: solver bodies get the updated velocities stored; articulation links get the accumulated impulse applied.
+	if(desc.linkIndexA == PxSolverConstraintDesc::NO_LINK)
+	{
+		V3StoreA(linVel0, desc.bodyA->linearVelocity);
+		V3StoreA(angVel0, desc.bodyA->angularState);
+	}
+	else
+		PxcFsApplyImpulse(*desc.articulationA, desc.linkIndexA, linImpulse0, angImpulse0);
+
+	if(desc.linkIndexB == PxSolverConstraintDesc::NO_LINK)
+	{
+		V3StoreA(linVel1, desc.bodyB->linearVelocity);
+		V3StoreA(angVel1, desc.bodyB->angularState);
+	}
+	else
+		PxcFsApplyImpulse(*desc.articulationB, desc.linkIndexB, linImpulse1, angImpulse1);
+
+	PX_ASSERT(currPtr == last);	// the walk must consume the constraint data exactly
+}
+
+
+void solveExtContactBlock(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache)
+{
+	// Walk the descriptor array front to back, running one solveExtContact pass on each entry.
+	const PxSolverConstraintDesc* const descEnd = desc + constraintCount;
+	for(const PxSolverConstraintDesc* current = desc; current != descEnd; ++current)
+		solveExtContact(*current, cache);
+}
+
+void solveExtContactConcludeBlock(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache)
+{	// Each descriptor is solved and then concluded before the next one is touched.
+	for(const PxSolverConstraintDesc* current = desc; current != desc + constraintCount; ++current)
+	{
+		const PxSolverConstraintDesc& entry = *current;
+		solveExtContact(entry, cache);
+		concludeContact(entry, cache);
+	}
+}
+
+void solveExtContactBlockWriteBack(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache)
+{	// Solves and writes back each descriptor in order, then flushes any locally buffered threshold entries to the shared stream.
+	for(PxU32 a = 0; a < constraintCount; ++a)
+	{
+		PxSolverBodyData& bd0 = cache.solverBodyArray[desc[a].linkIndexA != PxSolverConstraintDesc::NO_LINK ? 0 : desc[a].bodyADataIndex];	// a side that is an articulation link uses entry 0 of solverBodyArray instead of its body data index
+		PxSolverBodyData& bd1 = cache.solverBodyArray[desc[a].linkIndexB != PxSolverConstraintDesc::NO_LINK ? 0 : desc[a].bodyBDataIndex];
+
+		solveExtContact(desc[a], cache);
+		writeBackContact(desc[a], cache, bd0, bd1);
+	}
+	if(cache.mThresholdStreamIndex > 0)
+	{
+		//Flush whenever any threshold entries are pending (this condition has no headroom check).
+		//Write back to global buffer
+		PxI32 threshIndex = physx::shdfnd::atomicAdd(cache.mSharedOutThresholdPairs, PxI32(cache.mThresholdStreamIndex)) - PxI32(cache.mThresholdStreamIndex);	// reserve a range in the shared stream; presumably atomicAdd returns the post-add value, so subtracting the count gives the range start - confirm
+		for(PxU32 a = 0; a < cache.mThresholdStreamIndex; ++a)
+		{
+			cache.mSharedThresholdStream[a + threshIndex] = cache.mThresholdStream[a];
+		}
+		cache.mThresholdStreamIndex = 0;	// local stream is empty again
+	}
+}
+
+void solveExt1DBlock(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache)
+{
+	// Walk the descriptor array front to back, running one solveExt1D pass on each entry.
+	const PxSolverConstraintDesc* const descEnd = desc + constraintCount;
+	for(const PxSolverConstraintDesc* current = desc; current != descEnd; ++current)
+		solveExt1D(*current, cache);
+}
+
+void solveExt1DConcludeBlock(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache)
+{	// Each descriptor is solved and then concluded before the next one is touched.
+	for(const PxSolverConstraintDesc* current = desc; current != desc + constraintCount; ++current)
+	{
+		const PxSolverConstraintDesc& entry = *current;
+		solveExt1D(entry, cache);
+		conclude1D(entry, cache);
+	}
+}
+
+void solveExt1DBlockWriteBack(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache)
+{	// For each descriptor in order: run solveExt1D, then writeBack1D with the body data chosen for both sides.
+	for(PxU32 a = 0; a < constraintCount; ++a)
+	{
+		PxSolverBodyData& bd0 = cache.solverBodyArray[desc[a].linkIndexA != PxSolverConstraintDesc::NO_LINK ? 0 : desc[a].bodyADataIndex];	// a side that is an articulation link uses entry 0 of solverBodyArray instead of its body data index
+		PxSolverBodyData& bd1 = cache.solverBodyArray[desc[a].linkIndexB != PxSolverConstraintDesc::NO_LINK ? 0 : desc[a].bodyBDataIndex];
+		solveExt1D(desc[a], cache);
+		writeBack1D(desc[a], cache, bd0, bd1);
+	}
+}
+
+void ext1DBlockWriteBack(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache)
+{	// Write-back only: calls writeBack1D for each descriptor in order without running a solve pass.
+	for(PxU32 a = 0; a < constraintCount; ++a)
+	{
+		PxSolverBodyData& bd0 = cache.solverBodyArray[desc[a].linkIndexA != PxSolverConstraintDesc::NO_LINK ? 0 : desc[a].bodyADataIndex];	// a side that is an articulation link uses entry 0 of solverBodyArray instead of its body data index
+		PxSolverBodyData& bd1 = cache.solverBodyArray[desc[a].linkIndexB != PxSolverConstraintDesc::NO_LINK ? 0 : desc[a].bodyBDataIndex];
+		writeBack1D(desc[a], cache, bd0, bd1);
+	}
+}
+
+void solveConcludeExtContact		(const PxSolverConstraintDesc& desc, SolverContext& cache)
+{	// One solveExtContact pass immediately followed by concludeContact on the same descriptor.
+	solveExtContact(desc, cache);
+	concludeContact(desc, cache);
+}
+
+void solveConcludeExt1D				(const PxSolverConstraintDesc& desc, SolverContext& cache)
+{	// One solveExt1D pass immediately followed by conclude1D on the same descriptor.
+	solveExt1D(desc, cache);
+	conclude1D(desc, cache);
+}
+
+
+void solveConclude1D(const PxSolverConstraintDesc& desc, SolverContext& cache)
+{	// One solve1D pass immediately followed by conclude1D on the same descriptor.
+	solve1D(desc, cache);
+	conclude1D(desc, cache);
+}
+
+void solveConcludeContact			(const PxSolverConstraintDesc& desc, SolverContext& cache)
+{	// One solveContact pass immediately followed by concludeContact on the same descriptor.
+	solveContact(desc, cache);
+	concludeContact(desc, cache);
+}
+
+void solveConcludeContact_BStatic	(const PxSolverConstraintDesc& desc, SolverContext& cache)
+{	// One solveContact_BStatic pass immediately followed by concludeContact on the same descriptor.
+	solveContact_BStatic(desc, cache);
+	concludeContact(desc, cache);
+}
+
+
+}
+
+}
+
+#endif
diff --git a/PhysX_3.4/Source/LowLevelDynamics/src/DySolverConstraintsBlock.cpp b/PhysX_3.4/Source/LowLevelDynamics/src/DySolverConstraintsBlock.cpp
new file mode 100644
index 00000000..aa06dfcf
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelDynamics/src/DySolverConstraintsBlock.cpp
@@ -0,0 +1,1230 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+#include "foundation/PxPreprocessor.h"
+#include "PsVecMath.h"
+#include "PsFPU.h"
+
+#ifdef PX_SUPPORT_SIMD
+
+#include "CmPhysXCommon.h"
+#include "DySolverBody.h"
+#include "DySolverContact.h"
+#include "DySolverConstraint1D.h"
+#include "DySolverConstraintDesc.h"
+#include "DyThresholdTable.h"
+#include "DySolverContext.h"
+#include "PsUtilities.h"
+#include "DyConstraint.h"
+#include "PsAtomic.h"
+#include "DySolverContact4.h"
+#include "DySolverConstraint1D4.h"
+
+namespace physx
+{
+
+namespace Dy
+{
+
+static void solveContact4_Block(const PxSolverConstraintDesc* PX_RESTRICT desc, SolverContext& cache)
+{	// Solves a batch of four rigid-body contact constraints (desc[0..3]) together, one constraint per Vec4V lane.
+	PxSolverBody& b00 = *desc[0].bodyA;
+	PxSolverBody& b01 = *desc[0].bodyB;
+	PxSolverBody& b10 = *desc[1].bodyA;
+	PxSolverBody& b11 = *desc[1].bodyB;
+	PxSolverBody& b20 = *desc[2].bodyA;
+	PxSolverBody& b21 = *desc[2].bodyB;
+	PxSolverBody& b30 = *desc[3].bodyA;
+	PxSolverBody& b31 = *desc[3].bodyB;
+
+	//We'll need this.
+	const Vec4V vZero	= V4Zero();	
+	
+	Vec4V linVel00 = V4LoadA(&b00.linearVelocity.x);
+	Vec4V linVel01 = V4LoadA(&b01.linearVelocity.x);
+	Vec4V angState00 = V4LoadA(&b00.angularState.x);
+	Vec4V angState01 = V4LoadA(&b01.angularState.x);
+
+	Vec4V linVel10 = V4LoadA(&b10.linearVelocity.x);
+	Vec4V linVel11 = V4LoadA(&b11.linearVelocity.x);
+	Vec4V angState10 = V4LoadA(&b10.angularState.x);
+	Vec4V angState11 = V4LoadA(&b11.angularState.x);
+
+	Vec4V linVel20 = V4LoadA(&b20.linearVelocity.x);
+	Vec4V linVel21 = V4LoadA(&b21.linearVelocity.x);
+	Vec4V angState20 = V4LoadA(&b20.angularState.x);
+	Vec4V angState21 = V4LoadA(&b21.angularState.x);
+
+	Vec4V linVel30 = V4LoadA(&b30.linearVelocity.x);
+	Vec4V linVel31 = V4LoadA(&b31.linearVelocity.x);
+	Vec4V angState30 = V4LoadA(&b30.angularState.x);
+	Vec4V angState31 = V4LoadA(&b31.angularState.x);
+
+
+	Vec4V linVel0T0, linVel0T1, linVel0T2, linVel0T3;
+	Vec4V linVel1T0, linVel1T1, linVel1T2, linVel1T3;
+	Vec4V angState0T0, angState0T1, angState0T2, angState0T3;
+	Vec4V angState1T0, angState1T1, angState1T2, angState1T3;
+
+	// Transpose from one vector per body to one vector per component: T0/T1/T2 hold x/y/z across the four constraints, T3 the fourth lanes (T3 is not modified by the solve below).
+	PX_TRANSPOSE_44(linVel00, linVel10, linVel20, linVel30, linVel0T0, linVel0T1, linVel0T2, linVel0T3);
+	PX_TRANSPOSE_44(linVel01, linVel11, linVel21, linVel31, linVel1T0, linVel1T1, linVel1T2, linVel1T3);
+	PX_TRANSPOSE_44(angState00, angState10, angState20, angState30, angState0T0, angState0T1, angState0T2, angState0T3);
+	PX_TRANSPOSE_44(angState01, angState11, angState21, angState31, angState1T0, angState1T1, angState1T2, angState1T3);
+
+	// Only desc[0]'s constraint pointer and length are read: all four descriptors are solved from this one constraint stream.
+	const PxU8* PX_RESTRICT last = desc[0].constraint + getConstraintLength(desc[0]);
+
+	//hopefully pointer aliasing doesn't bite.
+	PxU8* PX_RESTRICT currPtr = desc[0].constraint;
+
+	Vec4V vMax = V4Splat(FMax());	// stand-in max impulse used when a header carries no per-contact limits
+
+	const PxU8* PX_RESTRICT prefetchAddress = currPtr + sizeof(SolverContactHeader4) + sizeof(SolverContactBatchPointDynamic4);
+
+	const SolverContactHeader4* PX_RESTRICT hdr = reinterpret_cast<SolverContactHeader4*>(currPtr);
+	// Inverse masses are read once, from the first header, and reused for every header in the loop below.
+	const Vec4V invMassA = hdr->invMass0D0;
+	const Vec4V invMassB = hdr->invMass1D1;
+
+	const Vec4V sumInvMass = V4Add(invMassA, invMassB);
+
+
+	while(currPtr < last)
+	{
+
+		hdr = reinterpret_cast<const SolverContactHeader4*>(currPtr);
+
+		PX_ASSERT(hdr->type == DY_SC_TYPE_BLOCK_RB_CONTACT);
+		// Data following the header, in order: applied normal forces, contact points, optional max impulses, optional shared friction data, applied friction forces, friction rows.
+		currPtr = reinterpret_cast<PxU8*>(const_cast<SolverContactHeader4*>(hdr) + 1);
+
+		const PxU32 numNormalConstr = hdr->numNormalConstr;
+		const PxU32	numFrictionConstr = hdr->numFrictionConstr;
+
+		bool hasMaxImpulse = (hdr->flag & SolverContactHeader4::eHAS_MAX_IMPULSE) != 0;
+
+		Vec4V* appliedForces = reinterpret_cast<Vec4V*>(currPtr);
+		currPtr += sizeof(Vec4V)*numNormalConstr;
+
+		SolverContactBatchPointDynamic4* PX_RESTRICT contacts = reinterpret_cast<SolverContactBatchPointDynamic4*>(currPtr);
+
+		Vec4V* maxImpulses;
+		currPtr = reinterpret_cast<PxU8*>(contacts + numNormalConstr);
+		PxU32 maxImpulseMask = 0;	// a zero mask pins the lookup to maxImpulses[0] (vMax) when there is no per-contact array
+		if(hasMaxImpulse)
+		{
+			maxImpulseMask = 0xFFFFFFFF;
+			maxImpulses = reinterpret_cast<Vec4V*>(currPtr);
+			currPtr += sizeof(Vec4V) * numNormalConstr;
+		}
+		else
+		{
+			maxImpulses = &vMax;
+		}
+
+				
+		SolverFrictionSharedData4* PX_RESTRICT fd = reinterpret_cast<SolverFrictionSharedData4*>(currPtr);
+		if(numFrictionConstr)	// the shared friction block is only skipped over when there are friction rows
+			currPtr += sizeof(SolverFrictionSharedData4);
+
+		Vec4V* frictionAppliedForce = reinterpret_cast<Vec4V*>(currPtr);
+		currPtr += sizeof(Vec4V)*numFrictionConstr;
+
+		const SolverContactFrictionDynamic4* PX_RESTRICT frictions = reinterpret_cast<SolverContactFrictionDynamic4*>(currPtr);
+		currPtr += numFrictionConstr * sizeof(SolverContactFrictionDynamic4);
+		
+		Vec4V accumulatedNormalImpulse = vZero;
+
+		const Vec4V angD0 = hdr->angDom0;
+		const Vec4V angD1 = hdr->angDom1;
+
+		const Vec4V _normalT0 = hdr->normalX;
+		const Vec4V _normalT1 = hdr->normalY;
+		const Vec4V _normalT2 = hdr->normalZ;
+		// Linear part of the relative normal velocity; shared by every contact of this header and updated incrementally inside the loop.
+		Vec4V contactNormalVel1 = V4Mul(linVel0T0, _normalT0);
+		Vec4V contactNormalVel3 = V4Mul(linVel1T0, _normalT0);
+		contactNormalVel1 = V4MulAdd(linVel0T1, _normalT1, contactNormalVel1);
+		contactNormalVel3 = V4MulAdd(linVel1T1, _normalT1, contactNormalVel3);
+		contactNormalVel1 = V4MulAdd(linVel0T2, _normalT2, contactNormalVel1);
+		contactNormalVel3 = V4MulAdd(linVel1T2, _normalT2, contactNormalVel3);
+
+		Vec4V relVel1 = V4Sub(contactNormalVel1, contactNormalVel3);
+
+		Vec4V accumDeltaF = vZero;
+
+		for(PxU32 i=0;i<numNormalConstr;i++)
+		{
+			const SolverContactBatchPointDynamic4& c = contacts[i];
+
+			PxU32 offset = 0;
+			Ps::prefetchLine(prefetchAddress, offset += 64);
+			Ps::prefetchLine(prefetchAddress, offset += 64);
+			Ps::prefetchLine(prefetchAddress, offset += 64);
+			prefetchAddress += offset;
+
+			const Vec4V appliedForce = appliedForces[i];
+			const Vec4V maxImpulse = maxImpulses[i & maxImpulseMask];			
+			
+			Vec4V contactNormalVel2 = V4Mul(c.raXnX, angState0T0);
+			Vec4V contactNormalVel4 = V4Mul(c.rbXnX, angState1T0);
+
+			contactNormalVel2 = V4MulAdd(c.raXnY, angState0T1, contactNormalVel2);
+			contactNormalVel4 = V4MulAdd(c.rbXnY, angState1T1, contactNormalVel4);
+
+			contactNormalVel2 = V4MulAdd(c.raXnZ, angState0T2, contactNormalVel2);
+			contactNormalVel4 = V4MulAdd(c.rbXnZ, angState1T2, contactNormalVel4);
+
+			const Vec4V normalVel = V4Add(relVel1, V4Sub(contactNormalVel2, contactNormalVel4));
+			// Impulse change wanted by this contact (presumably biasedErr - normalVel*velMultiplier; V4NegMulSub's definition is not visible here).
+			Vec4V deltaF = V4NegMulSub(normalVel, c.velMultiplier, c.biasedErr);
+			// Keep the accumulated force non-negative and no larger than maxImpulse, then recompute the delta actually applied.
+			deltaF = V4Max(deltaF,  V4Neg(appliedForce));
+			const Vec4V newAppliedForce = V4Min(V4Add(appliedForce, deltaF), maxImpulse);
+			deltaF = V4Sub(newAppliedForce, appliedForce);
+
+			accumDeltaF = V4Add(accumDeltaF, deltaF);
+
+			const Vec4V angDetaF0 = V4Mul(deltaF, angD0);
+			const Vec4V angDetaF1 = V4Mul(deltaF, angD1);
+
+			relVel1 = V4MulAdd(sumInvMass, deltaF, relVel1);	// the linear response is folded into the running relative velocity; linVel itself is updated once after the loop
+			
+			angState0T0 = V4MulAdd(c.raXnX, angDetaF0, angState0T0);
+			angState1T0 = V4NegMulSub(c.rbXnX, angDetaF1, angState1T0);
+			
+			angState0T1 = V4MulAdd(c.raXnY, angDetaF0, angState0T1);
+			angState1T1 = V4NegMulSub(c.rbXnY, angDetaF1, angState1T1);
+
+			angState0T2 = V4MulAdd(c.raXnZ, angDetaF0, angState0T2);
+			angState1T2 = V4NegMulSub(c.rbXnZ, angDetaF1, angState1T2);
+
+			appliedForces[i] = newAppliedForce;
+
+			accumulatedNormalImpulse = V4Add(accumulatedNormalImpulse, newAppliedForce);
+		}
+		// Apply the summed normal impulse delta to the linear velocities once per header (V4MulAdd for body 0, V4NegMulSub for body 1).
+		const Vec4V accumDeltaF_IM0 = V4Mul(accumDeltaF, invMassA);
+		const Vec4V accumDeltaF_IM1 = V4Mul(accumDeltaF, invMassB);
+
+		linVel0T0 = V4MulAdd(_normalT0, accumDeltaF_IM0, linVel0T0);
+		linVel1T0 = V4NegMulSub(_normalT0, accumDeltaF_IM1, linVel1T0);
+		linVel0T1 = V4MulAdd(_normalT1, accumDeltaF_IM0, linVel0T1);
+		linVel1T1 = V4NegMulSub(_normalT1, accumDeltaF_IM1, linVel1T1);
+		linVel0T2 = V4MulAdd(_normalT2, accumDeltaF_IM0, linVel0T2);
+		linVel1T2 = V4NegMulSub(_normalT2, accumDeltaF_IM1, linVel1T2);
+
+
+		if(cache.doFriction && numFrictionConstr)
+		{
+			const Vec4V staticFric = hdr->staticFriction;
+			const Vec4V dynamicFric = hdr->dynamicFriction;
+
+			const Vec4V maxFrictionImpulse = V4Mul(staticFric, accumulatedNormalImpulse);
+			const Vec4V maxDynFrictionImpulse = V4Mul(dynamicFric, accumulatedNormalImpulse);
+			const Vec4V negMaxDynFrictionImpulse = V4Neg(maxDynFrictionImpulse);
+			//const Vec4V negMaxFrictionImpulse = V4Neg(maxFrictionImpulse);
+			BoolV broken = BFFFF();
+
+			if(cache.writeBackIteration)	// NOTE(review): entries 0..2 are prefetched; if there is one entry per constraint, [3] is not - confirm whether intentional
+			{
+				Ps::prefetchLine(fd->frictionBrokenWritebackByte[0]);
+				Ps::prefetchLine(fd->frictionBrokenWritebackByte[1]);
+				Ps::prefetchLine(fd->frictionBrokenWritebackByte[2]);
+			}
+
+
+			for(PxU32 i=0;i<numFrictionConstr;i++)
+			{
+				const SolverContactFrictionDynamic4& f = frictions[i];
+
+				PxU32 offset = 0;
+				Ps::prefetchLine(prefetchAddress, offset += 64);
+				Ps::prefetchLine(prefetchAddress, offset += 64);
+				Ps::prefetchLine(prefetchAddress, offset += 64);
+				Ps::prefetchLine(prefetchAddress, offset += 64);
+				prefetchAddress += offset;
+
+				const Vec4V appliedForce = frictionAppliedForce[i];
+
+				const Vec4V normalT0 = fd->normalX[i&1];	// friction rows alternate between the two direction entries stored in fd
+				const Vec4V normalT1 = fd->normalY[i&1];
+				const Vec4V normalT2 = fd->normalZ[i&1];
+
+				Vec4V normalVel1 = V4Mul(linVel0T0, normalT0);
+				Vec4V normalVel2 = V4Mul(f.raXnX, angState0T0);
+				Vec4V normalVel3 = V4Mul(linVel1T0, normalT0);
+				Vec4V normalVel4 = V4Mul(f.rbXnX, angState1T0);
+
+				normalVel1 = V4MulAdd(linVel0T1, normalT1, normalVel1);
+				normalVel2 = V4MulAdd(f.raXnY, angState0T1, normalVel2);
+				normalVel3 = V4MulAdd(linVel1T1, normalT1, normalVel3);
+				normalVel4 = V4MulAdd(f.rbXnY, angState1T1, normalVel4);
+
+				normalVel1 = V4MulAdd(linVel0T2, normalT2, normalVel1);
+				normalVel2 = V4MulAdd(f.raXnZ, angState0T2, normalVel2);
+				normalVel3 = V4MulAdd(linVel1T2, normalT2, normalVel3);
+				normalVel4 = V4MulAdd(f.rbXnZ, angState1T2, normalVel4);
+
+				const Vec4V _normalVel = V4Add(normalVel1, normalVel2);
+				const Vec4V __normalVel = V4Add(normalVel3, normalVel4);
+
+				// appliedForce -bias * velMultiplier - a hoisted part of the total impulse computation
+			
+				const Vec4V normalVel = V4Sub(_normalVel, __normalVel );
+
+				const Vec4V tmp1 = V4Sub(appliedForce, f.scaledBias); 
+
+				const Vec4V totalImpulse = V4NegMulSub(normalVel, f.velMultiplier, tmp1);
+				
+				broken = BOr(broken, V4IsGrtr(V4Abs(totalImpulse), maxFrictionImpulse));	// sticky: a lane that exceeds the static limit stays set for the remaining rows of this header
+				// Lanes flagged in broken are clamped to the dynamic friction range; the rest keep totalImpulse (assuming V4Sel picks its second argument where the mask is set).
+				const Vec4V newAppliedForce = V4Sel(broken, V4Min(maxDynFrictionImpulse, V4Max(negMaxDynFrictionImpulse, totalImpulse)), totalImpulse);
+
+				const Vec4V deltaF =V4Sub(newAppliedForce, appliedForce);
+
+				frictionAppliedForce[i] = newAppliedForce;
+
+				const Vec4V deltaFIM0 = V4Mul(deltaF, invMassA);
+				const Vec4V deltaFIM1 = V4Mul(deltaF, invMassB);
+
+				const Vec4V angDetaF0 = V4Mul(deltaF, angD0);
+				const Vec4V angDetaF1 = V4Mul(deltaF, angD1);
+
+				linVel0T0 = V4MulAdd(normalT0, deltaFIM0, linVel0T0);
+				linVel1T0 = V4NegMulSub(normalT0, deltaFIM1, linVel1T0);
+				angState0T0 = V4MulAdd(f.raXnX, angDetaF0, angState0T0);
+				angState1T0 = V4NegMulSub(f.rbXnX, angDetaF1, angState1T0);
+
+				linVel0T1 = V4MulAdd(normalT1, deltaFIM0, linVel0T1);
+				linVel1T1 = V4NegMulSub(normalT1, deltaFIM1, linVel1T1);
+				angState0T1 = V4MulAdd(f.raXnY, angDetaF0, angState0T1);
+				angState1T1 = V4NegMulSub(f.rbXnY, angDetaF1, angState1T1);
+
+				linVel0T2 = V4MulAdd(normalT2, deltaFIM0, linVel0T2);
+				linVel1T2 = V4NegMulSub(normalT2, deltaFIM1, linVel1T2);
+				angState0T2 = V4MulAdd(f.raXnZ, angDetaF0, angState0T2);
+				angState1T2 = V4NegMulSub(f.rbXnZ, angDetaF1, angState1T2);
+			}
+			fd->broken = broken;	// store the per-lane broken mask in the shared friction data
+		}
+	}
+	// Transpose the working values back to one vector per body.
+	PX_TRANSPOSE_44(linVel0T0, linVel0T1, linVel0T2, linVel0T3, linVel00, linVel10, linVel20, linVel30);
+	PX_TRANSPOSE_44(linVel1T0, linVel1T1, linVel1T2, linVel1T3, linVel01, linVel11, linVel21, linVel31);
+	PX_TRANSPOSE_44(angState0T0, angState0T1, angState0T2, angState0T3, angState00, angState10, angState20, angState30);
+	PX_TRANSPOSE_44(angState1T0, angState1T1, angState1T2, angState1T3, angState01, angState11, angState21, angState31);
+	// NOTE: these asserts run before the stores below, so they inspect the bodies' existing values, not the newly computed ones.
+	PX_ASSERT(b00.linearVelocity.isFinite());
+	PX_ASSERT(b00.angularState.isFinite());
+	PX_ASSERT(b10.linearVelocity.isFinite());
+	PX_ASSERT(b10.angularState.isFinite());
+	PX_ASSERT(b20.linearVelocity.isFinite());
+	PX_ASSERT(b20.angularState.isFinite());
+	PX_ASSERT(b30.linearVelocity.isFinite());
+	PX_ASSERT(b30.angularState.isFinite());
+
+	PX_ASSERT(b01.linearVelocity.isFinite());
+	PX_ASSERT(b01.angularState.isFinite());
+	PX_ASSERT(b11.linearVelocity.isFinite());
+	PX_ASSERT(b11.angularState.isFinite());
+	PX_ASSERT(b21.linearVelocity.isFinite());
+	PX_ASSERT(b21.angularState.isFinite());
+	PX_ASSERT(b31.linearVelocity.isFinite());
+	PX_ASSERT(b31.angularState.isFinite());
+
+	// Write back
+	V4StoreA(linVel00, &b00.linearVelocity.x);
+	V4StoreA(angState00, &b00.angularState.x);
+	V4StoreA(linVel10, &b10.linearVelocity.x);
+	V4StoreA(angState10, &b10.angularState.x);
+	V4StoreA(linVel20, &b20.linearVelocity.x);
+	V4StoreA(angState20, &b20.angularState.x);
+	V4StoreA(linVel30, &b30.linearVelocity.x);
+	V4StoreA(angState30, &b30.angularState.x);
+	// Body B is left untouched when its data index is 0. NOTE(review): presumably index 0 denotes a shared static/world body - confirm.
+	if(desc[0].bodyBDataIndex != 0)
+	{
+		V4StoreA(linVel01, &b01.linearVelocity.x);
+		V4StoreA(angState01, &b01.angularState.x);
+	}
+	if(desc[1].bodyBDataIndex != 0)
+	{
+		V4StoreA(linVel11, &b11.linearVelocity.x);
+		V4StoreA(angState11, &b11.angularState.x);
+	}
+	if(desc[2].bodyBDataIndex != 0)
+	{
+		V4StoreA(linVel21, &b21.linearVelocity.x);
+		V4StoreA(angState21, &b21.angularState.x);
+	}
+	if(desc[3].bodyBDataIndex != 0)
+	{
+		V4StoreA(linVel31, &b31.linearVelocity.x);
+		V4StoreA(angState31, &b31.angularState.x);
+	}
+	// Post-store checks: these see the values written above.
+	PX_ASSERT(b00.linearVelocity.isFinite());
+	PX_ASSERT(b00.angularState.isFinite());
+	PX_ASSERT(b10.linearVelocity.isFinite());
+	PX_ASSERT(b10.angularState.isFinite());
+	PX_ASSERT(b20.linearVelocity.isFinite());
+	PX_ASSERT(b20.angularState.isFinite());
+	PX_ASSERT(b30.linearVelocity.isFinite());
+	PX_ASSERT(b30.angularState.isFinite());
+
+	PX_ASSERT(b01.linearVelocity.isFinite());
+	PX_ASSERT(b01.angularState.isFinite());
+	PX_ASSERT(b11.linearVelocity.isFinite());
+	PX_ASSERT(b11.angularState.isFinite());
+	PX_ASSERT(b21.linearVelocity.isFinite());
+	PX_ASSERT(b21.angularState.isFinite());
+	PX_ASSERT(b31.linearVelocity.isFinite());
+	PX_ASSERT(b31.angularState.isFinite());
+}
+
+static void solveContact4_StaticBlock(const PxSolverConstraintDesc* PX_RESTRICT desc, SolverContext& cache)
+{	// Solves a batch of four contact constraints, reading and updating only bodyA of each descriptor.
+	PxSolverBody& b00 = *desc[0].bodyA;
+	PxSolverBody& b10 = *desc[1].bodyA;
+	PxSolverBody& b20 = *desc[2].bodyA;
+	PxSolverBody& b30 = *desc[3].bodyA;
+	// The constraint byte stream for the whole batch is taken from desc[0] only.
+	const PxU8* PX_RESTRICT last = desc[0].constraint + getConstraintLength(desc[0]);
+
+	//hopefully pointer aliasing doesn't bite.
+	PxU8* PX_RESTRICT currPtr = desc[0].constraint;
+
+
+	//We'll need this.
+	const Vec4V vZero	= V4Zero();
+	Vec4V vMax	= V4Splat(FMax());	// Stand-in "max impulse" used when a header carries no max-impulse array.
+	// Load linear velocity and angular state of the four bodies (aligned loads).
+	Vec4V linVel00 = V4LoadA(&b00.linearVelocity.x);
+	Vec4V angState00 = V4LoadA(&b00.angularState.x);
+
+	Vec4V linVel10 = V4LoadA(&b10.linearVelocity.x);
+	Vec4V angState10 = V4LoadA(&b10.angularState.x);
+
+	Vec4V linVel20 = V4LoadA(&b20.linearVelocity.x);
+	Vec4V angState20 = V4LoadA(&b20.angularState.x);
+
+	Vec4V linVel30 = V4LoadA(&b30.linearVelocity.x);
+	Vec4V angState30 = V4LoadA(&b30.angularState.x);
+
+	Vec4V linVel0T0, linVel0T1, linVel0T2, linVel0T3;
+	Vec4V angState0T0, angState0T1, angState0T2, angState0T3;
+
+	// Transpose per-body vectors into per-component vectors; T0/T1/T2 are multiplied against X/Y/Z data below, T3 is only carried through.
+	PX_TRANSPOSE_44(linVel00, linVel10, linVel20, linVel30, linVel0T0, linVel0T1, linVel0T2, linVel0T3);
+	PX_TRANSPOSE_44(angState00, angState10, angState20, angState30, angState0T0, angState0T1, angState0T2, angState0T3);
+
+	const PxU8* PX_RESTRICT prefetchAddress = currPtr + sizeof(SolverContactHeader4) + sizeof(SolverContactBatchPointBase4);
+
+	const SolverContactHeader4* PX_RESTRICT hdr = reinterpret_cast<SolverContactHeader4*>(currPtr);
+
+	const Vec4V invMass0 = hdr->invMass0D0;	// Read once from the first header and reused for every header in the stream.
+
+	while((currPtr < last))	// One iteration per header (plus its trailing data) in the stream.
+	{
+		hdr = reinterpret_cast<const SolverContactHeader4*>(currPtr);
+
+		PX_ASSERT(hdr->type == DY_SC_TYPE_BLOCK_STATIC_RB_CONTACT);
+		// Data following the header, in order: applied forces, contact points, optional max impulses, optional shared friction data, friction applied forces, friction rows.
+		currPtr = const_cast<PxU8*>(reinterpret_cast<const PxU8*>(hdr + 1));
+
+		const PxU32 numNormalConstr = hdr->numNormalConstr;
+		const PxU32	numFrictionConstr = hdr->numFrictionConstr;
+		bool hasMaxImpulse = (hdr->flag & SolverContactHeader4::eHAS_MAX_IMPULSE) != 0;
+
+		Vec4V* appliedForces = reinterpret_cast<Vec4V*>(currPtr);
+		currPtr += sizeof(Vec4V)*numNormalConstr;
+
+		SolverContactBatchPointBase4* PX_RESTRICT contacts = reinterpret_cast<SolverContactBatchPointBase4*>(currPtr);
+
+		currPtr = reinterpret_cast<PxU8*>(contacts + numNormalConstr);
+
+		Vec4V* maxImpulses;
+		PxU32 maxImpulseMask;	// ANDed with the contact index below: all-ones keeps the index, zero forces index 0.
+		if(hasMaxImpulse)
+		{
+			maxImpulseMask = 0xFFFFFFFF;
+			maxImpulses = reinterpret_cast<Vec4V*>(currPtr);
+			currPtr += sizeof(Vec4V) * numNormalConstr;
+		}
+		else
+		{
+			maxImpulseMask = 0;
+			maxImpulses = &vMax;	// Single-element "array"; the zero mask makes every lookup hit element 0.
+		}
+
+		SolverFrictionSharedData4* PX_RESTRICT fd = reinterpret_cast<SolverFrictionSharedData4*>(currPtr);
+		if(numFrictionConstr)	// The shared friction data is only skipped over when there are friction rows.
+			currPtr += sizeof(SolverFrictionSharedData4);
+
+		Vec4V* frictionAppliedForces = reinterpret_cast<Vec4V*>(currPtr);
+		currPtr += sizeof(Vec4V)*numFrictionConstr;
+
+		const SolverContactFrictionBase4* PX_RESTRICT frictions = reinterpret_cast<SolverContactFrictionBase4*>(currPtr);
+		currPtr += numFrictionConstr * sizeof(SolverContactFrictionBase4);
+
+		// ---- Normal (contact) rows ----
+		Vec4V accumulatedNormalImpulse = vZero;
+
+		const Vec4V angD0 = hdr->angDom0;
+		const Vec4V _normalT0 = hdr->normalX;
+		const Vec4V _normalT1 = hdr->normalY;
+		const Vec4V _normalT2 = hdr->normalZ;
+
+		Vec4V contactNormalVel1 = V4Mul(linVel0T0, _normalT0);	// Linear velocity projected onto the header normal, per lane.
+		contactNormalVel1 = V4MulAdd(linVel0T1, _normalT1, contactNormalVel1);
+
+		contactNormalVel1 = V4MulAdd(linVel0T2, _normalT2, contactNormalVel1);
+
+		Vec4V accumDeltaF = vZero;	// Sum of impulse changes; applied to the linear velocity once after the loop.
+
+
+		for(PxU32 i=0;i<numNormalConstr;i++)
+		{
+			const SolverContactBatchPointBase4& c = contacts[i];
+
+			PxU32 offset = 0;
+			Ps::prefetchLine(prefetchAddress, offset += 64);
+			Ps::prefetchLine(prefetchAddress, offset += 64);
+			Ps::prefetchLine(prefetchAddress, offset += 64);
+			prefetchAddress += offset;	// Advances the prefetch cursor by 192 bytes per row.
+
+			const Vec4V appliedForce = appliedForces[i];
+			const Vec4V maxImpulse = maxImpulses[i&maxImpulseMask];
+			Vec4V contactNormalVel2 = V4MulAdd(c.raXnX, angState0T0, contactNormalVel1);
+			contactNormalVel2 = V4MulAdd(c.raXnY, angState0T1, contactNormalVel2);
+			const Vec4V normalVel = V4MulAdd(c.raXnZ, angState0T2, contactNormalVel2);
+			// Lower bound -appliedForce keeps the accumulated force from going negative; presumably V4NegMulSub(a,b,c) is c - a*b — TODO confirm.
+			const Vec4V _deltaF = V4Max(V4NegMulSub(normalVel, c.velMultiplier, c.biasedErr), V4Neg(appliedForce));
+
+			Vec4V newAppliedForce(V4Add(appliedForce, _deltaF));
+			newAppliedForce = V4Min(newAppliedForce, maxImpulse);	// Upper clamp; deltaF is recomputed from the clamped total.
+			const Vec4V deltaF = V4Sub(newAppliedForce, appliedForce);
+			const Vec4V angDeltaF = V4Mul(angD0, deltaF);
+
+			accumDeltaF = V4Add(accumDeltaF, deltaF);
+
+			contactNormalVel1 = V4MulAdd(invMass0, deltaF, contactNormalVel1);	// Keep the projected linear velocity current for the next row.
+			angState0T0 = V4MulAdd(c.raXnX, angDeltaF, angState0T0);
+			angState0T1 = V4MulAdd(c.raXnY, angDeltaF, angState0T1);
+			angState0T2 = V4MulAdd(c.raXnZ, angDeltaF, angState0T2);
+			
+#if 1
+			appliedForces[i] = newAppliedForce;
+#endif
+			
+			accumulatedNormalImpulse = V4Add(accumulatedNormalImpulse, newAppliedForce);
+		}	
+
+		const Vec4V deltaFInvMass0 = V4Mul(accumDeltaF, invMass0);
+		// Apply the accumulated normal impulse change to the linear velocity along the header normal.
+		linVel0T0 = V4MulAdd(_normalT0, deltaFInvMass0, linVel0T0);
+		linVel0T1 = V4MulAdd(_normalT1, deltaFInvMass0, linVel0T1);
+		linVel0T2 = V4MulAdd(_normalT2, deltaFInvMass0, linVel0T2);
+
+		if(cache.doFriction && numFrictionConstr)	// ---- Friction rows ----
+		{
+			const Vec4V staticFric = hdr->staticFriction;
+
+			const Vec4V dynamicFric = hdr->dynamicFriction;
+			// Friction limits scale with the total normal impulse accumulated above.
+			const Vec4V maxFrictionImpulse = V4Mul(staticFric, accumulatedNormalImpulse);
+			const Vec4V maxDynFrictionImpulse = V4Mul(dynamicFric, accumulatedNormalImpulse);
+			const Vec4V negMaxDynFrictionImpulse = V4Neg(maxDynFrictionImpulse);
+
+			BoolV broken = BFFFF();	// Per-lane flag, ORed across all friction rows of this header.
+
+			if(cache.writeBackIteration)
+			{
+				Ps::prefetchLine(fd->frictionBrokenWritebackByte[0]);
+				Ps::prefetchLine(fd->frictionBrokenWritebackByte[1]);
+				Ps::prefetchLine(fd->frictionBrokenWritebackByte[2]);
+				Ps::prefetchLine(fd->frictionBrokenWritebackByte[3]);
+			}
+
+			for(PxU32 i=0;i<numFrictionConstr;i++)
+			{
+				const SolverContactFrictionBase4& f = frictions[i];
+
+				PxU32 offset = 0;
+				Ps::prefetchLine(prefetchAddress, offset += 64);
+				Ps::prefetchLine(prefetchAddress, offset += 64);
+				Ps::prefetchLine(prefetchAddress, offset += 64);
+				prefetchAddress += offset;
+
+				const Vec4V appliedForce = frictionAppliedForces[i];
+				// Rows alternate between the two directions stored in the shared friction data (index i&1).
+				const Vec4V normalT0 = fd->normalX[i&1];
+				const Vec4V normalT1 = fd->normalY[i&1];
+				const Vec4V normalT2 = fd->normalZ[i&1];
+
+				Vec4V normalVel1 = V4Mul(linVel0T0, normalT0);
+				Vec4V normalVel2 = V4Mul(f.raXnX, angState0T0);
+
+				normalVel1 = V4MulAdd(linVel0T1, normalT1, normalVel1);
+				normalVel2 = V4MulAdd(f.raXnY, angState0T1, normalVel2);
+
+				normalVel1 = V4MulAdd(linVel0T2, normalT2, normalVel1);
+				normalVel2 = V4MulAdd(f.raXnZ, angState0T2, normalVel2);
+
+				//relative normal velocity for all 4 constraints
+				const Vec4V normalVel = V4Add(normalVel1, normalVel2);
+
+				// appliedForce -bias * velMultiplier - a hoisted part of the total impulse computation
+				const Vec4V tmp1 = V4Sub(appliedForce, f.scaledBias); 
+
+				const Vec4V totalImpulse = V4NegMulSub(normalVel, f.velMultiplier, tmp1);
+				// A lane becomes (and stays) broken once |totalImpulse| exceeds the static limit.
+				broken = BOr(broken, V4IsGrtr(V4Abs(totalImpulse), maxFrictionImpulse));
+				// Broken lanes get the impulse clamped to the dynamic limit; presumably V4Sel picks its second argument where the mask is set — TODO confirm.
+				const Vec4V newAppliedForce = V4Sel(broken, V4Min(maxDynFrictionImpulse, V4Max(negMaxDynFrictionImpulse, totalImpulse)), totalImpulse);
+
+				const Vec4V deltaF =V4Sub(newAppliedForce, appliedForce);
+
+				const Vec4V deltaFInvMass = V4Mul(invMass0, deltaF);
+				const Vec4V angDeltaF = V4Mul(angD0, deltaF);
+
+				linVel0T0 = V4MulAdd(normalT0, deltaFInvMass, linVel0T0);
+				angState0T0 = V4MulAdd(f.raXnX, angDeltaF, angState0T0);
+
+				linVel0T1 = V4MulAdd(normalT1, deltaFInvMass, linVel0T1);
+				angState0T1 = V4MulAdd(f.raXnY, angDeltaF, angState0T1);
+
+				linVel0T2 = V4MulAdd(normalT2, deltaFInvMass, linVel0T2);
+				angState0T2 = V4MulAdd(f.raXnZ, angDeltaF, angState0T2);
+
+#if 1
+				frictionAppliedForces[i] = newAppliedForce;
+#endif
+
+			}
+
+			fd->broken = broken;	// Stored in the stream; writeBackContact4_Block reads it back.
+		}
+	}
+	// Transpose back to one vector per body.
+	PX_TRANSPOSE_44(linVel0T0, linVel0T1, linVel0T2, linVel0T3, linVel00, linVel10, linVel20, linVel30);
+	PX_TRANSPOSE_44(angState0T0, angState0T1, angState0T2, angState0T3, angState00, angState10, angState20, angState30);
+	// NOTE(review): these asserts run before the stores below, so they check the values still held by the bodies, not the new results.
+	PX_ASSERT(b00.linearVelocity.isFinite());
+	PX_ASSERT(b00.angularState.isFinite());
+	PX_ASSERT(b10.linearVelocity.isFinite());
+	PX_ASSERT(b10.angularState.isFinite());
+	PX_ASSERT(b20.linearVelocity.isFinite());
+	PX_ASSERT(b20.angularState.isFinite());
+	PX_ASSERT(b30.linearVelocity.isFinite());
+	PX_ASSERT(b30.angularState.isFinite());
+
+	// Write back
+	V4StoreA(linVel00, &b00.linearVelocity.x);
+	V4StoreA(linVel10, &b10.linearVelocity.x);
+	V4StoreA(linVel20, &b20.linearVelocity.x);
+	V4StoreA(linVel30, &b30.linearVelocity.x);
+
+	V4StoreA(angState00, &b00.angularState.x);
+	V4StoreA(angState10, &b10.angularState.x);
+	V4StoreA(angState20, &b20.angularState.x);
+	V4StoreA(angState30, &b30.angularState.x);
+	// These asserts validate the freshly stored results.
+	PX_ASSERT(b00.linearVelocity.isFinite());
+	PX_ASSERT(b00.angularState.isFinite());
+	PX_ASSERT(b10.linearVelocity.isFinite());
+	PX_ASSERT(b10.angularState.isFinite());
+	PX_ASSERT(b20.linearVelocity.isFinite());
+	PX_ASSERT(b20.angularState.isFinite());
+	PX_ASSERT(b30.linearVelocity.isFinite());
+	PX_ASSERT(b30.angularState.isFinite());
+}
+
+static void concludeContact4_Block(const PxSolverConstraintDesc* PX_RESTRICT desc, SolverContext& /*cache*/, PxU32 contactSize, PxU32 frictionSize)
+{	// Walks the batch's constraint stream and removes the bias terms: biasedErr -= scaledBias for contacts, scaledBias = targetVelocity for friction rows.
+	const PxU8* PX_RESTRICT last = desc[0].constraint + getConstraintLength(desc[0]);	// Stream comes from desc[0] only.
+
+	//hopefully pointer aliasing doesn't bite.
+	PxU8* PX_RESTRICT currPtr = desc[0].constraint;
+
+	while((currPtr < last))	// One iteration per header in the stream.
+	{
+		const SolverContactHeader4* PX_RESTRICT hdr = reinterpret_cast<SolverContactHeader4*>(currPtr);
+		
+		currPtr = const_cast<PxU8*>(reinterpret_cast<const PxU8*>(hdr + 1));
+
+		const PxU32 numNormalConstr = hdr->numNormalConstr;
+		const PxU32	numFrictionConstr = hdr->numFrictionConstr;
+
+		currPtr += sizeof(Vec4V)*numNormalConstr;	// Skip the per-contact applied forces.
+
+		SolverContactBatchPointBase4* PX_RESTRICT contacts = reinterpret_cast<SolverContactBatchPointBase4*>(currPtr);
+		currPtr += (numNormalConstr * contactSize);	// contactSize is the caller-supplied byte stride of one contact row.
+		bool hasMaxImpulse = (hdr->flag & SolverContactHeader4::eHAS_MAX_IMPULSE) != 0;
+
+		if(hasMaxImpulse)
+			currPtr += sizeof(Vec4V) * numNormalConstr;
+
+		currPtr += sizeof(Vec4V)*numFrictionConstr;	// Skip the friction applied forces.
+		// NOTE(review): solveContact4_StaticBlock locates the shared friction data before the friction applied forces; fd is unused here, so only the total offset matters.
+		SolverFrictionSharedData4* PX_RESTRICT fd = reinterpret_cast<SolverFrictionSharedData4*>(currPtr);
+		if(numFrictionConstr)
+			currPtr += sizeof(SolverFrictionSharedData4);
+		PX_UNUSED(fd);
+
+		SolverContactFrictionBase4* PX_RESTRICT frictions = reinterpret_cast<SolverContactFrictionBase4*>(currPtr);
+		currPtr += (numFrictionConstr * frictionSize);	// frictionSize is the caller-supplied byte stride of one friction row.
+
+		for(PxU32 i=0;i<numNormalConstr;i++)
+		{
+			SolverContactBatchPointBase4& c = *contacts;
+			contacts = reinterpret_cast<SolverContactBatchPointBase4*>((reinterpret_cast<PxU8*>(contacts)) + contactSize);	// Step by byte stride, not by sizeof(Base4).
+			c.biasedErr = V4Sub(c.biasedErr, c.scaledBias);
+		}	
+
+		for(PxU32 i=0;i<numFrictionConstr;i++)
+		{
+			SolverContactFrictionBase4& f = *frictions;
+			frictions = reinterpret_cast<SolverContactFrictionBase4*>((reinterpret_cast<PxU8*>(frictions)) + frictionSize);
+			f.scaledBias = f.targetVelocity;
+		}
+	}
+}
+
+void writeBackContact4_Block(const PxSolverConstraintDesc* PX_RESTRICT desc, SolverContext& cache,
+							 const PxSolverBodyData** PX_RESTRICT bd0, const PxSolverBodyData** PX_RESTRICT bd1)
+{	// Copies applied contact forces and friction-broken flags out of the batch stream and appends threshold-stream elements for qualifying pairs.
+	const PxU8* PX_RESTRICT last = desc[0].constraint + getConstraintLength(desc[0]);
+
+	//hopefully pointer aliasing doesn't bite.
+	PxU8* PX_RESTRICT currPtr = desc[0].constraint;
+	PxReal* PX_RESTRICT vForceWriteback0 = reinterpret_cast<PxReal*>(desc[0].writeBack);	// Per-constraint output cursors; may be null (checked before each store).
+	PxReal* PX_RESTRICT vForceWriteback1 = reinterpret_cast<PxReal*>(desc[1].writeBack);
+	PxReal* PX_RESTRICT vForceWriteback2 = reinterpret_cast<PxReal*>(desc[2].writeBack);
+	PxReal* PX_RESTRICT vForceWriteback3 = reinterpret_cast<PxReal*>(desc[3].writeBack);
+
+	const PxU8 type = *desc[0].constraint;	// First byte of the stream selects the row strides below.
+	const PxU32 contactSize = type == DY_SC_TYPE_BLOCK_RB_CONTACT ? sizeof(SolverContactBatchPointDynamic4) : sizeof(SolverContactBatchPointBase4);
+	const PxU32 frictionSize = type == DY_SC_TYPE_BLOCK_RB_CONTACT ? sizeof(SolverContactFrictionDynamic4) : sizeof(SolverContactFrictionBase4);
+
+
+	Vec4V normalForce = V4Zero();	// Per-lane sum of applied normal forces over every header in the stream.
+
+
+	//We'll need this.
+	//const Vec4V vZero	= V4Zero();
+
+	bool writeBackThresholds[4] = {false, false, false, false};
+
+	while((currPtr < last))
+	{
+		SolverContactHeader4* PX_RESTRICT hdr = reinterpret_cast<SolverContactHeader4*>(currPtr);
+		
+		currPtr = reinterpret_cast<PxU8*>(hdr + 1);		
+
+		const PxU32 numNormalConstr = hdr->numNormalConstr;
+		const PxU32	numFrictionConstr = hdr->numFrictionConstr;
+
+		Vec4V* PX_RESTRICT appliedForces = reinterpret_cast<Vec4V*>(currPtr);
+		currPtr += sizeof(Vec4V)*numNormalConstr;
+
+		//SolverContactBatchPointBase4* PX_RESTRICT contacts = (SolverContactBatchPointBase4*)currPtr;
+		currPtr += (numNormalConstr * contactSize);
+
+		bool hasMaxImpulse = (hdr->flag & SolverContactHeader4::eHAS_MAX_IMPULSE) != 0;
+
+		if(hasMaxImpulse)
+			currPtr += sizeof(Vec4V) * numNormalConstr;
+
+		SolverFrictionSharedData4* PX_RESTRICT fd = reinterpret_cast<SolverFrictionSharedData4*>(currPtr);
+		if(numFrictionConstr)
+			currPtr += sizeof(SolverFrictionSharedData4);
+
+		currPtr += sizeof(Vec4V)*numFrictionConstr;
+
+		//SolverContactFrictionBase4* PX_RESTRICT frictions = (SolverContactFrictionBase4*)currPtr;
+		currPtr += (numFrictionConstr * frictionSize);
+		// Overwritten on every header, so the values used after the loop come from the last header in the stream.
+		writeBackThresholds[0] = hdr->flags[0] & SolverContactHeader::eHAS_FORCE_THRESHOLDS;
+		writeBackThresholds[1] = hdr->flags[1] & SolverContactHeader::eHAS_FORCE_THRESHOLDS;
+		writeBackThresholds[2] = hdr->flags[2] & SolverContactHeader::eHAS_FORCE_THRESHOLDS;
+		writeBackThresholds[3] = hdr->flags[3] & SolverContactHeader::eHAS_FORCE_THRESHOLDS;
+
+
+		for(PxU32 i=0;i<numNormalConstr;i++)
+		{
+			//contacts = (SolverContactBatchPointBase4*)(((PxU8*)contacts) + contactSize);
+			const FloatV appliedForce0 = V4GetX(appliedForces[i]);
+			const FloatV appliedForce1 = V4GetY(appliedForces[i]);
+			const FloatV appliedForce2 = V4GetZ(appliedForces[i]);
+			const FloatV appliedForce3 = V4GetW(appliedForces[i]);
+
+			normalForce = V4Add(normalForce, appliedForces[i]);
+			// Each lane writes only its own first numNormalConstrN rows; rows beyond that count are not written back.
+			if(vForceWriteback0 && i < hdr->numNormalConstr0)
+				FStore(appliedForce0, vForceWriteback0++);
+			if(vForceWriteback1 && i < hdr->numNormalConstr1)
+				FStore(appliedForce1, vForceWriteback1++);
+			if(vForceWriteback2 && i < hdr->numNormalConstr2)
+				FStore(appliedForce2, vForceWriteback2++);
+			if(vForceWriteback3 && i < hdr->numNormalConstr3)
+				FStore(appliedForce3, vForceWriteback3++);
+		}	
+
+		if(numFrictionConstr)
+		{
+			PX_ALIGN(16, PxU32 broken[4]);
+			BStoreA(fd->broken, broken);	// Spill the per-lane broken mask to integers.
+
+			PxU8* frictionCounts = &hdr->numFrictionConstr0;	// Indexed [0..3] below, i.e. treated as four consecutive PxU8 counts starting at numFrictionConstr0.
+
+			for(PxU32 a = 0; a < 4; ++a)
+			{
+				if(frictionCounts[a] && broken[a])
+					*fd->frictionBrokenWritebackByte[a] = 1;	// PT: bad L2 miss here
+			}
+		}
+	}
+
+	PX_ALIGN(16, PxReal nf[4]);
+	V4StoreA(normalForce, nf);
+	// Shape interactions are read from the first header of the stream.
+	Sc::ShapeInteraction** shapeInteractions = reinterpret_cast<SolverContactHeader4*>(desc[0].constraint)->shapeInteraction;
+
+	for(PxU32 a = 0; a < 4; ++a)
+	{	// Emit an element only for non-link pairs with a non-zero force where at least one body has a threshold below PX_MAX_REAL.
+		if(writeBackThresholds[a] && desc[a].linkIndexA == PxSolverConstraintDesc::NO_LINK && desc[a].linkIndexB == PxSolverConstraintDesc::NO_LINK &&
+			nf[a] !=0.f && (bd0[a]->reportThreshold < PX_MAX_REAL  || bd1[a]->reportThreshold < PX_MAX_REAL))
+		{
+			ThresholdStreamElement elt;
+			elt.normalForce = nf[a];
+			elt.threshold = PxMin<float>(bd0[a]->reportThreshold, bd1[a]->reportThreshold);
+			elt.nodeIndexA = bd0[a]->nodeIndex;
+			elt.nodeIndexB = bd1[a]->nodeIndex;
+			elt.shapeInteraction = shapeInteractions[a];
+			Ps::order(elt.nodeIndexA, elt.nodeIndexB);	// The assert below expects A < B after this call.
+			PX_ASSERT(elt.nodeIndexA < elt.nodeIndexB);
+			PX_ASSERT(cache.mThresholdStreamIndex<cache.mThresholdStreamLength);	// Capacity is only asserted; callers flush the stream when it is nearly full.
+			cache.mThresholdStream[cache.mThresholdStreamIndex++] = elt;
+		}
+	}
+}
+
+static void solve1D4_Block(const PxSolverConstraintDesc* PX_RESTRICT desc, SolverContext& /*cache*/)
+{	// Solves a batch of four 1D constraint sets, updating both bodyA and bodyB of every descriptor.
+
+	PxSolverBody& b00 = *desc[0].bodyA;
+	PxSolverBody& b01 = *desc[0].bodyB;
+
+	PxSolverBody& b10 = *desc[1].bodyA;
+	PxSolverBody& b11 = *desc[1].bodyB;
+
+	PxSolverBody& b20 = *desc[2].bodyA;
+	PxSolverBody& b21 = *desc[2].bodyB;
+
+	PxSolverBody& b30 = *desc[3].bodyA;
+	PxSolverBody& b31 = *desc[3].bodyB;
+
+	PxU8* PX_RESTRICT bPtr = desc[0].constraint;	// Constraint data for the whole batch comes from desc[0].
+	//PxU32 length = desc.constraintLength;
+
+	SolverConstraint1DHeader4* PX_RESTRICT  header = reinterpret_cast<SolverConstraint1DHeader4*>(bPtr);
+	SolverConstraint1DDynamic4* PX_RESTRICT base = reinterpret_cast<SolverConstraint1DDynamic4*>(header+1);	// Rows start right after the header.
+
+	//const FloatV fZero = FZero();
+	Vec4V linVel00 = V4LoadA(&b00.linearVelocity.x);
+	Vec4V linVel01 = V4LoadA(&b01.linearVelocity.x);
+	Vec4V angState00 = V4LoadA(&b00.angularState.x);
+	Vec4V angState01 = V4LoadA(&b01.angularState.x);
+
+	Vec4V linVel10 = V4LoadA(&b10.linearVelocity.x);
+	Vec4V linVel11 = V4LoadA(&b11.linearVelocity.x);
+	Vec4V angState10 = V4LoadA(&b10.angularState.x);
+	Vec4V angState11 = V4LoadA(&b11.angularState.x);
+
+	Vec4V linVel20 = V4LoadA(&b20.linearVelocity.x);
+	Vec4V linVel21 = V4LoadA(&b21.linearVelocity.x);
+	Vec4V angState20 = V4LoadA(&b20.angularState.x);
+	Vec4V angState21 = V4LoadA(&b21.angularState.x);
+
+	Vec4V linVel30 = V4LoadA(&b30.linearVelocity.x);
+	Vec4V linVel31 = V4LoadA(&b31.linearVelocity.x);
+	Vec4V angState30 = V4LoadA(&b30.angularState.x);
+	Vec4V angState31 = V4LoadA(&b31.angularState.x);
+
+
+	Vec4V linVel0T0, linVel0T1, linVel0T2, linVel0T3;
+	Vec4V linVel1T0, linVel1T1, linVel1T2, linVel1T3;
+	Vec4V angState0T0, angState0T1, angState0T2, angState0T3;
+	Vec4V angState1T0, angState1T1, angState1T2, angState1T3;
+
+	// Transpose to per-component vectors; T0/T1/T2 pair with the X/Y/Z row data below, T3 is carried through unchanged.
+	PX_TRANSPOSE_44(linVel00, linVel10, linVel20, linVel30, linVel0T0, linVel0T1, linVel0T2, linVel0T3);
+	PX_TRANSPOSE_44(linVel01, linVel11, linVel21, linVel31, linVel1T0, linVel1T1, linVel1T2, linVel1T3);
+	PX_TRANSPOSE_44(angState00, angState10, angState20, angState30, angState0T0, angState0T1, angState0T2, angState0T3);
+	PX_TRANSPOSE_44(angState01, angState11, angState21, angState31, angState1T0, angState1T1, angState1T2, angState1T3);
+
+	const Vec4V	invMass0D0 = header->invMass0D0;
+	const Vec4V	invMass1D1 = header->invMass1D1;
+
+	const Vec4V	angD0 = header->angD0;
+	const Vec4V	angD1 = header->angD1;
+
+	PxU32 maxConstraints = header->count;
+
+	for(PxU32 a = 0; a < maxConstraints; ++a)
+	{
+		SolverConstraint1DDynamic4& c = *base;
+		base++;	// base now points at the next row, which is what gets prefetched.
+
+		Ps::prefetchLine(base);
+		Ps::prefetchLine(base, 64);
+		Ps::prefetchLine(base, 128);
+		Ps::prefetchLine(base, 192);
+		Ps::prefetchLine(base, 256);
+		
+		const Vec4V appliedForce = c.appliedForce;
+		// Project both bodies' linear and angular state onto the row's axes.
+		Vec4V linProj0(V4Mul(c.lin0X, linVel0T0));
+		Vec4V linProj1(V4Mul(c.lin1X, linVel1T0));
+		Vec4V angProj0(V4Mul(c.ang0X, angState0T0));
+		Vec4V angProj1(V4Mul(c.ang1X, angState1T0));
+
+		linProj0 = V4MulAdd(c.lin0Y, linVel0T1, linProj0);
+		linProj1 = V4MulAdd(c.lin1Y, linVel1T1, linProj1);
+		angProj0 = V4MulAdd(c.ang0Y, angState0T1, angProj0);
+		angProj1 = V4MulAdd(c.ang1Y, angState1T1, angProj1);
+		
+		linProj0 = V4MulAdd(c.lin0Z, linVel0T2, linProj0);
+		linProj1 = V4MulAdd(c.lin1Z, linVel1T2, linProj1);
+		angProj0 = V4MulAdd(c.ang0Z, angState0T2, angProj0);
+		angProj1 = V4MulAdd(c.ang1Z, angState1T2, angProj1);
+
+		const Vec4V projectVel0 = V4Add(linProj0, angProj0);
+		const Vec4V projectVel1 = V4Add(linProj1, angProj1);
+		
+		const Vec4V normalVel = V4Sub(projectVel0, projectVel1);	// Body 0 projection minus body 1 projection.
+		// appliedForce * impulseMultiplier + normalVel * velMultiplier + constant, then clamped to [minImpulse, maxImpulse].
+		const Vec4V unclampedForce = V4MulAdd(appliedForce, c.impulseMultiplier, V4MulAdd(normalVel, c.velMultiplier, c.constant));
+		const Vec4V clampedForce = V4Max(c.minImpulse, V4Min(c.maxImpulse, unclampedForce));
+		const Vec4V deltaF = V4Sub(clampedForce, appliedForce);
+		c.appliedForce = clampedForce;	// Persist the new accumulated force in the row.
+
+		const Vec4V deltaFInvMass0 = V4Mul(deltaF, invMass0D0);
+		const Vec4V deltaFInvMass1 = V4Mul(deltaF, invMass1D1);
+
+		const Vec4V angDeltaFInvMass0 = V4Mul(deltaF, angD0);
+		const Vec4V angDeltaFInvMass1 = V4Mul(deltaF, angD1);
+		// Body 0 uses V4MulAdd, body 1 uses V4NegMulSub (presumably c - a*b, i.e. the opposite sign — TODO confirm).
+		linVel0T0 = V4MulAdd(c.lin0X, deltaFInvMass0, linVel0T0);
+		linVel1T0 = V4NegMulSub(c.lin1X, deltaFInvMass1, linVel1T0);
+		angState0T0 = V4MulAdd(c.ang0X, angDeltaFInvMass0, angState0T0);
+		angState1T0 = V4NegMulSub(c.ang1X, angDeltaFInvMass1, angState1T0);
+
+		linVel0T1 = V4MulAdd(c.lin0Y, deltaFInvMass0, linVel0T1);
+		linVel1T1 = V4NegMulSub(c.lin1Y, deltaFInvMass1, linVel1T1);
+		angState0T1 = V4MulAdd(c.ang0Y, angDeltaFInvMass0, angState0T1);
+		angState1T1 = V4NegMulSub(c.ang1Y, angDeltaFInvMass1, angState1T1);
+
+		linVel0T2 = V4MulAdd(c.lin0Z, deltaFInvMass0, linVel0T2);
+		linVel1T2 = V4NegMulSub(c.lin1Z, deltaFInvMass1, linVel1T2);
+		angState0T2 = V4MulAdd(c.ang0Z, angDeltaFInvMass0, angState0T2);
+		angState1T2 = V4NegMulSub(c.ang1Z, angDeltaFInvMass1, angState1T2);
+	}
+	// Transpose back to one vector per body.
+	PX_TRANSPOSE_44(linVel0T0, linVel0T1, linVel0T2, linVel0T3, linVel00, linVel10, linVel20, linVel30);
+	PX_TRANSPOSE_44(linVel1T0, linVel1T1, linVel1T2, linVel1T3, linVel01, linVel11, linVel21, linVel31);
+	PX_TRANSPOSE_44(angState0T0, angState0T1, angState0T2, angState0T3, angState00, angState10, angState20, angState30);
+	PX_TRANSPOSE_44(angState1T0, angState1T1, angState1T2, angState1T3, angState01, angState11, angState21, angState31);
+
+
+	// Write back
+	V4StoreA(linVel00, &b00.linearVelocity.x);
+	V4StoreA(linVel10, &b10.linearVelocity.x);
+	V4StoreA(linVel20, &b20.linearVelocity.x);
+	V4StoreA(linVel30, &b30.linearVelocity.x);
+
+	V4StoreA(linVel01, &b01.linearVelocity.x);
+	V4StoreA(linVel11, &b11.linearVelocity.x);
+	V4StoreA(linVel21, &b21.linearVelocity.x);
+	V4StoreA(linVel31, &b31.linearVelocity.x);
+
+	V4StoreA(angState00, &b00.angularState.x);
+	V4StoreA(angState10, &b10.angularState.x);
+	V4StoreA(angState20, &b20.angularState.x);
+	V4StoreA(angState30, &b30.angularState.x);
+
+	V4StoreA(angState01, &b01.angularState.x);
+	V4StoreA(angState11, &b11.angularState.x);
+	V4StoreA(angState21, &b21.angularState.x);
+	V4StoreA(angState31, &b31.angularState.x);
+	
+}
+
+static void conclude1D4_Block(const PxSolverConstraintDesc* PX_RESTRICT desc, SolverContext& /*cache*/)
+{	// Replaces each row's 'constant' with its 'unbiasedConstant' for every row in the batch.
+	SolverConstraint1DHeader4* header = reinterpret_cast<SolverConstraint1DHeader4*>(desc[0].constraint);
+	PxU8* base = desc[0].constraint + sizeof(SolverConstraint1DHeader4);	// First row follows the header.
+	PxU32 stride = header->type == DY_SC_TYPE_BLOCK_1D ? sizeof(SolverConstraint1DDynamic4) : sizeof(SolverConstraint1DBase4);	// Row size depends on the header type.
+
+	for(PxU32 i=0; i<header->count; i++)
+	{
+		SolverConstraint1DBase4& c = *reinterpret_cast<SolverConstraint1DBase4*>(base);	// Rows are accessed through the Base4 layout regardless of stride.
+		c.constant = c.unbiasedConstant;
+		base += stride;
+	}
+	PX_ASSERT(desc[0].constraint + getConstraintLength(desc[0]) == base);	// The rows must end exactly at the end of the constraint data.
+}
+
+void writeBack1D4(const PxSolverConstraintDesc* PX_RESTRICT desc, SolverContext& /*cache*/,
+							 const PxSolverBodyData** PX_RESTRICT /*bd0*/, const PxSolverBodyData** PX_RESTRICT /*bd1*/)
+{	// Accumulates the applied impulses of a batch of four 1D constraint sets and stores linear/angular impulse and broken state to each non-null writeback.
+	ConstraintWriteback* writeback0 = reinterpret_cast<ConstraintWriteback*>(desc[0].writeBack);
+	ConstraintWriteback* writeback1 = reinterpret_cast<ConstraintWriteback*>(desc[1].writeBack);
+	ConstraintWriteback* writeback2 = reinterpret_cast<ConstraintWriteback*>(desc[2].writeBack);
+	ConstraintWriteback* writeback3 = reinterpret_cast<ConstraintWriteback*>(desc[3].writeBack);
+
+	if(writeback0 || writeback1 || writeback2 || writeback3)	// Nothing is computed when no descriptor has a writeback target.
+	{
+		SolverConstraint1DHeader4* header = reinterpret_cast<SolverConstraint1DHeader4*>(desc[0].constraint);
+		PxU8* base = desc[0].constraint + sizeof(SolverConstraint1DHeader4);
+		PxU32 stride = header->type == DY_SC_TYPE_BLOCK_1D ? sizeof(SolverConstraint1DDynamic4) : sizeof(SolverConstraint1DBase4);
+
+		const Vec4V zero = V4Zero();
+		Vec4V linX(zero), linY(zero), linZ(zero); 
+		Vec4V angX(zero), angY(zero), angZ(zero); 
+
+		for(PxU32 i=0; i<header->count; i++)
+		{
+			const SolverConstraint1DBase4* c = reinterpret_cast<SolverConstraint1DBase4*>(base);
+
+			//Load in flags
+			const VecI32V flags = I4LoadU(reinterpret_cast<const PxI32*>(&c->flags[0]));
+			//Work out masks
+			const VecI32V mask = I4Load(DY_SC_FLAG_OUTPUT_FORCE);
+
+			const VecI32V masked = VecI32V_And(flags, mask);
+			const BoolV isEq = VecI32V_IsEq(masked, mask);
+			// Lanes without DY_SC_FLAG_OUTPUT_FORCE contribute zero; presumably V4Sel picks its second argument where the mask is set — TODO confirm.
+			const Vec4V appliedForce = V4Sel(isEq, c->appliedForce, zero);
+
+			linX = V4MulAdd(c->lin0X, appliedForce, linX);
+			linY = V4MulAdd(c->lin0Y, appliedForce, linY);
+			linZ = V4MulAdd(c->lin0Z, appliedForce, linZ);
+
+			angX = V4MulAdd(c->ang0WritebackX, appliedForce, angX);
+			angY = V4MulAdd(c->ang0WritebackY, appliedForce, angY);
+			angZ = V4MulAdd(c->ang0WritebackZ, appliedForce, angZ);
+
+			base += stride;
+		}
+
+		//We need to do the cross product now
+		// Assuming V4NegMulSub(a,b,c) is c - a*b, this subtracts (body0WorkOffset x lin) from ang — TODO confirm.
+		angX = V4Sub(angX, V4NegMulSub(header->body0WorkOffsetZ, linY, V4Mul(header->body0WorkOffsetY, linZ)));
+		angY = V4Sub(angY, V4NegMulSub(header->body0WorkOffsetX, linZ, V4Mul(header->body0WorkOffsetZ, linX)));
+		angZ = V4Sub(angZ, V4NegMulSub(header->body0WorkOffsetY, linX, V4Mul(header->body0WorkOffsetX, linY)));
+
+		const Vec4V linLenSq = V4MulAdd(linZ, linZ, V4MulAdd(linY, linY, V4Mul(linX, linX)));
+		const Vec4V angLenSq = V4MulAdd(angZ, angZ, V4MulAdd(angY, angY, V4Mul(angX, angX)));
+
+		const Vec4V linLen = V4Sqrt(linLenSq);
+		const Vec4V angLen = V4Sqrt(angLenSq);
+		// A lane is broken when either impulse magnitude exceeds its break impulse from the header.
+		const BoolV broken = BOr(V4IsGrtr(linLen, header->linBreakImpulse), V4IsGrtr(angLen, header->angBreakImpulse));
+
+		PX_ALIGN(16, PxU32 iBroken[4]);
+		BStoreA(broken, iBroken);
+
+		Vec4V lin0, lin1, lin2, lin3;
+		Vec4V ang0, ang1, ang2, ang3;
+		// Convert the per-component accumulators back into one vector per constraint.
+		PX_TRANSPOSE_34_44(linX, linY, linZ, lin0, lin1, lin2, lin3);
+		PX_TRANSPOSE_34_44(angX, angY, angZ, ang0, ang1, ang2, ang3);
+
+		if(writeback0)
+		{
+			V3StoreU(Vec3V_From_Vec4V_WUndefined(lin0), writeback0->linearImpulse);
+			V3StoreU(Vec3V_From_Vec4V_WUndefined(ang0), writeback0->angularImpulse);
+			writeback0->broken = header->break0 ? PxU32(iBroken[0] != 0) : 0;	// Broken is only reported when the header's break flag for this lane is set.
+		}
+		if(writeback1)
+		{
+			V3StoreU(Vec3V_From_Vec4V_WUndefined(lin1), writeback1->linearImpulse);
+			V3StoreU(Vec3V_From_Vec4V_WUndefined(ang1), writeback1->angularImpulse);
+			writeback1->broken = header->break1 ? PxU32(iBroken[1] != 0) : 0;
+		}
+		if(writeback2)
+		{
+			V3StoreU(Vec3V_From_Vec4V_WUndefined(lin2), writeback2->linearImpulse);
+			V3StoreU(Vec3V_From_Vec4V_WUndefined(ang2), writeback2->angularImpulse);
+			writeback2->broken = header->break2 ? PxU32(iBroken[2] != 0) : 0;
+		}
+		if(writeback3)
+		{
+			V3StoreU(Vec3V_From_Vec4V_WUndefined(lin3), writeback3->linearImpulse);
+			V3StoreU(Vec3V_From_Vec4V_WUndefined(ang3), writeback3->angularImpulse);
+			writeback3->broken = header->break3 ? PxU32(iBroken[3] != 0) : 0;
+		}
+
+		PX_ASSERT(desc[0].constraint + getConstraintLength(desc[0]) == base);	// The rows must end exactly at the end of the constraint data.
+	}
+}
+
+
+void solveContactPreBlock(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 /*constraintCount*/, SolverContext& cache)
+{	// Forwards the batch to solveContact4_Block; the constraintCount argument is ignored.
+	solveContact4_Block(desc, cache);
+}
+
+void solveContactPreBlock_Static(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32  /*constraintCount*/, SolverContext& cache)
+{	// Forwards the batch to solveContact4_StaticBlock (which only updates bodyA); the constraintCount argument is ignored.
+	solveContact4_StaticBlock(desc, cache);
+}
+
+void solveContactPreBlock_Conclude(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32  /*constraintCount*/, SolverContext& cache)
+{	// Solves the batch, then runs the conclude pass using the Dynamic4 row strides.
+	solveContact4_Block(desc, cache);
+	concludeContact4_Block(desc, cache, sizeof(SolverContactBatchPointDynamic4), sizeof(SolverContactFrictionDynamic4));
+}
+
+void solveContactPreBlock_ConcludeStatic(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32  /*constraintCount*/, SolverContext& cache)
+{	// Solves the batch with the bodyA-only solver, then runs the conclude pass using the Base4 row strides.
+	solveContact4_StaticBlock(desc, cache);
+	concludeContact4_Block(desc, cache, sizeof(SolverContactBatchPointBase4), sizeof(SolverContactFrictionBase4));
+}
+
+void solveContactPreBlock_WriteBack(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32  /*constraintCount*/, SolverContext& cache)
+{	// Solves the batch, writes back forces/threshold elements, and flushes the local threshold stream when it is nearly full.
+	solveContact4_Block(desc, cache);
+	// Body data for both bodies of each of the four constraints, looked up by index in cache.solverBodyArray.
+	const PxSolverBodyData* bd0[4] = {	&cache.solverBodyArray[desc[0].bodyADataIndex], 
+										&cache.solverBodyArray[desc[1].bodyADataIndex],
+										&cache.solverBodyArray[desc[2].bodyADataIndex],
+										&cache.solverBodyArray[desc[3].bodyADataIndex]};
+
+	const PxSolverBodyData* bd1[4] = {	&cache.solverBodyArray[desc[0].bodyBDataIndex], 
+										&cache.solverBodyArray[desc[1].bodyBDataIndex],
+										&cache.solverBodyArray[desc[2].bodyBDataIndex],
+										&cache.solverBodyArray[desc[3].bodyBDataIndex]};
+
+	writeBackContact4_Block(desc, cache, bd0, bd1);
+	// Flush once fewer than 4 slots remain (one write-back call can append up to 4 elements). NOTE(review): if these are unsigned and the length is < 4 the subtraction would wrap — confirm.
+	if(cache.mThresholdStreamIndex > (cache.mThresholdStreamLength - 4))
+	{
+		//Write back to global buffer
+		PxI32 threshIndex = physx::shdfnd::atomicAdd(cache.mSharedOutThresholdPairs, PxI32(cache.mThresholdStreamIndex)) - PxI32(cache.mThresholdStreamIndex);	// Presumably atomicAdd returns the new total, so this is the start of the reserved range — TODO confirm.
+		for(PxU32 a = 0; a < cache.mThresholdStreamIndex; ++a)
+		{
+			cache.mSharedThresholdStream[a + threshIndex] = cache.mThresholdStream[a];
+		}
+		cache.mThresholdStreamIndex = 0;	// Local stream is empty again.
+	}
+}
+
+void solveContactPreBlock_WriteBackStatic(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32  /*constraintCount*/, SolverContext& cache)
+{	// Same as solveContactPreBlock_WriteBack but solves with solveContact4_StaticBlock (bodyA only).
+	solveContact4_StaticBlock(desc, cache);
+	const PxSolverBodyData* bd0[4] = {	&cache.solverBodyArray[desc[0].bodyADataIndex], 
+										&cache.solverBodyArray[desc[1].bodyADataIndex],
+										&cache.solverBodyArray[desc[2].bodyADataIndex],
+										&cache.solverBodyArray[desc[3].bodyADataIndex]};
+	// bodyB data is still looked up because writeBackContact4_Block reads reportThreshold and nodeIndex from both bodies.
+	const PxSolverBodyData* bd1[4] = {	&cache.solverBodyArray[desc[0].bodyBDataIndex], 
+										&cache.solverBodyArray[desc[1].bodyBDataIndex],
+										&cache.solverBodyArray[desc[2].bodyBDataIndex],
+										&cache.solverBodyArray[desc[3].bodyBDataIndex]};
+
+	writeBackContact4_Block(desc, cache, bd0, bd1);
+	// Flush once fewer than 4 slots remain (one write-back call can append up to 4 elements). NOTE(review): if these are unsigned and the length is < 4 the subtraction would wrap — confirm.
+	if(cache.mThresholdStreamIndex > (cache.mThresholdStreamLength - 4))
+	{
+		//Write back to global buffer
+		PxI32 threshIndex = physx::shdfnd::atomicAdd(cache.mSharedOutThresholdPairs, PxI32(cache.mThresholdStreamIndex)) - PxI32(cache.mThresholdStreamIndex);	// Presumably atomicAdd returns the new total, so this is the start of the reserved range — TODO confirm.
+		for(PxU32 a = 0; a < cache.mThresholdStreamIndex; ++a)
+		{
+			cache.mSharedThresholdStream[a + threshIndex] = cache.mThresholdStream[a];
+		}
+		cache.mThresholdStreamIndex = 0;	// Local stream is empty again.
+	}
+}
+
+void solve1D4_Block(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32  /*constraintCount*/, SolverContext& cache)
+{	// Three-argument entry point; calls the two-argument solve1D4_Block overload and ignores constraintCount.
+	solve1D4_Block(desc, cache);
+}
+
+
+void solve1D4Block_Conclude(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32  /*constraintCount*/, SolverContext& cache)
+{	// Solves the 1D batch, then runs conclude1D4_Block, which swaps each row's constant for its unbiased constant.
+	solve1D4_Block(desc, cache);
+	conclude1D4_Block(desc, cache);
+}
+
+
+void solve1D4Block_WriteBack(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32  /*constraintCount*/, SolverContext& cache)
+{	// Solves the 1D batch, then writes impulses and broken state back through writeBack1D4.
+	solve1D4_Block(desc, cache);
+	// NOTE(review): writeBack1D4 leaves both body-data parameters unnamed, so these lookups only satisfy its signature.
+	const PxSolverBodyData* bd0[4] = {	&cache.solverBodyArray[desc[0].bodyADataIndex], 
+										&cache.solverBodyArray[desc[1].bodyADataIndex],
+										&cache.solverBodyArray[desc[2].bodyADataIndex],
+										&cache.solverBodyArray[desc[3].bodyADataIndex]};
+
+	const PxSolverBodyData* bd1[4] = {	&cache.solverBodyArray[desc[0].bodyBDataIndex], 
+										&cache.solverBodyArray[desc[1].bodyBDataIndex],
+										&cache.solverBodyArray[desc[2].bodyBDataIndex],
+										&cache.solverBodyArray[desc[3].bodyBDataIndex]};
+
+	writeBack1D4(desc, cache, bd0, bd1);
+}
+
+void writeBack1D4Block(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32  /*constraintCount*/, SolverContext& cache)
+{	// Write-back only (no solve): gathers the body-data pointers and calls writeBack1D4; constraintCount is ignored.
+	const PxSolverBodyData* bd0[4] = {	&cache.solverBodyArray[desc[0].bodyADataIndex], 
+										&cache.solverBodyArray[desc[1].bodyADataIndex],
+										&cache.solverBodyArray[desc[2].bodyADataIndex],
+										&cache.solverBodyArray[desc[3].bodyADataIndex]};
+	// NOTE(review): writeBack1D4 leaves both body-data parameters unnamed, so these lookups only satisfy its signature.
+	const PxSolverBodyData* bd1[4] = {	&cache.solverBodyArray[desc[0].bodyBDataIndex], 
+										&cache.solverBodyArray[desc[1].bodyBDataIndex],
+										&cache.solverBodyArray[desc[2].bodyBDataIndex],
+										&cache.solverBodyArray[desc[3].bodyBDataIndex]};
+
+	writeBack1D4(desc, cache, bd0, bd1);
+}
+
+}
+
+}
+
+#endif
diff --git a/PhysX_3.4/Source/LowLevelDynamics/src/DySolverConstraintsShared.h b/PhysX_3.4/Source/LowLevelDynamics/src/DySolverConstraintsShared.h
new file mode 100644
index 00000000..13c8a0e2
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelDynamics/src/DySolverConstraintsShared.h
@@ -0,0 +1,221 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 
+
+#ifndef DY_SOLVER_CORE_SHARED_H
+#define DY_SOLVER_CORE_SHARED_H
+
+#include "foundation/PxPreprocessor.h"
+#include "PsVecMath.h"
+
+#ifdef PX_SUPPORT_SIMD
+
+#include "CmPhysXCommon.h"
+#include "DySolverBody.h"
+#include "DySolverContact.h"
+#include "DySolverConstraint1D.h"
+#include "DySolverConstraintDesc.h"
+#include "PsUtilities.h"
+#include "DyConstraint.h"
+#include "PsAtomic.h"
+
+
+namespace physx
+{
+
+namespace Dy
+{
+	PX_FORCE_INLINE static FloatV solveDynamicContacts(SolverContactPoint* contacts, const PxU32 nbContactPoints, const Vec3VArg contactNormal,
+	const FloatVArg invMassA, const FloatVArg invMassB, const FloatVArg angDom0, const FloatVArg angDom1, Vec3V& linVel0_, Vec3V& angState0_, 
+	Vec3V& linVel1_, Vec3V& angState1_, PxF32* PX_RESTRICT forceBuffer)
+{
+	// Solves the normal constraints for a run of contact points that share one
+	// contact normal, between two dynamic bodies. Works on local copies of the
+	// velocities and stores them back through the references after the loop.
+	Vec3V linA = linVel0_;
+	Vec3V angA = angState0_;
+	Vec3V linB = linVel1_;
+	Vec3V angB = angState1_;
+
+	// Contact normal scaled by each body's inverse mass term.
+	const Vec3V linResponseA = V3Scale(contactNormal, invMassA);
+	const Vec3V linResponseB = V3Scale(contactNormal, invMassB);
+
+	FloatV impulseSum = FZero();
+	for(PxU32 idx = 0; idx < nbContactPoints; ++idx)
+	{
+		SolverContactPoint& point = contacts[idx];
+		Ps::prefetchLine(&contacts[idx], 128);
+
+		const Vec3V angDirA = point.raXn;
+		const Vec3V angDirB = point.rbXn;
+
+		// forceBuffer[idx] is the value accumulated so far for this contact.
+		const FloatV prevImpulse = FLoad(forceBuffer[idx]);
+		const FloatV velScale = point.getVelMultiplier();
+		const FloatV impulseCap = point.getMaxImpulse();
+		const FloatV bias = point.getBiasedErr();
+
+		// Relative velocity of the two bodies along the constraint direction.
+		const Vec3V velA = V3MulAdd(linA, contactNormal, V3Mul(angA, angDirA));
+		const Vec3V velB = V3MulAdd(linB, contactNormal, V3Mul(angB, angDirB));
+		const FloatV relNormalVel = V3SumElems(V3Sub(velA, velB));
+
+		// Lower-bound the change by -prevImpulse, then cap the new accumulated
+		// value at the contact's maximum and derive the change actually applied.
+		const FloatV rawDelta = FMax(FNegScaleSub(relNormalVel, velScale, bias), FNeg(prevImpulse));
+		const FloatV cappedImpulse = FMin(FAdd(prevImpulse, rawDelta), impulseCap);
+		const FloatV appliedDelta = FSub(cappedImpulse, prevImpulse);
+
+		// Body A and body B are updated with opposite signs.
+		linA = V3ScaleAdd(linResponseA, appliedDelta, linA);
+		linB = V3NegScaleSub(linResponseB, appliedDelta, linB);
+		angA = V3ScaleAdd(angDirA, FMul(appliedDelta, angDom0), angA);
+		angB = V3NegScaleSub(angDirB, FMul(appliedDelta, angDom1), angB);
+
+		FStore(cappedImpulse, &forceBuffer[idx]);
+		impulseSum = FAdd(impulseSum, cappedImpulse);
+	}
+
+	linVel0_ = linA;
+	angState0_ = angA;
+	linVel1_ = linB;
+	angState1_ = angB;
+	return impulseSum;
+}
+
+PX_FORCE_INLINE static FloatV solveStaticContacts(SolverContactPoint* contacts, const PxU32 nbContactPoints, const Vec3VArg contactNormal,
+	const FloatVArg invMassA, const FloatVArg angDom0, Vec3V& linVel0_, Vec3V& angState0_, PxF32* PX_RESTRICT forceBuffer)
+{
+	// Single-body variant of the contact solve: only body 0's velocities take
+	// part, so no second-body terms appear in the velocity or in the updates.
+	// Velocities are iterated locally and written back after the loop.
+	Vec3V linA = linVel0_;
+	Vec3V angA = angState0_;
+
+	// Contact normal scaled by body 0's inverse mass term.
+	const Vec3V linResponseA = V3Scale(contactNormal, invMassA);
+
+	FloatV impulseSum = FZero();
+	for(PxU32 idx = 0; idx < nbContactPoints; ++idx)
+	{
+		SolverContactPoint& point = contacts[idx];
+		Ps::prefetchLine(&contacts[idx], 128);
+
+		const Vec3V angDirA = point.raXn;
+
+		// forceBuffer[idx] is the value accumulated so far for this contact.
+		const FloatV prevImpulse = FLoad(forceBuffer[idx]);
+		const FloatV velScale = point.getVelMultiplier();
+		const FloatV impulseCap = point.getMaxImpulse();
+		const FloatV bias = point.getBiasedErr();
+
+		// Velocity of body 0 along the constraint direction.
+		const Vec3V velA = V3MulAdd(linA, contactNormal, V3Mul(angA, angDirA));
+		const FloatV normalVel = V3SumElems(velA);
+
+		// Lower-bound the change by -prevImpulse, then cap the new accumulated
+		// value at the contact's maximum and derive the change actually applied.
+		const FloatV rawDelta = FMax(FNegScaleSub(normalVel, velScale, bias), FNeg(prevImpulse));
+		const FloatV cappedImpulse = FMin(FAdd(prevImpulse, rawDelta), impulseCap);
+		const FloatV appliedDelta = FSub(cappedImpulse, prevImpulse);
+
+		linA = V3ScaleAdd(linResponseA, appliedDelta, linA);
+		angA = V3ScaleAdd(angDirA, FMul(appliedDelta, angDom0), angA);
+
+		FStore(cappedImpulse, &forceBuffer[idx]);
+		impulseSum = FAdd(impulseSum, cappedImpulse);
+	}
+
+	linVel0_ = linA;
+	angState0_ = angA;
+	// Sum of the updated per-contact values stored in forceBuffer.
+	return impulseSum;
+}
+
+PX_FORCE_INLINE static FloatV solveExtContacts(SolverContactPointExt* contacts, const PxU32 nbContactPoints, const Vec3VArg contactNormal,
+		Vec3V& linVel0, Vec3V& angVel0, 
+		Vec3V& linVel1, Vec3V& angVel1, 
+		Vec3V& li0, Vec3V& ai0,
+		Vec3V& li1, Vec3V& ai1, 
+		PxF32* PX_RESTRICT appliedForceBuffer)
+{
+	// Contact solve for extended contact points (SolverContactPointExt). The
+	// velocity response of each body is read per contact from linDeltaVA/B and
+	// angDeltaVA/B, and the applied change is also accumulated into li0/ai0
+	// and li1/ai1. All velocity and impulse references are updated in place.
+	// NOTE(review): unlike solveDynamicContacts/solveStaticContacts, no clamp
+	// against getMaxImpulse() is applied here - confirm this is intended.
+	FloatV impulseSum = FZero();
+	for(PxU32 idx = 0; idx < nbContactPoints; ++idx)
+	{
+		SolverContactPointExt& point = contacts[idx];
+		// Prefetch the address of the next contact point.
+		Ps::prefetchLine(&contacts[idx + 1]);
+
+		const Vec3V angDirA = point.raXn;
+		const Vec3V angDirB = point.rbXn;
+
+		// appliedForceBuffer[idx] is the value accumulated so far for this contact.
+		const FloatV prevImpulse = FLoad(appliedForceBuffer[idx]);
+		const FloatV velScale = point.getVelMultiplier();
+		const FloatV bias = point.getBiasedErr();
+
+		// Relative velocity of the two bodies along the constraint direction.
+		const Vec3V velA = V3MulAdd(linVel0, contactNormal, V3Mul(angVel0, angDirA));
+		const Vec3V velB = V3MulAdd(linVel1, contactNormal, V3Mul(angVel1, angDirB));
+		const FloatV relNormalVel = V3SumElems(V3Sub(velA, velB));
+
+		// Change to apply, bounded below by -prevImpulse.
+		const FloatV delta = FMax(FNegScaleSub(relNormalVel, velScale, bias), FNeg(prevImpulse));
+
+		linVel0 = V3ScaleAdd(point.linDeltaVA, delta, linVel0);
+		angVel0 = V3ScaleAdd(point.angDeltaVA, delta, angVel0);
+		linVel1 = V3ScaleAdd(point.linDeltaVB, delta, linVel1);
+		angVel1 = V3ScaleAdd(point.angDeltaVB, delta, angVel1);
+
+		li0 = V3ScaleAdd(contactNormal, delta, li0);
+		ai0 = V3ScaleAdd(angDirA, delta, ai0);
+		li1 = V3ScaleAdd(contactNormal, delta, li1);
+		ai1 = V3ScaleAdd(angDirB, delta, ai1);
+
+		const FloatV newImpulse = FAdd(prevImpulse, delta);
+		FStore(newImpulse, &appliedForceBuffer[idx]);
+		impulseSum = FAdd(impulseSum, newImpulse);
+	}
+	return impulseSum;
+}
+
+}
+
+}
+
+#endif //PX_SUPPORT_SIMD
+
+#endif //DY_SOLVER_CORE_SHARED_H
+
diff --git a/PhysX_3.4/Source/LowLevelDynamics/src/DySolverContact.h b/PhysX_3.4/Source/LowLevelDynamics/src/DySolverContact.h
new file mode 100644
index 00000000..f204633c
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelDynamics/src/DySolverContact.h
@@ -0,0 +1,228 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+
+#ifndef DY_SOLVERCONTACT_H
+#define DY_SOLVERCONTACT_H
+
+#include "foundation/PxSimpleTypes.h"
+#include "foundation/PxVec3.h"
+#include "PxvConfig.h"
+#include "PsVecMath.h"
+
+namespace physx
+{
+
+using namespace Ps::aos;
+
+namespace Sc
+{
+	class ShapeInteraction;
+}
+/**
+\brief A header to represent a friction patch for the solver.
+*/
+
+namespace Dy
+{
+
+struct SolverContactHeader	// Solver-side header for one friction patch; its size is checked by the asserts after the struct.
+{
+	enum DySolverContactFlags	// NOTE(review): presumably bit values for the 'flags' byte below - confirm.
+	{
+		eHAS_FORCE_THRESHOLDS = 0x1
+	};
+	// Trailing numbers are the running byte offset after each member; where two are given they are for 32-bit then 64-bit pointers.
+	PxU8	type;					//Note: type must be the first member as the solver expects a type in the first byte.
+	PxU8	flags;	
+	PxU8	numNormalConstr;
+	PxU8	numFrictionConstr;					//4
+
+	PxReal	angDom0;							//8
+	PxReal	angDom1;							//12
+	PxReal	invMass0;							//16
+
+	Vec4V   staticFrictionX_dynamicFrictionY_dominance0Z_dominance1W;		//32 - packed: X=static friction, Y=dynamic friction, Z=dominance0, W=dominance1
+	Vec3V	normal;															//48
+
+	PxReal	invMass1;														//52
+	PxU32 broken;															//56
+	PxU8* frictionBrokenWritebackByte;										//60	64
+	Sc::ShapeInteraction* shapeInteraction;									//64	72
+#if PX_P64_FAMILY
+	PxU32	pad[2];															//64	80 - padding present only when PX_P64_FAMILY is set
+#endif // PX_P64_FAMILY
+
+	// FloatV setters: replace one lane of the packed vector via V4SetX/Y/Z/W and store the result back.
+	PX_FORCE_INLINE void setStaticFriction(const FloatV f)	{staticFrictionX_dynamicFrictionY_dominance0Z_dominance1W=V4SetX(staticFrictionX_dynamicFrictionY_dominance0Z_dominance1W,f);}
+	PX_FORCE_INLINE void setDynamicFriction(const FloatV f)	{staticFrictionX_dynamicFrictionY_dominance0Z_dominance1W=V4SetY(staticFrictionX_dynamicFrictionY_dominance0Z_dominance1W,f);}
+	PX_FORCE_INLINE void setDominance0(const FloatV f)		{staticFrictionX_dynamicFrictionY_dominance0Z_dominance1W=V4SetZ(staticFrictionX_dynamicFrictionY_dominance0Z_dominance1W,f);}
+	PX_FORCE_INLINE void setDominance1(const FloatV f)		{staticFrictionX_dynamicFrictionY_dominance0Z_dominance1W=V4SetW(staticFrictionX_dynamicFrictionY_dominance0Z_dominance1W,f);}
+	// FloatV getters: extract one lane of the packed vector via V4GetX/Y/Z/W.
+	PX_FORCE_INLINE FloatV getStaticFriction() const		{return V4GetX(staticFrictionX_dynamicFrictionY_dominance0Z_dominance1W);}
+	PX_FORCE_INLINE FloatV getDynamicFriction() const		{return V4GetY(staticFrictionX_dynamicFrictionY_dominance0Z_dominance1W);}
+	PX_FORCE_INLINE FloatV getDominance0() const			{return V4GetZ(staticFrictionX_dynamicFrictionY_dominance0Z_dominance1W);}
+	PX_FORCE_INLINE FloatV getDominance1() const			{return V4GetW(staticFrictionX_dynamicFrictionY_dominance0Z_dominance1W);}
+	// PxF32 setters: overloads of the setters above that write one component via V4WriteX/Y/Z/W.
+	PX_FORCE_INLINE void setStaticFriction(PxF32 f)			{V4WriteX(staticFrictionX_dynamicFrictionY_dominance0Z_dominance1W, f);}
+	PX_FORCE_INLINE void setDynamicFriction(PxF32 f)		{V4WriteY(staticFrictionX_dynamicFrictionY_dominance0Z_dominance1W, f);}
+	PX_FORCE_INLINE void setDominance0(PxF32 f)				{V4WriteZ(staticFrictionX_dynamicFrictionY_dominance0Z_dominance1W, f);}
+	PX_FORCE_INLINE void setDominance1(PxF32 f)				{V4WriteW(staticFrictionX_dynamicFrictionY_dominance0Z_dominance1W, f);}
+	// PxF32 getters: read one component via V4ReadX/Y/Z/W; named with a PxF32 suffix since return types cannot overload.
+	PX_FORCE_INLINE PxF32 getStaticFrictionPxF32() const	{return V4ReadX(staticFrictionX_dynamicFrictionY_dominance0Z_dominance1W);}
+	PX_FORCE_INLINE PxF32 getDynamicFrictionPxF32() const	{return V4ReadY(staticFrictionX_dynamicFrictionY_dominance0Z_dominance1W);}
+	PX_FORCE_INLINE PxF32 getDominance0PxF32() const		{return V4ReadZ(staticFrictionX_dynamicFrictionY_dominance0Z_dominance1W);}
+	PX_FORCE_INLINE PxF32 getDominance1PxF32() const		{return V4ReadW(staticFrictionX_dynamicFrictionY_dominance0Z_dominance1W);}
+}; 
+// The struct size is fixed at compile time: 64 bytes without PX_P64_FAMILY, 80 bytes with it.
+#if !PX_P64_FAMILY
+PX_COMPILE_TIME_ASSERT(sizeof(SolverContactHeader) == 64);
+#else
+PX_COMPILE_TIME_ASSERT(sizeof(SolverContactHeader) == 80);
+#endif
+
+/**
+\brief A single rigid body contact point for the solver: two Vec3V terms (raXn, rbXn) followed by four PxF32 scalars.
+*/
+struct SolverContactPoint
+{
+	Vec3V raXn;		// NOTE(review): presumably (rA x n), the angular term for body A - confirm
+	Vec3V rbXn;		// NOTE(review): presumably (rB x n), the angular term for body B - confirm
+	// Scalar terms, stored as plain floats and loaded into FloatV by the getters below.
+	PxF32 velMultiplier;
+	PxF32 biasedErr;
+	PxF32 unbiasedErr;	// has a setter below but no getter in this struct
+	PxF32 maxImpulse;	// has a getter below but no setter in this struct
+	// FloatV getters: FLoad the scalar members.
+	PX_FORCE_INLINE FloatV getVelMultiplier() const			{return FLoad(velMultiplier);}
+
+	PX_FORCE_INLINE FloatV getBiasedErr() const				{return FLoad(biasedErr);}
+	PX_FORCE_INLINE FloatV getMaxImpulse() const			{return FLoad(maxImpulse);}
+
+	// Vec3V getters are only declared when PX_SUPPORT_SIMD is defined.
+#ifdef PX_SUPPORT_SIMD
+	PX_FORCE_INLINE Vec3V getRaXn() const					{return raXn;}
+	PX_FORCE_INLINE Vec3V getRbXn() const					{return rbXn;}
+#endif // PX_SUPPORT_SIMD
+	// Setters taking plain PxVec3 / PxF32 values.
+	PX_FORCE_INLINE void setRaXn(const PxVec3& v)			{V3WriteXYZ(raXn, v);}
+	PX_FORCE_INLINE void setRbXn(const PxVec3& v)			{V3WriteXYZ(rbXn, v);}
+	PX_FORCE_INLINE void setVelMultiplier(PxF32 f)			{velMultiplier = f;}
+
+	PX_FORCE_INLINE void setBiasedErr(PxF32 f)				{biasedErr = f;}
+	PX_FORCE_INLINE void setUnbiasedErr(PxF32 f)			{unbiasedErr = f;}
+	// Plain-type getters. NOTE(review): the PxVec3 ones return a reference, so V3ReadXYZ must itself return a reference into the member - confirm.
+	PX_FORCE_INLINE PxF32 getVelMultiplierPxF32() const		{return velMultiplier;}
+	PX_FORCE_INLINE const PxVec3& getRaXnPxVec3() const		{return V3ReadXYZ(raXn);}
+	PX_FORCE_INLINE const PxVec3& getRbXnPxVec3() const		{return V3ReadXYZ(rbXn);}
+	PX_FORCE_INLINE PxF32 getBiasedErrPxF32() const			{return biasedErr;}
+}; 
+
+// The struct size is fixed at 48 bytes at compile time.
+PX_COMPILE_TIME_ASSERT(sizeof(SolverContactPoint) == 48);
+
+/**
+\brief A single extended articulation contact point for the solver: a SolverContactPoint plus four Vec3V delta-velocity terms.
+*/
+struct SolverContactPointExt : public SolverContactPoint
+{
+	Vec3V linDeltaVA;	// linear velocity term for body A
+	Vec3V angDeltaVA;	// angular velocity term for body A
+	Vec3V linDeltaVB;	// linear velocity term for body B
+	Vec3V angDeltaVB;	// angular velocity term for body B
+};
+// The struct size is fixed at 112 bytes at compile time (the 48-byte base is asserted separately in this file).
+PX_COMPILE_TIME_ASSERT(sizeof(SolverContactPointExt) == 112);
+
+
+/**
+\brief A single friction constraint for the solver. Three Vec4V members each pack a 3-vector in XYZ and a scalar in W.
+*/
+struct SolverContactFriction
+{
+	Vec4V normalXYZ_appliedForceW;		//16 - XYZ: normal, W: applied force
+	Vec4V raXnXYZ_velMultiplierW;		//32 - XYZ: raXn, W: velocity multiplier
+	Vec4V rbXnXYZ_biasW;				//48 - XYZ: rbXn, W: bias
+	PxReal targetVel;					//52
+	PxU32 mPad[3];						//64 - padding; the trailing numbers are running byte offsets
+	// FloatV setters for the W lanes (V4SetW on the packed vector).
+	PX_FORCE_INLINE void setAppliedForce(const FloatV f)	{normalXYZ_appliedForceW=V4SetW(normalXYZ_appliedForceW,f);}
+	PX_FORCE_INLINE void setVelMultiplier(const FloatV f)	{raXnXYZ_velMultiplierW=V4SetW(raXnXYZ_velMultiplierW,f);}
+	PX_FORCE_INLINE void setBias(const FloatV f)			{rbXnXYZ_biasW=V4SetW(rbXnXYZ_biasW,f);}
+	// FloatV getters for the W lanes.
+	PX_FORCE_INLINE FloatV getAppliedForce() const			{return V4GetW(normalXYZ_appliedForceW);}
+	PX_FORCE_INLINE FloatV getVelMultiplier() const			{return V4GetW(raXnXYZ_velMultiplierW);}
+	PX_FORCE_INLINE FloatV getBias() const					{return V4GetW(rbXnXYZ_biasW);}
+	// Vec3V getters for the XYZ parts are only declared when PX_SUPPORT_SIMD is defined.
+#ifdef PX_SUPPORT_SIMD
+	PX_FORCE_INLINE Vec3V getNormal() const					{return Vec3V_From_Vec4V(normalXYZ_appliedForceW);}
+	PX_FORCE_INLINE Vec3V getRaXn() const					{return Vec3V_From_Vec4V(raXnXYZ_velMultiplierW);}
+	PX_FORCE_INLINE Vec3V getRbXn() const					{return Vec3V_From_Vec4V(rbXnXYZ_biasW);}
+#endif	// PX_SUPPORT_SIMD
+	// PxVec3 setters for the XYZ parts (V4WriteXYZ).
+	PX_FORCE_INLINE void setNormal(const PxVec3& v)			{V4WriteXYZ(normalXYZ_appliedForceW, v);}
+	PX_FORCE_INLINE void setRaXn(const PxVec3& v)			{V4WriteXYZ(raXnXYZ_velMultiplierW, v);}
+	PX_FORCE_INLINE void setRbXn(const PxVec3& v)			{V4WriteXYZ(rbXnXYZ_biasW, v);}
+	// PxVec3 getters. NOTE(review): they return a reference, so V4ReadXYZ must itself return a reference into the member - confirm.
+	PX_FORCE_INLINE const PxVec3& getNormalPxVec3() const	{return V4ReadXYZ(normalXYZ_appliedForceW);}
+	PX_FORCE_INLINE const PxVec3& getRaXnPxVec3() const		{return V4ReadXYZ(raXnXYZ_velMultiplierW);}
+	PX_FORCE_INLINE const PxVec3& getRbXnPxVec3() const		{return V4ReadXYZ(rbXnXYZ_biasW);}
+	// PxF32 setters for the W lanes (V4WriteW).
+	PX_FORCE_INLINE void setAppliedForce(PxF32 f)			{V4WriteW(normalXYZ_appliedForceW, f);}
+	PX_FORCE_INLINE void setVelMultiplier(PxF32 f)			{V4WriteW(raXnXYZ_velMultiplierW, f);}
+	PX_FORCE_INLINE void setBias(PxF32 f)					{V4WriteW(rbXnXYZ_biasW, f);}
+	// PxF32 getters for the W lanes (V4ReadW).
+	PX_FORCE_INLINE PxF32 getAppliedForcePxF32() const		{return V4ReadW(normalXYZ_appliedForceW);}
+	PX_FORCE_INLINE PxF32 getVelMultiplierPxF32() const		{return V4ReadW(raXnXYZ_velMultiplierW);}
+	PX_FORCE_INLINE PxF32 getBiasPxF32() const				{return V4ReadW(rbXnXYZ_biasW);}
+
+}; 
+// The struct size is fixed at 64 bytes at compile time.
+PX_COMPILE_TIME_ASSERT(sizeof(SolverContactFriction) == 64);
+
+/**
+\brief A single extended articulation friction constraint for the solver: a SolverContactFriction plus four Vec3V delta-velocity terms.
+*/
+struct SolverContactFrictionExt : public SolverContactFriction
+{
+	Vec3V linDeltaVA;	// linear velocity term for body A
+	Vec3V angDeltaVA;	// angular velocity term for body A
+	Vec3V linDeltaVB;	// linear velocity term for body B
+	Vec3V angDeltaVB;	// angular velocity term for body B
+};
+PX_COMPILE_TIME_ASSERT(sizeof(SolverContactFrictionExt) == 128);	// size fixed at 128 bytes at compile time
+
+}
+
+}
+
+
+
+#endif //DY_SOLVERCONTACT_H
diff --git a/PhysX_3.4/Source/LowLevelDynamics/src/DySolverContact4.h b/PhysX_3.4/Source/LowLevelDynamics/src/DySolverContact4.h
new file mode 100644
index 00000000..31fc9a6d
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelDynamics/src/DySolverContact4.h
@@ -0,0 +1,179 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+#ifndef DY_SOLVERCONTACT4_H
+#define DY_SOLVERCONTACT4_H
+
+#include "foundation/PxSimpleTypes.h"
+#include "foundation/PxVec3.h"
+#include "PxvConfig.h"
+#include "PsVecMath.h"
+#include "DySolverContact.h"
+
+namespace physx
+{
+
+struct PxcNpWorkUnit;
+struct PxSolverBody;
+struct PxSolverBodyData;
+struct PxSolverConstraintDesc;
+
+namespace Sc
+{
+	class ShapeInteraction;
+}
+	
+namespace Dy
+{
+
+
+
+
+/**
+\brief Batched SOA contact data for four contact pairs (one SIMD lane each). Note, we don't support batching with extended contacts for the simple reason that handling multiple articulations would be complex.
+*/
+struct SolverContactHeader4
+{
+	enum	// NOTE(review): presumably bit values for the 'flag' byte below - confirm.
+	{
+		eHAS_MAX_IMPULSE = 1 << 0,
+		eHAS_TARGET_VELOCITY = 1 << 1
+	};
+
+	PxU8	type;					//Note: type must be the first member as the solver expects a type in the first byte.
+	PxU8	numNormalConstr;
+	PxU8	numFrictionConstr;
+	PxU8	flag;
+
+	PxU8	flags[4];	// one flags byte per batched pair
+	//These counts (presumably numNormalConstr/numFrictionConstr above) are the max of the 4 sets of data.
+	//When certain pairs have fewer patches/contacts than others, they are padded with 0s so that no work is performed but 
+	//calculations are still shared (after all, they're computationally free because we're doing 4 things at a time in SIMD)
+
+	//KS - used for write-back only: the per-pair counts for each of the four batched pairs
+	PxU8	numNormalConstr0, numNormalConstr1, numNormalConstr2, numNormalConstr3;
+	PxU8	numFrictionConstr0, numFrictionConstr1, numFrictionConstr2, numFrictionConstr3;			
+	// Each Vec4V below holds one value per batched pair.
+	Vec4V	restitution;																			
+	Vec4V   staticFriction;
+	Vec4V	dynamicFriction;
+	//Technically, these mass properties could be pulled out into a new structure and shared. For multi-manifold contacts,
+	//this would save 64 bytes per-manifold after the cost of the first manifold
+	Vec4V	invMass0D0;
+	Vec4V	invMass1D1;
+	Vec4V	angDom0;
+	Vec4V	angDom1;
+	//Normal is shared between all contacts in the batch. This will save some memory! Stored as separate X/Y/Z vectors.
+	Vec4V normalX;
+	Vec4V normalY;
+	Vec4V normalZ;
+
+	Sc::ShapeInteraction* shapeInteraction[4];		//192 or 208 - total struct size without / with PX_P64_FAMILY
+}; 
+// The struct size is fixed at compile time for both pointer widths.
+#if !PX_P64_FAMILY
+PX_COMPILE_TIME_ASSERT(sizeof(SolverContactHeader4) == 192);
+#else
+PX_COMPILE_TIME_ASSERT(sizeof(SolverContactHeader4) == 208);
+#endif
+
+
+/**
+\brief This represents a batch of 4 contacts with static rolled into a single structure. Each Vec4V holds one value per contact in the batch.
+*/
+struct SolverContactBatchPointBase4
+{
+	Vec4V raXnX;			// X components of raXn for the four contacts
+	Vec4V raXnY;			// Y components of raXn
+	Vec4V raXnZ;			// Z components of raXn
+	Vec4V velMultiplier;
+	Vec4V scaledBias;
+	Vec4V biasedErr;
+};
+PX_COMPILE_TIME_ASSERT(sizeof(SolverContactBatchPointBase4) == 96);	// six Vec4V members, size fixed at 96 bytes
+
+/**
+\brief Contains the additional data required to represent 4 contacts between 2 dynamic bodies: the rbXn terms, split into X/Y/Z vectors.
+@see SolverContactBatchPointBase4
+*/
+struct SolverContactBatchPointDynamic4 : public SolverContactBatchPointBase4
+{	
+	Vec4V rbXnX;	// X components of rbXn for the four contacts
+	Vec4V rbXnY;	// Y components of rbXn
+	Vec4V rbXnZ;	// Z components of rbXn
+}; 
+PX_COMPILE_TIME_ASSERT(sizeof(SolverContactBatchPointDynamic4) == 144);	// 96-byte base plus three Vec4V members
+
+/**
+\brief This represents the shared information of a batch of 4 friction constraints
+*/
+struct SolverFrictionSharedData4
+{
+	BoolV broken;
+	PxU8* frictionBrokenWritebackByte[4];	// one write-back pointer per batched constraint
+	Vec4V normalX[2];						// two sets of normals, stored as separate X/Y/Z vectors
+	Vec4V normalY[2];
+	Vec4V normalZ[2];
+};
+#if !PX_P64_FAMILY	// the size is only asserted when PX_P64_FAMILY is not set; there is no check for the 64-bit layout
+PX_COMPILE_TIME_ASSERT(sizeof(SolverFrictionSharedData4) == 128);
+#endif
+
+
+/**
+\brief This represents a batch of 4 friction constraints with static rolled into a single structure. Each Vec4V holds one value per constraint in the batch.
+*/
+struct SolverContactFrictionBase4
+{
+	Vec4V raXnX;			// X components of raXn for the four constraints
+	Vec4V raXnY;			// Y components of raXn
+	Vec4V raXnZ;			// Z components of raXn
+	Vec4V scaledBias;
+	Vec4V velMultiplier;
+	Vec4V targetVelocity;
+};
+PX_COMPILE_TIME_ASSERT(sizeof(SolverContactFrictionBase4) == 96);	// six Vec4V members, size fixed at 96 bytes
+
+/**
+\brief Contains the additional data required to represent 4 friction constraints between 2 dynamic bodies: the rbXn terms, split into X/Y/Z vectors.
+@see SolverContactFrictionBase4
+*/
+struct SolverContactFrictionDynamic4 : public SolverContactFrictionBase4
+{
+	Vec4V rbXnX;	// X components of rbXn for the four constraints
+	Vec4V rbXnY;	// Y components of rbXn
+	Vec4V rbXnZ;	// Z components of rbXn
+}; 
+PX_COMPILE_TIME_ASSERT(sizeof(SolverContactFrictionDynamic4) == 144);	// 96-byte base plus three Vec4V members
+
+}
+
+}
+
+#endif //DY_SOLVERCONTACT4_H
diff --git a/PhysX_3.4/Source/LowLevelDynamics/src/DySolverContactPF.h b/PhysX_3.4/Source/LowLevelDynamics/src/DySolverContactPF.h
new file mode 100644
index 00000000..e18421e9
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelDynamics/src/DySolverContactPF.h
@@ -0,0 +1,123 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+
+#ifndef DY_SOLVERCONTACTPF_H
+#define DY_SOLVERCONTACTPF_H
+
+#include "foundation/PxSimpleTypes.h"
+#include "foundation/PxVec3.h"
+#include "PxvConfig.h"
+#include "PsVecMath.h"
+
+namespace physx
+{
+
+using namespace Ps::aos;
+
+namespace Dy
+{
+
+struct SolverContactCoulombHeader	// Contact header with scalar members; trailing numbers are running byte offsets (32-bit then 64-bit pointers where two are given).
+{
+	PxU8	type;					//Note: type must be the first member as the solver expects a type in the first byte.
+	PxU8	numNormalConstr;
+	PxU16	frictionOffset;					//4
+	//PxF32	restitution;
+	PxF32	angDom0;						//8
+	PxF32	dominance0;						//12
+	PxF32	dominance1;						//16
+	PX_ALIGN(16, PxVec3	normalXYZ);			//28 - 16-byte aligned; accessed with V3StoreA/V3LoadA below
+	PxF32	angDom1;						//32
+	// NOTE(review): Sc::ShapeInteraction is not declared in the visible part of this header; presumably an include provides it - confirm.
+	Sc::ShapeInteraction* shapeInteraction;		//36	40
+	PxU8	flags;								//37	41
+	PxU8	pad0[3];							//40	44
+#if !PX_P64_FAMILY
+	PxU32	pad1[2];							//48	
+#else
+	PxU32 pad1;									//		48
+#endif
+	// Padding above brings both layouts to the 48 bytes asserted after the struct.
+	
+	// FloatV / Vec3V accessors: store to and load from the scalar and PxVec3 members.
+	PX_FORCE_INLINE void setDominance0(const FloatV f)		{FStore(f, &dominance0);}
+	PX_FORCE_INLINE void setDominance1(const FloatV f)		{FStore(f, &dominance1);}
+	PX_FORCE_INLINE void setNormal(const Vec3V n)			{V3StoreA(n, normalXYZ);}
+	
+	PX_FORCE_INLINE FloatV getDominance0() const			{return FLoad(dominance0);}
+	PX_FORCE_INLINE FloatV getDominance1() const			{return FLoad(dominance1);}
+	//PX_FORCE_INLINE FloatV getRestitution() const			{return FLoad(restitution);}
+	PX_FORCE_INLINE	Vec3V getNormal()const					{return V3LoadA(normalXYZ);}
+
+	// PxF32 accessors: direct reads and writes of the scalar members.
+	PX_FORCE_INLINE void setDominance0(PxF32 f)				{ dominance0 = f; }
+	PX_FORCE_INLINE void setDominance1(PxF32 f)				{ dominance1 = f;}
+	//PX_FORCE_INLINE void setRestitution(PxF32 f)			{ restitution = f;}
+
+	PX_FORCE_INLINE PxF32 getDominance0PxF32() const		{return dominance0;}
+	PX_FORCE_INLINE PxF32 getDominance1PxF32() const		{return dominance1;}
+	//PX_FORCE_INLINE PxF32 getRestitutionPxF32() const		{return restitution;}
+
+}; 
+PX_COMPILE_TIME_ASSERT(sizeof(SolverContactCoulombHeader) == 48);	// same size for both pointer widths
+
+struct SolverFrictionHeader	// Friction header with scalar members only; size is asserted to be 32 bytes after the struct.
+{
+	PxU8	type;					//Note: type must be the first member as the solver expects a type in the first byte.
+	PxU8	numNormalConstr;
+	PxU8	numFrictionConstr;
+	PxU8	flags;
+	PxF32   staticFriction;
+	PxF32   invMass0D0;
+	PxF32	invMass1D1;
+	PxF32	angDom0;
+	PxF32	angDom1;
+	PxU32	pad2[2];	// padding up to the asserted 32-byte size
+	// staticFriction accessors in FloatV and PxF32 flavours.
+	PX_FORCE_INLINE void setStaticFriction(const FloatV f)	{FStore(f, &staticFriction);}
+	
+	PX_FORCE_INLINE FloatV getStaticFriction() const		{return FLoad(staticFriction);}
+	
+	PX_FORCE_INLINE void setStaticFriction(PxF32 f)			{staticFriction = f;}
+
+	PX_FORCE_INLINE PxF32 getStaticFrictionPxF32() const	{return staticFriction;}	
+	// Both overloads below return sizeof(PxU32) times the constraint count rounded up to a multiple of 4.
+	PX_FORCE_INLINE PxU32 getAppliedForcePaddingSize() const {return sizeof(PxU32)*((4 * ((numNormalConstr + 3)/4)));}
+	static PX_FORCE_INLINE PxU32 getAppliedForcePaddingSize(const PxU32 numConstr) {return sizeof(PxU32)*((4 * ((numConstr + 3)/4)));}
+}; 
+
+PX_COMPILE_TIME_ASSERT(sizeof(SolverFrictionHeader) == 32);
+
+}
+
+}
+
+#endif //DY_SOLVERCONTACTPF_H
diff --git a/PhysX_3.4/Source/LowLevelDynamics/src/DySolverContactPF4.h b/PhysX_3.4/Source/LowLevelDynamics/src/DySolverContactPF4.h
new file mode 100644
index 00000000..7cf3b94d
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelDynamics/src/DySolverContactPF4.h
@@ -0,0 +1,155 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+#ifndef DY_SOLVER_CONTACT_PF_4_H
+#define DY_SOLVER_CONTACT_PF_4_H
+
+#include "foundation/PxSimpleTypes.h"
+#include "foundation/PxVec3.h"
+#include "PxvConfig.h"
+#include "PsVecMath.h"
+
+namespace physx
+{
+
+using namespace Ps::aos;
+
+namespace Sc
+{
+	class ShapeInteraction;
+}
+
+namespace Dy
+{
+
+struct SolverContactCoulombHeader4
+{
+	PxU8	type;					//Note: type must stay the first member as the solver expects a type in the first byte.
+	PxU8	numNormalConstr;
+	PxU16	frictionOffset;
+	PxU8	numNormalConstr0, numNormalConstr1, numNormalConstr2, numNormalConstr3;
+	PxU8	flags[4];
+	PxU32	pad;					//16 (the trailing numbers give the running byte size of the struct after each member)
+	Vec4V	restitution;			//32
+	Vec4V	normalX;				//48
+	Vec4V	normalY;				//64
+	Vec4V	normalZ;				//80
+	Vec4V	invMassADom;			//96
+	Vec4V	invMassBDom;			//112
+	Vec4V	angD0;					//128
+	Vec4V	angD1;					//144
+	Sc::ShapeInteraction* shapeInteraction[4];		//160	or 176 (four 4-byte or four 8-byte pointers)
+}; 
+// The pointer array makes the total size depend on pointer width, so each platform family gets its own check.
+#if !PX_P64_FAMILY
+PX_COMPILE_TIME_ASSERT(sizeof(SolverContactCoulombHeader4) == 160);
+#else
+PX_COMPILE_TIME_ASSERT(sizeof(SolverContactCoulombHeader4) == 176);
+#endif
+
+struct SolverContact4Base	// plain data: eight Vec4V members and nothing else
+{
+	Vec4V raXnX;
+	Vec4V raXnY;
+	Vec4V raXnZ;
+	Vec4V appliedForce;
+	Vec4V velMultiplier;
+	Vec4V targetVelocity;
+	Vec4V scaledBias;
+	Vec4V maxImpulse;
+};
+// Layout guard: 8 members at 16 bytes each (Vec4V size as implied by this assert) give 128 bytes.
+PX_COMPILE_TIME_ASSERT(sizeof(SolverContact4Base) == 128);
+
+struct SolverContact4Dynamic : public SolverContact4Base	// base record extended with three rbXn* vectors
+{
+	Vec4V rbXnX;
+	Vec4V rbXnY;
+	Vec4V rbXnZ;
+};
+// Layout guard: the 128-byte base plus three more Vec4V members gives 176 bytes.
+PX_COMPILE_TIME_ASSERT(sizeof(SolverContact4Dynamic) == 176);
+
+struct SolverFrictionHeader4
+{
+	PxU8	type;					//Note: type must stay the first member as the solver expects a type in the first byte.
+	PxU8	numNormalConstr;
+	PxU8	numFrictionConstr;
+	PxU8	numNormalConstr0;		// presumably one count per batched constraint (0-3) - TODO confirm
+	PxU8	numNormalConstr1;
+	PxU8	numNormalConstr2;
+	PxU8	numNormalConstr3;
+	PxU8	numFrictionConstr0;
+	PxU8	numFrictionConstr1;
+	PxU8	numFrictionConstr2;
+	PxU8	numFrictionConstr3;
+	PxU8	pad0;
+	PxU32	frictionPerContact;
+	// The twelve PxU8 members and the PxU32 above occupy 16 bytes; the five Vec4V members below complete the 96 bytes checked after the struct.
+	Vec4V	staticFriction;
+	Vec4V   invMassADom;
+	Vec4V   invMassBDom;
+	Vec4V	angD0;
+	Vec4V	angD1;
+};
+
+PX_COMPILE_TIME_ASSERT(sizeof(SolverFrictionHeader4) == 96);
+
+struct SolverFriction4Base	// plain data: nine Vec4V members and nothing else
+{
+	Vec4V normalX;
+	Vec4V normalY;
+	Vec4V normalZ;
+	Vec4V raXnX;
+	Vec4V raXnY;
+	Vec4V raXnZ;
+	Vec4V appliedForce;
+	Vec4V velMultiplier;
+	Vec4V targetVelocity;
+};
+// Layout guard: 9 members at 16 bytes each (Vec4V size as implied by this assert) give 144 bytes.
+PX_COMPILE_TIME_ASSERT(sizeof(SolverFriction4Base) == 144);
+
+struct SolverFriction4Dynamic : public SolverFriction4Base	// base record extended with three rbXn* vectors
+{
+	Vec4V rbXnX;
+	Vec4V rbXnY;
+	Vec4V rbXnZ;
+};
+// Layout guard: the 144-byte base plus three more Vec4V members gives 192 bytes.
+PX_COMPILE_TIME_ASSERT(sizeof(SolverFriction4Dynamic) == 192);
+
+}
+
+}
+
+
+
+#endif //DY_SOLVER_CONTACT_PF_4_H
+
diff --git a/PhysX_3.4/Source/LowLevelDynamics/src/DySolverContext.h b/PhysX_3.4/Source/LowLevelDynamics/src/DySolverContext.h
new file mode 100644
index 00000000..df3d7fea
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelDynamics/src/DySolverContext.h
@@ -0,0 +1,64 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+#ifndef DY_SOLVERCONTEXT_H
+#define DY_SOLVERCONTEXT_H
+
+namespace physx
+{
+	struct PxSolverBodyData;
+
+namespace Dy
+{
+	struct ThresholdStreamElement;
+	
+
+struct SolverContext
+{
+	bool doFriction;				// set by the solver core for each position iteration
+	bool writeBackIteration;		// raised by the solver core before the final, write-back velocity iteration
+
+	// for threshold stream output
+	ThresholdStreamElement*				mThresholdStream;		// buffer that threshold entries are collected in
+	PxU32								mThresholdStreamIndex;	// number of entries currently held in mThresholdStream
+	PxU32								mThresholdStreamLength;	// capacity of mThresholdStream
+	PxSolverBodyData*						solverBodyArray;
+	// Shared destination: the solver core claims slots in it by atomically advancing *mSharedOutThresholdPairs.
+	ThresholdStreamElement* PX_RESTRICT mSharedThresholdStream;
+	PxU32 mSharedThresholdStreamLength;
+	PxI32* mSharedOutThresholdPairs;
+	// NOTE(review): this header uses PxU32/PxI32/PX_RESTRICT without including anything itself; it relies on the includer.
+};
+
+}
+
+}
+
+#endif //DY_SOLVERCONTEXT_H
diff --git a/PhysX_3.4/Source/LowLevelDynamics/src/DySolverControl.cpp b/PhysX_3.4/Source/LowLevelDynamics/src/DySolverControl.cpp
new file mode 100644
index 00000000..688e0b81
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelDynamics/src/DySolverControl.cpp
@@ -0,0 +1,622 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+#include "foundation/PxPreprocessor.h"
+
+#include "PsAllocator.h"
+#include <new>
+#include <stdio.h>
+#include "CmPhysXCommon.h"
+#include "DySolverBody.h"
+#include "DySolverConstraint1D.h"
+#include "DySolverContact.h"
+#include "DyThresholdTable.h"
+#include "DySolverControl.h"
+#include "DyArticulationHelper.h"
+#include "PsAtomic.h"
+#include "PsIntrinsics.h"
+#include "DyArticulationPImpl.h"
+#include "PsThread.h"
+#include "DySolverConstraintDesc.h"
+#include "DySolverContext.h"
+
+namespace physx
+{
+
+namespace Dy
+{
+
+//-----------------------------------
+
+void solve1DBlock					(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void solveContactBlock				(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void solveExtContactBlock			(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void solveExt1DBlock				(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void solveContact_BStaticBlock		(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void solveContactPreBlock			(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void solveContactPreBlock_Static	(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void solve1D4_Block					(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+
+// "Conclude" variants: these populate gVTableSolveConcludeBlock, which the solver core uses for the last position iteration.
+void solve1DConcludeBlock				(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void solveContactConcludeBlock			(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void solveExtContactConcludeBlock		(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void solveExt1DConcludeBlock			(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void solveContact_BStaticConcludeBlock	(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void solveContactPreBlock_Conclude		(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void solveContactPreBlock_ConcludeStatic(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void solve1D4Block_Conclude				(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+// Solve + write-back variants: these populate gVTableSolveWriteBackBlock, which is used for the final velocity iteration.
+void solve1DBlockWriteBack				(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void solveContactBlockWriteBack			(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void solveExtContactBlockWriteBack		(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void solveExt1DBlockWriteBack			(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void solveContact_BStaticBlockWriteBack	(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void solveContactPreBlock_WriteBack		(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void solveContactPreBlock_WriteBackStatic(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void solve1D4Block_WriteBack			(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+// Write-back-only routines: declared here but not referenced by the three dispatch tables defined in this file.
+void writeBack1DBlock				(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void contactBlockWriteBack			(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void extContactBlockWriteBack		(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void ext1DBlockWriteBack			(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void contactPreBlock_WriteBack		(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void writeBack1D4Block				(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+
+// Lets the dispatch tables below be tagged as possibly unused on GCC-family compilers.
+// Could move this to PxPreprocessor.h, but no implementation is available for MSVC.
+#if PX_GCC_FAMILY
+#define PX_UNUSED_ATTRIBUTE __attribute__((unused))
+#else
+#define PX_UNUSED_ATTRIBUTE 
+#endif
+// Always expands to 0, whatever the argument: articulation slots start empty and are filled in by SolverCoreRegisterArticulationFns().
+#define DYNAMIC_ARTICULATION_REGISTRATION(x) 0
+
+static SolveBlockMethod gVTableSolveBlock[] PX_UNUSED_ATTRIBUTE = 	// handlers for regular solver iterations, indexed by constraint type
+{
+	0,																		// index 0: no handler installed
+	solveContactBlock,														// DY_SC_TYPE_RB_CONTACT
+	solve1DBlock,															// DY_SC_TYPE_RB_1D
+	DYNAMIC_ARTICULATION_REGISTRATION(solveExtContactBlock),				// DY_SC_TYPE_EXT_CONTACT (0 until SolverCoreRegisterArticulationFns() runs)
+	DYNAMIC_ARTICULATION_REGISTRATION(solveExt1DBlock),						// DY_SC_TYPE_EXT_1D (0 until SolverCoreRegisterArticulationFns() runs)
+	solveContact_BStaticBlock,												// DY_SC_TYPE_STATIC_CONTACT
+	solveContactBlock,														// DY_SC_TYPE_NOFRICTION_RB_CONTACT (shares the RB_CONTACT handler)
+	solveContactPreBlock,													// DY_SC_TYPE_BLOCK_RB_CONTACT
+	solveContactPreBlock_Static,											// DY_SC_TYPE_BLOCK_STATIC_RB_CONTACT
+	solve1D4_Block,															// DY_SC_TYPE_BLOCK_1D,
+};
+
+static SolveWriteBackBlockMethod gVTableSolveWriteBackBlock[] PX_UNUSED_ATTRIBUTE = 	// handlers for the final (write-back) velocity iteration, indexed by constraint type
+{
+	0,																		// index 0: no handler installed
+	solveContactBlockWriteBack,												// DY_SC_TYPE_RB_CONTACT
+	solve1DBlockWriteBack,													// DY_SC_TYPE_RB_1D
+	DYNAMIC_ARTICULATION_REGISTRATION(solveExtContactBlockWriteBack),		// DY_SC_TYPE_EXT_CONTACT (0 until SolverCoreRegisterArticulationFns() runs)
+	DYNAMIC_ARTICULATION_REGISTRATION(solveExt1DBlockWriteBack),			// DY_SC_TYPE_EXT_1D (0 until SolverCoreRegisterArticulationFns() runs)
+	solveContact_BStaticBlockWriteBack,										// DY_SC_TYPE_STATIC_CONTACT
+	solveContactBlockWriteBack,												// DY_SC_TYPE_NOFRICTION_RB_CONTACT (shares the RB_CONTACT handler)
+	solveContactPreBlock_WriteBack,											// DY_SC_TYPE_BLOCK_RB_CONTACT
+	solveContactPreBlock_WriteBackStatic,									// DY_SC_TYPE_BLOCK_STATIC_RB_CONTACT
+	solve1D4Block_WriteBack,												// DY_SC_TYPE_BLOCK_1D,
+};
+
+static SolveBlockMethod gVTableSolveConcludeBlock[] PX_UNUSED_ATTRIBUTE = 	// handlers for the last position iteration, indexed by constraint type
+{
+	0,																		// index 0: no handler installed
+	solveContactConcludeBlock,												// DY_SC_TYPE_RB_CONTACT
+	solve1DConcludeBlock,													// DY_SC_TYPE_RB_1D
+	DYNAMIC_ARTICULATION_REGISTRATION(solveExtContactConcludeBlock),		// DY_SC_TYPE_EXT_CONTACT (0 until SolverCoreRegisterArticulationFns() runs)
+	DYNAMIC_ARTICULATION_REGISTRATION(solveExt1DConcludeBlock),				// DY_SC_TYPE_EXT_1D (0 until SolverCoreRegisterArticulationFns() runs)
+	solveContact_BStaticConcludeBlock,										// DY_SC_TYPE_STATIC_CONTACT
+	solveContactConcludeBlock,												// DY_SC_TYPE_NOFRICTION_RB_CONTACT (shares the RB_CONTACT handler)
+	solveContactPreBlock_Conclude,											// DY_SC_TYPE_BLOCK_RB_CONTACT
+	solveContactPreBlock_ConcludeStatic,									// DY_SC_TYPE_BLOCK_STATIC_RB_CONTACT
+	solve1D4Block_Conclude,													// DY_SC_TYPE_BLOCK_1D,
+};
+
+void SolverCoreRegisterArticulationFns()
+{
+	// Install the articulation (EXT) handlers into the slots that the static table initialisers leave at 0.
+	gVTableSolveBlock[DY_SC_TYPE_EXT_1D]					= solveExt1DBlock;
+	gVTableSolveBlock[DY_SC_TYPE_EXT_CONTACT]				= solveExtContactBlock;
+	gVTableSolveConcludeBlock[DY_SC_TYPE_EXT_1D]			= solveExt1DConcludeBlock;
+	gVTableSolveConcludeBlock[DY_SC_TYPE_EXT_CONTACT]		= solveExtContactConcludeBlock;
+	gVTableSolveWriteBackBlock[DY_SC_TYPE_EXT_1D]			= solveExt1DBlockWriteBack;
+	gVTableSolveWriteBackBlock[DY_SC_TYPE_EXT_CONTACT]		= solveExtContactBlockWriteBack;
+}
+
+
+SolveBlockMethod* getSolveBlockTable()	// exposes the table used for regular solver iterations
+{
+	return &gVTableSolveBlock[0];
+}
+
+SolveBlockMethod* getSolverConcludeBlockTable()	// exposes the table used for the last position iteration
+{
+	return &gVTableSolveConcludeBlock[0];
+}
+
+SolveWriteBackBlockMethod* getSolveWritebackBlockTable()	// exposes the table used for the final, write-back velocity iteration
+{
+	return &gVTableSolveWriteBackBlock[0];
+}
+
+
+
+
+SolverCoreGeneral* SolverCoreGeneral::create()
+{
+	// Raw storage comes from PX_ALLOC; the object is then constructed in place.
+	void* storage = PX_ALLOC(sizeof(SolverCoreGeneral), "SolverCoreGeneral");
+	// A failed allocation is reported to the caller as a null pointer.
+	if(!storage)
+		return NULL;
+
+	return new (storage) SolverCoreGeneral;
+}
+
+void SolverCoreGeneral::destroyV()	// counterpart of create(): the object was placement-constructed in PX_ALLOC storage
+{
+	this->~SolverCoreGeneral();		// run the destructor explicitly first...
+	PX_FREE(this);					// ...then hand the raw storage back; the object must not be touched afterwards
+}
+
+void SolverCoreGeneral::solveV_Blocks(SolverIslandParams& params) const
+{
+	// Phases: position iterations -> copy body velocities to motionVelocityArray -> velocity iterations (last one with write-back) -> flush threshold entries.
+	const PxI32 TempThresholdStreamSize = 32;
+	ThresholdStreamElement tempThresholdStream[TempThresholdStreamSize];
+
+	SolverContext cache;
+	cache.solverBodyArray			= params.bodyDataList;
+	cache.mThresholdStream			= tempThresholdStream;
+	cache.mThresholdStreamLength	= TempThresholdStreamSize;
+	cache.mThresholdStreamIndex		= 0;
+	cache.writeBackIteration		= false;
+	// The mShared* members of the context are only assigned later, just before the write-back iteration.
+	PxI32 batchCount = PxI32(params.numConstraintHeaders);
+
+	PxSolverBody* PX_RESTRICT bodyListStart = params.bodyListStart;
+	const PxU32 bodyListSize = params.bodyListSize;
+
+	Cm::SpatialVector* PX_RESTRICT motionVelocityArray = params.motionVelocityArray;
+
+	const PxU32 velocityIterations = params.velocityIterations;
+	const PxU32 positionIterations = params.positionIterations;
+
+	const PxU32 numConstraintHeaders = params.numConstraintHeaders;
+	const PxU32 articulationListSize = params.articulationListSize;
+
+	ArticulationSolverDesc* PX_RESTRICT articulationListStart = params.articulationListStart;
+
+	PX_ASSERT(velocityIterations >= 1);
+	PX_ASSERT(positionIterations >= 1);
+	// Nothing to solve: only publish the current velocities (bodies and articulations) and leave.
+	if(numConstraintHeaders == 0)
+	{
+		for (PxU32 baIdx = 0; baIdx < bodyListSize; baIdx++)
+		{
+			Cm::SpatialVector& motionVel = motionVelocityArray[baIdx];
+			PxSolverBody& atom = bodyListStart[baIdx];
+
+			motionVel.linear = atom.linearVelocity;
+			motionVel.angular = atom.angularState;
+		}
+
+		for (PxU32 i = 0; i < articulationListSize; i++)
+			ArticulationPImpl::saveVelocity(articulationListStart[i]);
+
+		return;
+	}
+
+	BatchIterator contactIterator(params.constraintBatchHeaders, params.numConstraintHeaders);
+
+	PxSolverConstraintDesc* PX_RESTRICT constraintList = params.constraintList;
+
+	//0-(n-1) iterations
+	PxI32 normalIter = 0;
+	PxI32 frictionIter = 0;
+	// normalIter counts every completed iteration (position and velocity alike); frictionIter stays 0 in this function.
+	for (PxU32 iteration = positionIterations; iteration > 0; iteration--)	//decreasing positive numbers == position iters
+	{
+		cache.doFriction = iteration<=3;
+		// doFriction is only true for the last three position iterations; the very last one (iteration == 1) uses the conclude table.
+		SolveBlockParallel<false>(constraintList, batchCount, normalIter * batchCount, batchCount, 
+			cache, contactIterator, iteration == 1 ? gVTableSolveConcludeBlock : gVTableSolveBlock, normalIter, frictionIter, normalIter);
+
+		++normalIter;
+	}
+	// Copy the body velocities reached after the position iterations into the motion velocity array.
+	for (PxU32 baIdx = 0; baIdx < bodyListSize; baIdx++)
+	{
+		const PxSolverBody& atom = bodyListStart[baIdx];
+		Cm::SpatialVector& motionVel = motionVelocityArray[baIdx];
+		motionVel.linear = atom.linearVelocity;
+		motionVel.angular = atom.angularState;
+	}
+	
+
+	for (PxU32 i = 0; i < articulationListSize; i++)
+		ArticulationPImpl::saveVelocity(articulationListStart[i]);
+
+	// Velocity iterations: all but the last use the regular table, the last one uses the write-back table.
+	const PxI32 velItersMinOne = (PxI32(velocityIterations)) - 1;
+
+	PxI32 iteration = 0;
+
+	for(; iteration < velItersMinOne; ++iteration)
+	{	
+
+		SolveBlockParallel<false>(constraintList, batchCount, normalIter * batchCount, batchCount, 
+			cache, contactIterator, gVTableSolveBlock, normalIter, frictionIter, normalIter);
+		++normalIter;
+
+	}
+
+	PxI32* outThresholdPairs = params.outThresholdPairs;
+	ThresholdStreamElement* PX_RESTRICT thresholdStream = params.thresholdStream;
+	PxU32 thresholdStreamLength = params.thresholdStreamLength;
+	// Expose the caller's threshold stream through the context before the write-back pass runs.
+	cache.writeBackIteration = true;
+	cache.mSharedThresholdStream = thresholdStream;
+	cache.mSharedThresholdStreamLength = thresholdStreamLength;
+	cache.mSharedOutThresholdPairs = outThresholdPairs;
+	for(; iteration < PxI32(velocityIterations); ++iteration)
+	{
+
+		SolveBlockParallel<false>(constraintList, batchCount, normalIter * batchCount, batchCount, 
+			cache, contactIterator, gVTableSolveWriteBackBlock, normalIter, frictionIter, normalIter);
+		++normalIter;
+
+	}	
+
+	//Write back remaining threshold streams
+	if(cache.mThresholdStreamIndex > 0)
+	{
+		//Write back to global buffer
+		PxI32 threshIndex = physx::shdfnd::atomicAdd(outThresholdPairs, PxI32(cache.mThresholdStreamIndex)) - PxI32(cache.mThresholdStreamIndex);
+		for(PxU32 b = 0; b < cache.mThresholdStreamIndex; ++b)
+		{
+			thresholdStream[b + threshIndex] = cache.mThresholdStream[b];
+		}
+		cache.mThresholdStreamIndex = 0;
+	}
+}
+
+PxI32 SolverCoreGeneral::solveVParallelAndWriteBack
+(SolverIslandParams& params) const
+{
+#if PX_PROFILE_SOLVE_STALLS
+	PxU64 startTime = readTimer();
+
+	PxU64 stallCount = 0;
+#endif
+	// Work is claimed in chunks through atomicAdd on counters stored in params; presumably several threads run this function on the same params - TODO confirm.
+	SolverContext cache;
+	cache.solverBodyArray = params.bodyDataList;
+	const PxU32 batchSize = params.batchSize;
+
+	const PxI32 UnrollCount = PxI32(batchSize);		// chunk size used when claiming batch headers
+	const PxI32 SaveUnrollCount = 32;				// chunk size used when claiming articulations/bodies for the velocity save
+
+	const PxI32 TempThresholdStreamSize = 32;
+	ThresholdStreamElement tempThresholdStream[TempThresholdStreamSize];
+
+	const PxI32 bodyListSize = PxI32(params.bodyListSize);
+	const PxI32 articulationListSize = PxI32(params.articulationListSize);
+
+
+	const PxI32 batchCount = PxI32(params.numConstraintHeaders);
+	cache.mThresholdStream = tempThresholdStream;
+	cache.mThresholdStreamLength = TempThresholdStreamSize;
+	cache.mThresholdStreamIndex = 0;
+	cache.writeBackIteration = false;
+
+	const PxI32 positionIterations = PxI32(params.positionIterations);
+	const PxI32 velocityIterations = PxI32(params.velocityIterations);
+	// constraintIndex hands out header chunks; constraintIndex2 accumulates how many headers have been reported as solved.
+	PxI32* constraintIndex = &params.constraintIndex;
+	PxI32* constraintIndex2 = &params.constraintIndex2;
+
+	PxSolverConstraintDesc* PX_RESTRICT constraintList = params.constraintList;
+
+	const PxU32 nbPartitions = params.nbPartitions;	
+
+	PxU32* headersPerPartition = params.headersPerPartition;
+	// velocityIterations is only read by the assert below, hence PX_UNUSED.
+	PX_UNUSED(velocityIterations);
+
+	PX_ASSERT(velocityIterations >= 1);
+	PX_ASSERT(positionIterations >= 1);
+	// Claim the first chunk: index is the counter value before our add (atomicAdd presumably returns the post-add value - TODO confirm).
+	PxI32 endIndexCount = UnrollCount;
+	PxI32 index = physx::shdfnd::atomicAdd(constraintIndex, UnrollCount) - UnrollCount;
+	
+	BatchIterator contactIter(params.constraintBatchHeaders, params.numConstraintHeaders);
+	// index, maxNormalIndex and targetConstraintIndex are never reset: they keep growing across partitions and iterations.
+	PxI32 maxNormalIndex = 0;
+	PxI32 normalIteration = 0;
+	PxI32 frictionIteration = 0;
+	PxU32 a = 0;
+	PxI32 targetConstraintIndex = 0;
+	for(PxU32 i = 0; i < 2; ++i)
+	{
+		SolveBlockMethod* solveTable = i == 0 ? gVTableSolveBlock : gVTableSolveConcludeBlock;
+		for(; a < positionIterations - 1 + i; ++a)
+		{
+			cache.doFriction = (positionIterations - a) <= 3;
+			for(PxU32 b = 0; b < nbPartitions; ++b)
+			{
+				WAIT_FOR_PROGRESS(constraintIndex2, targetConstraintIndex);
+				// WAIT_FOR_PROGRESS is defined outside this file; presumably it waits until *constraintIndex2 has reached targetConstraintIndex - TODO confirm.
+				maxNormalIndex += headersPerPartition[b];
+				// Solve every header of this partition that falls inside a chunk claimed by this caller.
+				PxI32 nbSolved = 0;
+				while(index < maxNormalIndex)
+				{
+					const PxI32 remainder = PxMin(maxNormalIndex - index, endIndexCount);
+					SolveBlockParallel<false>(constraintList, remainder, index, batchCount, cache, contactIter, solveTable, 
+						normalIteration, frictionIteration, normalIteration);
+					index += remainder;
+					endIndexCount -= remainder;
+					nbSolved += remainder;
+					if(endIndexCount == 0)
+					{
+						endIndexCount = UnrollCount;
+						index = physx::shdfnd::atomicAdd(constraintIndex, UnrollCount) - UnrollCount;
+					}
+				}
+				if(nbSolved)
+				{
+					Ps::memoryBarrier();
+					physx::shdfnd::atomicAdd(constraintIndex2, nbSolved);
+				}
+				targetConstraintIndex += headersPerPartition[b]; //Increment target constraint index by this partition's header count
+			}
+			++normalIteration;
+		}
+	}
+	// bodyListIndex hands out articulation/body chunks; bodyListIndex2 accumulates how many entries have been saved.
+	PxI32* bodyListIndex = &params.bodyListIndex;
+	PxI32* bodyListIndex2 = &params.bodyListIndex2;
+
+	ArticulationSolverDesc* PX_RESTRICT articulationListStart = params.articulationListStart;
+
+	PxSolverBody* PX_RESTRICT bodyListStart = params.bodyListStart;
+	Cm::SpatialVector* PX_RESTRICT motionVelocityArray = params.motionVelocityArray;
+
+	// The shared counter covers articulations first ([0, articulationListSize)) and bodies after them.
+	//Save velocity - articulated
+	PxI32 endIndexCount2 = SaveUnrollCount;
+	PxI32 index2 = physx::shdfnd::atomicAdd(bodyListIndex, SaveUnrollCount) - SaveUnrollCount;
+	{
+		WAIT_FOR_PROGRESS(constraintIndex2, targetConstraintIndex);
+		PxI32 nbConcluded = 0;
+		while(index2 < articulationListSize)
+		{
+			const PxI32 remainder = PxMin(SaveUnrollCount, (articulationListSize - index2));
+			endIndexCount2 -= remainder;
+			for(PxI32 b = 0; b < remainder; ++b, ++index2)
+			{
+				ArticulationPImpl::saveVelocity(articulationListStart[index2]);
+			}
+			if(endIndexCount2 == 0)
+			{
+				index2 = physx::shdfnd::atomicAdd(bodyListIndex, SaveUnrollCount) - SaveUnrollCount;
+				endIndexCount2 = SaveUnrollCount;
+			}
+			nbConcluded += remainder;
+		}
+		// Rebase so that index2 now indexes bodyListStart / motionVelocityArray directly.
+		index2 -= articulationListSize;
+
+		//save velocity
+		
+		// endIndexCount2 may be only partly used up by articulations; the rest of that chunk is spent on bodies first.
+		while(index2 < bodyListSize)
+		{
+			const PxI32 remainder = PxMin(endIndexCount2, (bodyListSize - index2));
+			endIndexCount2 -= remainder;
+			for(PxI32 b = 0; b < remainder; ++b, ++index2)
+			{
+				Ps::prefetchLine(&bodyListStart[index2 + 8]);
+				Ps::prefetchLine(&motionVelocityArray[index2 + 8]);
+				PxSolverBody& body = bodyListStart[index2];
+				Cm::SpatialVector& motionVel = motionVelocityArray[index2];
+				motionVel.linear = body.linearVelocity;
+				motionVel.angular = body.angularState;
+				PX_ASSERT(motionVel.linear.isFinite());
+				PX_ASSERT(motionVel.angular.isFinite());
+			}
+
+			nbConcluded += remainder;
+			
+			//Branch not required because this is the last time we use this atomic variable
+			//if(index2 < articulationListSizePlusbodyListSize)
+			{
+				index2 = physx::shdfnd::atomicAdd(bodyListIndex, SaveUnrollCount) - SaveUnrollCount - articulationListSize;
+				endIndexCount2 = SaveUnrollCount;
+			}
+		}
+
+		if(nbConcluded)
+		{
+			Ps::memoryBarrier();
+			physx::shdfnd::atomicAdd(bodyListIndex2, nbConcluded);
+		}
+	}
+
+	// Velocity iterations must not start before every articulation and body velocity has been saved.
+	WAIT_FOR_PROGRESS(bodyListIndex2, (bodyListSize + articulationListSize));
+	// a starts at 1, so this loop runs velocityIterations - 1 times; the final iteration is the write-back pass further down.
+	a = 1;
+	for(; a < params.velocityIterations; ++a)
+	{
+		for(PxU32 b = 0; b < nbPartitions; ++b)
+		{
+			WAIT_FOR_PROGRESS(constraintIndex2, targetConstraintIndex);
+
+			maxNormalIndex += headersPerPartition[b];
+			
+			PxI32 nbSolved = 0;
+			while(index < maxNormalIndex)
+			{
+				const PxI32 remainder = PxMin(maxNormalIndex - index, endIndexCount);
+				SolveBlockParallel<false>(constraintList, remainder, index, batchCount, cache, contactIter, gVTableSolveBlock, 
+					normalIteration, 0, normalIteration);
+				index += remainder;
+				endIndexCount -= remainder;
+				nbSolved += remainder;
+				if(endIndexCount == 0)
+				{
+					endIndexCount = UnrollCount;
+					index = physx::shdfnd::atomicAdd(constraintIndex, UnrollCount) - UnrollCount;
+				}
+			}
+			if(nbSolved)
+			{
+				Ps::memoryBarrier();
+				physx::shdfnd::atomicAdd(constraintIndex2, nbSolved);
+			}
+			targetConstraintIndex += headersPerPartition[b]; //Increment target constraint index by this partition's header count
+		}
+		++normalIteration;
+	}
+	// Expose the caller's threshold stream through the context before the write-back pass runs.
+	ThresholdStreamElement* PX_RESTRICT thresholdStream = params.thresholdStream;
+	PxU32 thresholdStreamLength = params.thresholdStreamLength;
+	PxI32* outThresholdPairs = params.outThresholdPairs;
+
+	cache.mSharedOutThresholdPairs = outThresholdPairs;
+	cache.mSharedThresholdStream = thresholdStream;
+	cache.mSharedThresholdStreamLength = thresholdStreamLength;
+
+	//Last iteration - do writeback as well!
+	cache.writeBackIteration = true;
+	{
+		for(PxU32 b = 0; b < nbPartitions; ++b)
+		{
+			WAIT_FOR_PROGRESS(constraintIndex2, targetConstraintIndex);
+
+			maxNormalIndex += headersPerPartition[b];
+			
+			PxI32 nbSolved = 0;
+			while(index < maxNormalIndex)
+			{
+				const PxI32 remainder = PxMin(maxNormalIndex - index, endIndexCount);
+
+				SolveBlockParallel<false>(constraintList, remainder, index, batchCount, cache, contactIter, gVTableSolveWriteBackBlock, 
+					normalIteration, 0, normalIteration);
+
+				index += remainder;
+				endIndexCount -= remainder;
+				nbSolved += remainder;
+				if(endIndexCount == 0)
+				{
+					endIndexCount = UnrollCount;
+					index = physx::shdfnd::atomicAdd(constraintIndex, UnrollCount) - UnrollCount;
+				}
+			}
+			if(nbSolved)
+			{
+				Ps::memoryBarrier();
+				physx::shdfnd::atomicAdd(constraintIndex2, nbSolved);
+			}
+			targetConstraintIndex += headersPerPartition[b]; //Increment target constraint index by this partition's header count
+		}
+		// Flush the entries still held in the local temporary stream into the caller's stream.
+		if(cache.mThresholdStreamIndex > 0)
+		{
+			//Write back to global buffer
+			PxI32 threshIndex = physx::shdfnd::atomicAdd(outThresholdPairs, PxI32(cache.mThresholdStreamIndex)) - PxI32(cache.mThresholdStreamIndex);
+			for(PxU32 b = 0; b < cache.mThresholdStreamIndex; ++b)
+			{
+				thresholdStream[b + threshIndex] = cache.mThresholdStream[b];
+			}
+			cache.mThresholdStreamIndex = 0;
+		}
+
+		++normalIteration;
+
+	}
+
+#if PX_PROFILE_SOLVE_STALLS
+
+	
+	PxU64 endTime = readTimer();
+	PxReal totalTime = (PxReal)(endTime - startTime);
+	PxReal stallTime = (PxReal)stallCount;
+	PxReal stallRatio = stallTime/totalTime;
+	if(0)//stallRatio > 0.2f)
+	{
+		LARGE_INTEGER frequency;
+		QueryPerformanceFrequency( &frequency );
+		printf("Warning -- percentage time stalled = %f; stalled for %f seconds; total Time took %f seconds\n", 
+			stallRatio * 100.f, stallTime/(PxReal)frequency.QuadPart, totalTime/(PxReal)frequency.QuadPart);
+	}
+#endif
+	// normalIteration was incremented once per completed iteration, so this is batchCount times the number of iterations run.
+	return normalIteration * batchCount;
+
+}
+
+
+void SolverCoreGeneral::writeBackV
+(const PxSolverConstraintDesc* PX_RESTRICT constraintList, const PxU32 /*constraintListSize*/, PxConstraintBatchHeader* batchHeaders, const PxU32 numBatches,
+ ThresholdStreamElement* PX_RESTRICT thresholdStream, const PxU32 thresholdStreamLength, PxU32& outThresholdPairs,
+ PxSolverBodyData* atomListData, WriteBackBlockMethod writeBackTable[]) const
+{
+	// Context handed to every write-back routine; only the four fields assigned here are initialised.
+	SolverContext context;
+	context.mThresholdStreamIndex	= 0;
+	context.mThresholdStreamLength	= thresholdStreamLength;
+	context.mThresholdStream		= thresholdStream;
+	context.solverBodyArray			= atomListData;
+	for(PxU32 batch = 0; batch != numBatches; ++batch)
+	{
+		// The first byte of a batch's first constraint is read as its type and selects the routine from the table.
+		const PxSolverConstraintDesc* firstDesc = constraintList + batchHeaders[batch].mStartIndex;
+		const PxU8 constraintType = *firstDesc->constraint;
+		writeBackTable[constraintType](firstDesc, batchHeaders[batch].mStride, context);
+	}
+	// NOTE(review): always reports 0 pairs, exactly as before; context.mThresholdStreamIndex is never read back - confirm this is intended.
+	outThresholdPairs = PxU32(PxI32(0));
+}
+
+void solveVBlock(SOLVEV_BLOCK_METHOD_ARGS)	// free-function entry point; solverCore and params are supplied by the SOLVEV_BLOCK_METHOD_ARGS macro
+{
+	solverCore->solveV_Blocks(params);		// forwards straight to the solver core's block solve
+}
+
+}
+}
+
+
+//#endif
diff --git a/PhysX_3.4/Source/LowLevelDynamics/src/DySolverControl.h b/PhysX_3.4/Source/LowLevelDynamics/src/DySolverControl.h
new file mode 100644
index 00000000..bfccb2b6
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelDynamics/src/DySolverControl.h
@@ -0,0 +1,218 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+#ifndef DY_SOLVERCOREGENERAL_H
+#define DY_SOLVERCOREGENERAL_H
+
+#include "DySolverCore.h"
+#include "DySolverConstraintDesc.h"
+
+namespace physx
+{
+
+namespace Dy
+{
+
+struct FsData;
+
+// Spins, re-reading the value behind 'state', until it equals 'requiredState'.
+inline void BusyWaitState(volatile PxU32* state, const PxU32 requiredState)
+{
+	for(;;)
+	{
+		if(*state == requiredState)
+			return;
+	}
+}
+
+// Returns immediately when '*state' already equals 'requiredState'; otherwise falls back to
+// the spinning wait in BusyWaitState.
+inline void WaitBodyRequiredState(PxU32* state, const PxU32 requiredState)
+{
+	if(*state == requiredState)
+		return;
+
+	BusyWaitState(state, requiredState);
+}
+
+// Spins until '*stateA' equals 'requiredStateA', and only then spins until '*stateB' equals
+// 'requiredStateB'. The two conditions are checked one after the other, not simultaneously.
+inline void BusyWaitStates(volatile PxU32* stateA, volatile PxU32* stateB, const PxU32 requiredStateA, const PxU32 requiredStateB)
+{
+	while(requiredStateA != *stateA)
+	{
+	}
+
+	while(requiredStateB != *stateB)
+	{
+	}
+}
+
+
+// Waits until both bodies referenced by 'desc' have reached the solver progress value this
+// constraint requires.
+//
+// For each body the required value is the per-constraint progress stored in 'desc' plus
+// iterationA * maxSolverNormalProgress + iterationB * maxSolverFrictionProgress of that body.
+// A stored progress of 0xFFFF is passed through unchanged as the required value.
+PX_FORCE_INLINE void WaitBodyABodyBRequiredState(const PxSolverConstraintDesc& desc, const PxI32 iterationA, const PxI32 iterationB)
+{
+	PxSolverBody* PX_RESTRICT pBodyA = desc.bodyA;
+	PxSolverBody* PX_RESTRICT pBodyB = desc.bodyB;
+
+	const PxU32 requiredProgressA=(desc.bodyASolverProgress == 0xFFFF) ? 0xFFFF : PxU32(desc.bodyASolverProgress + iterationA * pBodyA->maxSolverNormalProgress + iterationB * pBodyA->maxSolverFrictionProgress);
+	const PxU32 requiredProgressB=(desc.bodyBSolverProgress == 0xFFFF) ? 0xFFFF : PxU32(desc.bodyBSolverProgress + iterationA * pBodyB->maxSolverNormalProgress + iterationB * pBodyB->maxSolverFrictionProgress);
+	// NOTE(review): the sentinel used above is 0xFFFF but this assert compares against
+	// 0xFFFFFFFF - confirm which value was intended.
+	PX_ASSERT(requiredProgressA!=0xFFFFFFFF || requiredProgressB!=0xFFFFFFFF);
+
+	// Plain (non-volatile) reads serve as a quick check before falling back to the spin wait.
+	const PxU32 solverProgressA = pBodyA->solverProgress;
+	const PxU32 solverProgressB = pBodyB->solverProgress;	
+
+	if(solverProgressA != requiredProgressA || solverProgressB != requiredProgressB)
+	{
+		BusyWaitStates(&pBodyA->solverProgress, &pBodyB->solverProgress, requiredProgressA, requiredProgressB);
+	}	
+}
+
+// Advances the solverProgress counter of both bodies referenced by 'desc' by one.
+// A body whose maxSolverNormalProgress is 0xFFFF has its solverProgress set to 0xFFFF instead
+// of being incremented.
+PX_FORCE_INLINE void IncrementBodyProgress(const PxSolverConstraintDesc& desc)
+{
+	PxSolverBody* PX_RESTRICT pBodyA = desc.bodyA;
+	PxSolverBody* PX_RESTRICT pBodyB = desc.bodyB;
+
+	const PxU32 maxProgressA = pBodyA->maxSolverNormalProgress;
+	const PxU32 maxProgressB = pBodyB->maxSolverNormalProgress;
+
+	//NB - this approach removes the need for an imul (which is a non-pipeline instruction on PPC chips)
+	const PxU32 requiredProgressA=(maxProgressA == 0xFFFF) ? 0xFFFF : pBodyA->solverProgress + 1;
+	const PxU32 requiredProgressB=(maxProgressB == 0xFFFF) ? 0xFFFF : pBodyB->solverProgress + 1;
+
+	// The new values are stored through volatile pointers, A first and then B.
+	volatile PxU32* solveProgressA = &pBodyA->solverProgress;
+	volatile PxU32* solveProgressB = &pBodyB->solverProgress;
+
+	*solveProgressA=requiredProgressA;
+	*solveProgressB=requiredProgressB;
+
+}
+
+
+// Cursor over an array of constraint batch headers. It remembers the header that matched the
+// previous lookup so that the next lookup starts searching from there.
+class BatchIterator
+{
+public:
+	PxConstraintBatchHeader* constraintBatchHeaders;	// header array being searched
+	PxU32 mSize;										// number of headers in the array
+	PxU32 mCurrentIndex;								// header index returned by the last lookup
+
+	BatchIterator(PxConstraintBatchHeader* _constraintBatchHeaders, PxU32 size)
+		: constraintBatchHeaders(_constraintBatchHeaders)
+		, mSize(size)
+		, mCurrentIndex(0)
+	{
+	}
+
+	// Returns the header whose range [mStartIndex, mStartIndex + mStride) contains
+	// 'constraintIndex'. The search starts at the remembered position and wraps around
+	// modulo mSize; the loop only ends once a matching header is found.
+	PX_FORCE_INLINE const PxConstraintBatchHeader& GetCurrentHeader(const PxU32 constraintIndex)
+	{
+		PxU32 cursor = mCurrentIndex;
+		for(;;)
+		{
+			const PxConstraintBatchHeader& candidate = constraintBatchHeaders[cursor];
+			if((constraintIndex - candidate.mStartIndex) < candidate.mStride)
+				break;
+			cursor = (cursor + 1)%mSize;
+		}
+
+		Ps::prefetchLine(&constraintBatchHeaders[cursor], 128);
+		mCurrentIndex = cursor;
+		return constraintBatchHeaders[cursor];
+	}
+private:
+	// Declared but not defined: assignment is disabled.
+	BatchIterator& operator=(const BatchIterator&);
+};
+
+
+// Solves 'batchCount' consecutive constraint batches by dispatching each one through 'solveTable'.
+//
+// constraintList    - descriptor array the batch headers index into.
+// batchCount        - number of batch headers to process in this call.
+// index             - global work index; the first header processed is
+//                     index - iteration * headerCount.
+// headerCount       - number of headers in one full pass, used to fold 'index' back into range.
+// cache             - solver context forwarded to every solve method.
+// iterator          - only its constraintBatchHeaders array is used here.
+// solveTable        - solve methods indexed by the header's mConstraintType.
+// normalIteration,
+// frictionIteration - forwarded to WaitBodyABodyBRequiredState when bWaitIncrement is true.
+// iteration         - pass number used together with headerCount to compute the first header.
+//
+// When bWaitIncrement is true, the function waits on the per-body progress counters of every
+// constraint in a batch before solving it, and increments them after a memory barrier once
+// the batch is solved. When false, neither the waits nor the increments are performed.
+template<bool bWaitIncrement>
+void SolveBlockParallel	(PxSolverConstraintDesc* PX_RESTRICT constraintList, const PxI32 batchCount, const PxI32 index,  
+						 const PxI32 headerCount, SolverContext& cache, BatchIterator& iterator,
+						 SolveBlockMethod solveTable[], const PxI32 normalIteration, const PxI32 frictionIteration,
+						 const PxI32 iteration
+						)
+{
+	// Index of the first header within the header array for this pass
+	const PxI32 indA = index - (iteration * headerCount);
+
+	const PxConstraintBatchHeader* PX_RESTRICT headers = iterator.constraintBatchHeaders;
+
+	const PxI32 endIndex = indA + batchCount;
+	for(PxI32 i = indA; i < endIndex; ++i)
+	{
+		const PxConstraintBatchHeader& header = headers[i];
+
+		const PxI32 numToGrab = header.mStride;
+		PxSolverConstraintDesc* PX_RESTRICT block = &constraintList[header.mStartIndex];
+
+		Ps::prefetch(block[0].constraint, 384);
+
+		// Prefetch both bodies of every constraint in the batch and, if requested, wait for them
+		for(PxI32 b = 0; b < numToGrab; ++b)
+		{
+			Ps::prefetchLine(block[b].bodyA);
+			Ps::prefetchLine(block[b].bodyB);
+			if(bWaitIncrement)
+				WaitBodyABodyBRequiredState(block[b], normalIteration, frictionIteration);
+		}
+
+		//OK. We have a number of constraints to run...
+		solveTable[header.mConstraintType](block, PxU32(numToGrab), cache);
+
+		//Increment body progresses
+		if(bWaitIncrement)
+		{
+			// Barrier is issued after the solve and before the progress counters are advanced
+			Ps::memoryBarrier();
+			for(PxI32 j = 0; j < numToGrab; ++j)
+			{
+				IncrementBodyProgress(block[j]);	
+			}
+		}
+	}
+}
+
+
+
+
+// SolverCore implementation declared in this header; the member functions are defined in the
+// corresponding source file.
+class SolverCoreGeneral : public SolverCore
+{
+public:
+	// Factory function returning a new SolverCoreGeneral instance.
+	static SolverCoreGeneral* create();
+
+	// Implements SolverCore
+	virtual void destroyV();
+
+	// Parallel solve entry point operating on the island described by 'params'.
+	virtual PxI32 solveVParallelAndWriteBack
+		(SolverIslandParams& params) const;
+
+	// Block-based solve entry point operating on the island described by 'params'.
+	virtual void solveV_Blocks
+		(SolverIslandParams& params) const;
+
+	// Runs the write-back methods in 'writeBackTable' over 'numBatches' constraint batches;
+	// 'outThresholdPairs' is an output parameter.
+	virtual void writeBackV
+		(const PxSolverConstraintDesc* PX_RESTRICT constraintList, const PxU32 constraintListSize, PxConstraintBatchHeader* contactConstraintBatches, const PxU32 numBatches,
+		 ThresholdStreamElement* PX_RESTRICT thresholdStream, const PxU32 thresholdStreamLength, PxU32& outThresholdPairs,
+		 PxSolverBodyData* atomListData, WriteBackBlockMethod writeBackTable[]) const;
+
+private:
+
+	//~Implements SolverCore
+};
+
+// Parameter list shared by the declaration and the definition of solveVBlock: the solver core
+// to run and the island parameters to pass to it.
+#define SOLVEV_BLOCK_METHOD_ARGS											\
+	SolverCore*	solverCore,												\
+	SolverIslandParams& params
+
+// Calls solverCore->solveV_Blocks(params).
+void solveVBlock(SOLVEV_BLOCK_METHOD_ARGS);
+
+// Accessors for solver method tables; they are only declared in this header.
+// NOTE(review): presumably these return the solve, conclude and solve+write-back tables
+// respectively - confirm against their definitions.
+SolveBlockMethod* getSolveBlockTable();
+
+SolveBlockMethod* getSolverConcludeBlockTable();
+
+SolveWriteBackBlockMethod* getSolveWritebackBlockTable();
+
+
+}
+
+}
+
+#endif //DY_SOLVERCOREGENERAL_H
diff --git a/PhysX_3.4/Source/LowLevelDynamics/src/DySolverControlPF.cpp b/PhysX_3.4/Source/LowLevelDynamics/src/DySolverControlPF.cpp
new file mode 100644
index 00000000..1858da15
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelDynamics/src/DySolverControlPF.cpp
@@ -0,0 +1,755 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+#include "foundation/PxPreprocessor.h"
+#include "PsAllocator.h"
+#include <new>
+#include <stdio.h>
+#include "CmPhysXCommon.h"
+#include "DySolverBody.h"
+#include "DySolverConstraint1D.h"
+#include "DySolverContact.h"
+#include "DyThresholdTable.h"
+#include "DySolverControl.h"
+#include "DyArticulationHelper.h"
+#include "PsAtomic.h"
+#include "PsIntrinsics.h"
+#include "DyArticulationPImpl.h"
+#include "PsThread.h"
+#include "DySolverConstraintDesc.h"
+#include "DySolverContext.h"
+#include "DySolverControlPF.h"
+
+namespace physx
+{
+
+namespace Dy
+{
+//-----------------------------------
+
+// Forward declarations of the block solve methods referenced by the Coulomb method tables
+// below. All of them share the same signature: a pointer to the first constraint descriptor
+// of a batch, the number of descriptors in the batch, and the solver context.
+
+// 1D constraint methods: plain solve
+void solve1DBlock					(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void solveExt1DBlock				(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void solve1D4_Block					(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+
+
+// 1D constraint methods: conclude variants
+void solve1DConcludeBlock				(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void solveExt1DConcludeBlock			(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void solve1D4Block_Conclude				(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+
+// 1D constraint methods: solve + write-back variants
+void solve1DBlockWriteBack				(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void solveExt1DBlockWriteBack			(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void solve1D4Block_WriteBack			(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+
+// 1D constraint methods: write-back only variants
+void writeBack1DBlock				(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void ext1DBlockWriteBack			(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void writeBack1D4Block				(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+
+
+// Coulomb contact and friction methods: plain solve
+void solveFrictionBlock					(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void solveFriction_BStaticBlock			(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void solveExtFrictionBlock				(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void solveContactCoulombBlock			(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void solveExtContactCoulombBlock		(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void solveContactCoulomb_BStaticBlock	(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+
+
+// Coulomb contact methods: conclude variants
+void solveContactCoulombConcludeBlock			(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void solveExtContactCoulombConcludeBlock		(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void solveContactCoulomb_BStaticConcludeBlock	(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+
+// Coulomb contact and friction methods: solve + write-back variants
+void solveContactCoulombBlockWriteBack			(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void solveExtContactCoulombBlockWriteBack		(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void solveContactCoulomb_BStaticBlockWriteBack	(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void solveFrictionBlockWriteBack				(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void solveFriction_BStaticBlockWriteBack		(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void solveExtFrictionBlockWriteBack				(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+
+//Pre-block 1d/2d friction stuff...
+
+void solveContactCoulombPreBlock				(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void solveContactCoulombPreBlock_Static			(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void solveContactCoulombPreBlock_Conclude		(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void solveContactCoulombPreBlock_ConcludeStatic	(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void solveContactCoulombPreBlock_WriteBack		(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void solveContactCoulombPreBlock_WriteBackStatic(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void solveFrictionCoulombPreBlock				(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+
+void solveFrictionCoulombPreBlock_Static		(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void solveFrictionCoulombPreBlock_Conclude		(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void solveFrictionCoulombPreBlock_ConcludeStatic(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+
+void solveFrictionCoulombPreBlock_WriteBack		(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+
+void solveFrictionCoulombPreBlock_WriteBackStatic(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+
+
+// could move this to PxPreprocessor.h but 
+// no implementation available for MSVC
+// Marks the static method tables below as possibly unused on GCC-family compilers; expands
+// to nothing on other compilers.
+#if PX_GCC_FAMILY
+#define PX_UNUSED_ATTRIBUTE __attribute__((unused))
+#else
+#define PX_UNUSED_ATTRIBUTE 
+#endif
+ 
+// Expands to 0 whatever the argument, so table slots written with this macro start out as
+// null entries. SolverCoreRegisterArticulationFnsCoulomb() assigns articulation methods into
+// the tables at run time.
+#define DYNAMIC_ARTICULATION_REGISTRATION(x) 0
+
+
+// Solve methods for regular (non-final) iterations, indexed by constraint type. The trailing
+// comment on each entry names the constraint type that slot is meant for. Entries wrapped in
+// DYNAMIC_ARTICULATION_REGISTRATION are 0 until SolverCoreRegisterArticulationFnsCoulomb() runs.
+static SolveBlockMethod gVTableSolveBlockCoulomb[] PX_UNUSED_ATTRIBUTE = 
+{
+	0,
+	solveContactCoulombBlock,												// DY_SC_TYPE_RB_CONTACT
+	solve1DBlock,															// DY_SC_TYPE_RB_1D
+	DYNAMIC_ARTICULATION_REGISTRATION(solveExtContactCoulombBlock),			// DY_SC_TYPE_EXT_CONTACT
+	DYNAMIC_ARTICULATION_REGISTRATION(solveExt1DBlock),						// DY_SC_TYPE_EXT_1D
+	solveContactCoulomb_BStaticBlock,										// DY_SC_TYPE_STATIC_CONTACT
+	solveContactCoulombBlock,												// DY_SC_TYPE_NOFRICTION_RB_CONTACT
+	solveContactCoulombPreBlock,											// DY_SC_TYPE_BLOCK_RB_CONTACT
+	solveContactCoulombPreBlock_Static,										// DY_SC_TYPE_BLOCK_STATIC_RB_CONTACT
+	solve1D4_Block,															// DY_SC_TYPE_BLOCK_1D,
+	solveFrictionBlock,														// DY_SC_TYPE_FRICTION_CONSTRAINT
+	solveFriction_BStaticBlock,												// DY_SC_TYPE_STATIC_FRICTION_CONSTRAINT
+	DYNAMIC_ARTICULATION_REGISTRATION(solveExtFrictionBlock),				// DY_SC_TYPE_EXT_FRICTION_CONSTRAINT
+	solveFrictionCoulombPreBlock,											// DY_SC_TYPE_BLOCK_FRICTION					
+	solveFrictionCoulombPreBlock_Static										// DY_SC_TYPE_BLOCK_STATIC_FRICTION
+};
+
+// Solve + write-back methods, indexed by constraint type; used for the final velocity
+// iteration in this file. Entries wrapped in DYNAMIC_ARTICULATION_REGISTRATION are 0 until
+// SolverCoreRegisterArticulationFnsCoulomb() runs.
+static SolveWriteBackBlockMethod gVTableSolveWriteBackBlockCoulomb[] PX_UNUSED_ATTRIBUTE = 
+{
+	0,
+	solveContactCoulombBlockWriteBack,												// DY_SC_TYPE_RB_CONTACT
+	solve1DBlockWriteBack,															// DY_SC_TYPE_RB_1D
+	DYNAMIC_ARTICULATION_REGISTRATION(solveExtContactCoulombBlockWriteBack),		// DY_SC_TYPE_EXT_CONTACT
+	DYNAMIC_ARTICULATION_REGISTRATION(solveExt1DBlockWriteBack),					// DY_SC_TYPE_EXT_1D
+	solveContactCoulomb_BStaticBlockWriteBack,										// DY_SC_TYPE_STATIC_CONTACT
+	solveContactCoulombBlockWriteBack,												// DY_SC_TYPE_NOFRICTION_RB_CONTACT
+	solveContactCoulombPreBlock_WriteBack,											// DY_SC_TYPE_BLOCK_RB_CONTACT
+	solveContactCoulombPreBlock_WriteBackStatic,									// DY_SC_TYPE_BLOCK_STATIC_RB_CONTACT
+	solve1D4Block_WriteBack,														// DY_SC_TYPE_BLOCK_1D,
+	solveFrictionBlockWriteBack,													// DY_SC_TYPE_FRICTION_CONSTRAINT
+	solveFriction_BStaticBlockWriteBack,											// DY_SC_TYPE_STATIC_FRICTION_CONSTRAINT
+	DYNAMIC_ARTICULATION_REGISTRATION(solveExtFrictionBlockWriteBack),				// DY_SC_TYPE_EXT_FRICTION_CONSTRAINT
+	solveFrictionCoulombPreBlock_WriteBack,											// DY_SC_TYPE_BLOCK_FRICTION
+	solveFrictionCoulombPreBlock_WriteBackStatic									// DY_SC_TYPE_BLOCK_STATIC_FRICTION
+};
+
+
+// Solve methods used for the last position iteration, indexed by constraint type. Entries
+// wrapped in DYNAMIC_ARTICULATION_REGISTRATION are 0 until
+// SolverCoreRegisterArticulationFnsCoulomb() runs.
+// NOTE(review): the three non-block friction slots reuse the plain solveFriction* methods
+// rather than dedicated conclude variants - presumably intentional, confirm.
+static SolveBlockMethod gVTableSolveConcludeBlockCoulomb[] PX_UNUSED_ATTRIBUTE = 
+{
+	0,
+	solveContactCoulombConcludeBlock,												// DY_SC_TYPE_RB_CONTACT
+	solve1DConcludeBlock,															// DY_SC_TYPE_RB_1D
+	DYNAMIC_ARTICULATION_REGISTRATION(solveExtContactCoulombConcludeBlock),			// DY_SC_TYPE_EXT_CONTACT
+	DYNAMIC_ARTICULATION_REGISTRATION(solveExt1DConcludeBlock),						// DY_SC_TYPE_EXT_1D
+	solveContactCoulomb_BStaticConcludeBlock,										// DY_SC_TYPE_STATIC_CONTACT
+	solveContactCoulombConcludeBlock,												// DY_SC_TYPE_NOFRICTION_RB_CONTACT
+	solveContactCoulombPreBlock_Conclude,											// DY_SC_TYPE_BLOCK_RB_CONTACT
+	solveContactCoulombPreBlock_ConcludeStatic,										// DY_SC_TYPE_BLOCK_STATIC_RB_CONTACT
+	solve1D4Block_Conclude,															// DY_SC_TYPE_BLOCK_1D,
+	solveFrictionBlock,																// DY_SC_TYPE_FRICTION_CONSTRAINT
+	solveFriction_BStaticBlock,														// DY_SC_TYPE_STATIC_FRICTION_CONSTRAINT
+	DYNAMIC_ARTICULATION_REGISTRATION(solveExtFrictionBlock),						// DY_SC_TYPE_EXT_FRICTION_CONSTRAINT
+	solveFrictionCoulombPreBlock_Conclude,											// DY_SC_TYPE_BLOCK_FRICTION
+	solveFrictionCoulombPreBlock_ConcludeStatic										// DY_SC_TYPE_BLOCK_STATIC_FRICTION
+};
+
+
+// Installs the articulation ("Ext") solver methods into the three Coulomb method tables,
+// filling the slots for the EXT_CONTACT, EXT_1D and EXT_FRICTION constraint types.
+void SolverCoreRegisterArticulationFnsCoulomb()
+{
+	//Regular solve table
+	gVTableSolveBlockCoulomb[DY_SC_TYPE_EXT_CONTACT]	= solveExtContactCoulombBlock;
+	gVTableSolveBlockCoulomb[DY_SC_TYPE_EXT_1D]			= solveExt1DBlock;
+	gVTableSolveBlockCoulomb[DY_SC_TYPE_EXT_FRICTION]	= solveExtFrictionBlock;
+
+	//Solve + write-back table
+	gVTableSolveWriteBackBlockCoulomb[DY_SC_TYPE_EXT_CONTACT]	= solveExtContactCoulombBlockWriteBack;
+	gVTableSolveWriteBackBlockCoulomb[DY_SC_TYPE_EXT_1D]		= solveExt1DBlockWriteBack;
+	gVTableSolveWriteBackBlockCoulomb[DY_SC_TYPE_EXT_FRICTION]	= solveExtFrictionBlockWriteBack;
+
+	//Conclude table (friction reuses the plain solve method)
+	gVTableSolveConcludeBlockCoulomb[DY_SC_TYPE_EXT_CONTACT]	= solveExtContactCoulombConcludeBlock;
+	gVTableSolveConcludeBlockCoulomb[DY_SC_TYPE_EXT_1D]			= solveExt1DConcludeBlock;
+	gVTableSolveConcludeBlockCoulomb[DY_SC_TYPE_EXT_FRICTION]	= solveExtFrictionBlock;
+}
+
+// Allocates storage through PX_ALLOC and constructs a SolverCoreGeneralPF in it.
+// Returns the null pointer unchanged when the allocation yields none.
+SolverCoreGeneralPF* SolverCoreGeneralPF::create()
+{
+	SolverCoreGeneralPF* const instance = reinterpret_cast<SolverCoreGeneralPF*>(
+		PX_ALLOC(sizeof(SolverCoreGeneralPF), "SolverCoreGeneral"));
+
+	if(!instance)
+		return instance;
+
+	//Construct in place inside the freshly allocated storage
+	new (instance) SolverCoreGeneralPF;
+	return instance;
+}
+
+// Counterpart of create(): runs the destructor explicitly and then hands the object's
+// storage to PX_FREE. The object must not be used after this call.
+void SolverCoreGeneralPF::destroyV()
+{
+	this->~SolverCoreGeneralPF();
+	PX_FREE(this);
+}
+
+// Solves one island with the Coulomb method tables, without per-body progress waits
+// (SolveBlockParallel<false>).
+//
+// Stages, in order:
+//  1. If the island has no constraint headers, copy each body's velocity into the motion
+//     velocity array, save articulation velocities and return.
+//  2. Run 'positionIterations' passes over the contact constraints; the last pass uses the
+//     conclude table.
+//  3. If there are friction batches, run positionIterations * 2 passes over the friction
+//     constraints; the last pass uses the conclude table.
+//  4. Copy each body's velocity into the motion velocity array and save articulation velocities.
+//  5. Run velocityIterations - 1 passes (contacts, then friction) with the regular solve table.
+//  6. Run the final velocity pass with the solve + write-back table.
+//  7. Flush whatever is left in the local threshold stream to the shared threshold stream.
+//
+// params - island description: constraint lists and batch headers, body and articulation
+//          arrays, iteration counts and the shared threshold stream with its counter.
+void SolverCoreGeneralPF::solveV_Blocks(SolverIslandParams& params) const
+{
+	//Threshold elements are gathered in this local buffer before reaching the shared stream
+	const PxI32 TempThresholdStreamSize = 32;
+	ThresholdStreamElement tempThresholdStream[TempThresholdStreamSize];
+
+	SolverContext cache;
+	cache.solverBodyArray			= params.bodyDataList;
+	cache.mThresholdStream			= tempThresholdStream;
+	cache.mThresholdStreamLength	= TempThresholdStreamSize;
+	cache.mThresholdStreamIndex		= 0;
+	cache.writeBackIteration = false;
+
+	PxI32 batchCount = PxI32(params.numConstraintHeaders);
+
+	PxSolverBody* PX_RESTRICT bodyListStart = params.bodyListStart;
+	const PxU32 bodyListSize = params.bodyListSize;
+
+	Cm::SpatialVector* PX_RESTRICT motionVelocityArray = params.motionVelocityArray;
+
+	const PxU32 velocityIterations = params.velocityIterations;
+	const PxU32 positionIterations = params.positionIterations;
+
+	const PxU32 numConstraintHeaders = params.numConstraintHeaders;
+	const PxU32 articulationListSize = params.articulationListSize;
+
+	ArticulationSolverDesc* PX_RESTRICT articulationListStart = params.articulationListStart;
+
+
+	PX_ASSERT(velocityIterations >= 1);
+	PX_ASSERT(positionIterations >= 1);
+
+	//Stage 1: nothing to solve, just publish the current velocities
+	if(numConstraintHeaders == 0)
+	{
+		for (PxU32 baIdx = 0; baIdx < bodyListSize; baIdx++)
+		{
+			Cm::SpatialVector& motionVel = motionVelocityArray[baIdx];
+			PxSolverBody& atom = bodyListStart[baIdx];
+			motionVel.linear = atom.linearVelocity;
+			motionVel.angular = atom.angularState;
+		}
+
+		for (PxU32 i = 0; i < articulationListSize; i++)
+			ArticulationPImpl::saveVelocity(articulationListStart[i]);
+
+		return;
+	}
+
+	BatchIterator contactIterator(params.constraintBatchHeaders, params.numConstraintHeaders);
+	BatchIterator frictionIterator(params.frictionConstraintBatches, params.numFrictionConstraintHeaders);
+
+
+	PxI32 frictionBatchCount = PxI32(params.numFrictionConstraintHeaders);
+
+	PxSolverConstraintDesc* PX_RESTRICT constraintList = params.constraintList;
+
+	PxSolverConstraintDesc* PX_RESTRICT frictionConstraintList = params.frictionConstraintList;
+
+
+	//Stage 2: position iterations over the contact constraints
+	//0-(n-1) iterations
+	PxI32 normalIter = 0;
+	PxI32 frictionIter = 0;
+	for (PxU32 iteration = positionIterations; iteration > 0; iteration--)	//decreasing positive numbers == position iters
+	{
+
+		SolveBlockParallel<false>(constraintList, batchCount, normalIter * batchCount, batchCount, 
+			cache, contactIterator, iteration == 1 ? gVTableSolveConcludeBlockCoulomb : gVTableSolveBlockCoulomb, normalIter, frictionIter, normalIter);
+		++normalIter;
+	
+	}
+
+	//Stage 3: position iterations over the friction constraints (twice as many passes)
+	if(frictionBatchCount>0)
+	{
+		const PxU32 numIterations = positionIterations * 2;
+		for (PxU32 iteration = numIterations; iteration > 0; iteration--)	//decreasing positive numbers == position iters
+		{
+			SolveBlockParallel<false>(frictionConstraintList, frictionBatchCount, frictionIter * frictionBatchCount, frictionBatchCount, 
+				cache, frictionIterator, iteration == 1 ? gVTableSolveConcludeBlockCoulomb : gVTableSolveBlockCoulomb, normalIter, frictionIter, frictionIter);
+			++frictionIter;
+		}
+	}
+
+	//Stage 4: publish the velocities reached after the position iterations
+	for (PxU32 baIdx = 0; baIdx < bodyListSize; baIdx++)
+	{
+		const PxSolverBody& atom = bodyListStart[baIdx];
+		Cm::SpatialVector& motionVel = motionVelocityArray[baIdx];
+		motionVel.linear = atom.linearVelocity;
+		motionVel.angular = atom.angularState;
+	}
+	
+
+	for (PxU32 i = 0; i < articulationListSize; i++)
+		ArticulationPImpl::saveVelocity(articulationListStart[i]);
+
+
+	//Stage 5: all velocity iterations except the last one
+	const PxU32 velItersMinOne = velocityIterations - 1;
+
+	PxU32 iteration = 0;
+
+	for(; iteration < velItersMinOne; ++iteration)
+	{	
+
+		SolveBlockParallel<false>(constraintList, batchCount, normalIter * batchCount, batchCount, 
+			cache, contactIterator, gVTableSolveBlockCoulomb, normalIter, frictionIter, normalIter);
+		++normalIter;
+
+		if(frictionBatchCount > 0)
+		{
+			SolveBlockParallel<false>(frictionConstraintList, frictionBatchCount, frictionIter * frictionBatchCount, frictionBatchCount, 
+				cache, frictionIterator, gVTableSolveBlockCoulomb, normalIter, frictionIter, frictionIter);
+			++frictionIter;
+		}
+	}
+
+	PxI32* outThresholdPairs = params.outThresholdPairs;
+	ThresholdStreamElement* PX_RESTRICT thresholdStream = params.thresholdStream;
+	PxU32 thresholdStreamLength = params.thresholdStreamLength;
+
+	//Stage 6: final velocity iteration, solving and writing back in one go.
+	//The context is told where the shared threshold stream and its counter live.
+	cache.writeBackIteration = true;
+
+	cache.mSharedOutThresholdPairs = outThresholdPairs;
+	cache.mSharedThresholdStreamLength = thresholdStreamLength;
+	cache.mSharedThresholdStream = thresholdStream;
+
+	for(; iteration < velocityIterations; ++iteration)
+	{
+		SolveBlockParallel<false>(constraintList, batchCount, normalIter * batchCount, batchCount, 
+			cache, contactIterator, gVTableSolveWriteBackBlockCoulomb, normalIter, frictionIter, normalIter);
+		++normalIter;
+
+		if(frictionBatchCount > 0)
+		{
+			SolveBlockParallel<false>(frictionConstraintList, frictionBatchCount, frictionIter * frictionBatchCount, frictionBatchCount, 
+				cache, frictionIterator, gVTableSolveWriteBackBlockCoulomb, normalIter, frictionIter, frictionIter);
+			++frictionIter;
+		}
+	}
+
+	//Stage 7: write back remaining threshold streams
+	if(cache.mThresholdStreamIndex > 0)
+	{
+		//Reserve a range in the global buffer by bumping the shared counter itself.
+		//'outThresholdPairs' already is the pointer to that counter; taking its address (as
+		//the code used to do) bumped the local pointer variable instead and produced a
+		//meaningless destination index.
+		PxI32 threshIndex = physx::shdfnd::atomicAdd(outThresholdPairs, PxI32(cache.mThresholdStreamIndex)) - PxI32(cache.mThresholdStreamIndex);
+		for(PxU32 b = 0; b < cache.mThresholdStreamIndex; ++b)
+		{
+			thresholdStream[b + threshIndex] = cache.mThresholdStream[b];
+		}
+		cache.mThresholdStreamIndex = 0;
+	}
+
+}
+
+PxI32 SolverCoreGeneralPF::solveVParallelAndWriteBack(SolverIslandParams& params) const
+{
+	SolverContext cache;
+	cache.solverBodyArray = params.bodyDataList;
+
+	const PxI32 UnrollCount = PxI32(params.batchSize);
+	const PxI32 SaveUnrollCount = 64;
+
+	const PxI32 TempThresholdStreamSize = 32;
+	ThresholdStreamElement tempThresholdStream[TempThresholdStreamSize];
+
+
+	const PxI32 batchCount = PxI32(params.numConstraintHeaders);
+	const PxI32 frictionBatchCount = PxI32(params.numFrictionConstraintHeaders);//frictionConstraintBatches.size();
+	cache.mThresholdStream = tempThresholdStream;
+	cache.mThresholdStreamLength = TempThresholdStreamSize;
+	cache.mThresholdStreamIndex = 0;
+
+	const PxI32 positionIterations = PxI32(params.positionIterations);
+	const PxU32 velocityIterations = params.velocityIterations;
+
+	const PxI32 bodyListSize = PxI32(params.bodyListSize);
+	const PxI32 articulationListSize = PxI32(params.articulationListSize);
+
+	PX_ASSERT(velocityIterations >= 1);
+	PX_ASSERT(positionIterations >= 1);
+
+	PxI32* constraintIndex = &params.constraintIndex;
+	PxI32* constraintIndex2 = &params.constraintIndex2;
+	PxI32* frictionConstraintIndex = &params.frictionConstraintIndex;
+
+	PxI32 endIndexCount = UnrollCount;
+	PxI32 index = physx::shdfnd::atomicAdd(constraintIndex, UnrollCount) - UnrollCount;
+	PxI32 frictionIndex = physx::shdfnd::atomicAdd(frictionConstraintIndex, UnrollCount) - UnrollCount;
+	
+
+	BatchIterator contactIter(params.constraintBatchHeaders, params.numConstraintHeaders);
+	BatchIterator frictionIter(params.frictionConstraintBatches, params.numFrictionConstraintHeaders);
+
+	PxU32* headersPerPartition = params.headersPerPartition;
+	PxU32 nbPartitions = params.nbPartitions;
+
+	PxU32* frictionHeadersPerPartition = params.frictionHeadersPerPartition;
+	PxU32 nbFrictionPartitions = params.nbFrictionPartitions;
+
+	PxSolverConstraintDesc* PX_RESTRICT constraintList = params.constraintList;
+	PxSolverConstraintDesc* PX_RESTRICT frictionConstraintList = params.frictionConstraintList;
+
+
+	PxI32 maxNormalIndex = 0;
+	PxI32 maxProgress = 0;
+	PxI32 frictionEndIndexCount = UnrollCount;
+	PxI32 maxFrictionIndex = 0;
+
+	PxI32 normalIteration = 0;
+	PxI32 frictionIteration = 0;
+	PxU32 a = 0;
+	for(PxU32 i = 0; i < 2; ++i)
+	{
+		SolveBlockMethod* solveTable = i == 0 ? gVTableSolveBlockCoulomb : gVTableSolveConcludeBlockCoulomb;
+		for(; a < positionIterations - 1 + i; ++a)
+		{
+			for(PxU32 b = 0; b < nbPartitions; ++b)
+			{
+				WAIT_FOR_PROGRESS(constraintIndex2, maxProgress);
+				maxNormalIndex += headersPerPartition[b];
+				maxProgress += headersPerPartition[b];
+				PxI32 nbSolved = 0;
+				while(index < maxNormalIndex)
+				{
+					const PxI32 remainder = PxMin(maxNormalIndex - index, endIndexCount);
+					SolveBlockParallel<false>(constraintList, remainder, index, batchCount, cache, contactIter, solveTable, 
+						normalIteration, frictionIteration, normalIteration);
+					index += remainder;
+					endIndexCount -= remainder;
+					nbSolved += remainder;
+					if(endIndexCount == 0)
+					{
+						endIndexCount = UnrollCount;
+						index = physx::shdfnd::atomicAdd(constraintIndex, UnrollCount) - UnrollCount;
+					}
+				}
+				if(nbSolved)
+				{
+					Ps::memoryBarrier();
+					Ps::atomicAdd(constraintIndex2, nbSolved);
+				}
+			}
+			++normalIteration;
+		}
+
+	}
+
+
+	for(PxU32 i = 0; i < 2; ++i)
+	{
+		SolveBlockMethod* solveTable = i == 0 ? gVTableSolveBlockCoulomb : gVTableSolveConcludeBlockCoulomb;
+		const PxI32 numIterations = positionIterations *2;
+		for(; a <  numIterations - 1 + i; ++a)
+		{
+			for(PxU32 b = 0; b < nbFrictionPartitions; ++b)
+			{
+				WAIT_FOR_PROGRESS(constraintIndex2, maxProgress);
+				maxProgress += frictionHeadersPerPartition[b];
+				maxFrictionIndex += frictionHeadersPerPartition[b];
+				PxI32 nbSolved = 0;
+				while(frictionIndex < maxFrictionIndex)
+				{
+					const PxI32 remainder = PxMin(maxFrictionIndex - frictionIndex, frictionEndIndexCount);
+					SolveBlockParallel<false>(frictionConstraintList, remainder, frictionIndex, frictionBatchCount, cache, frictionIter, 
+						solveTable, normalIteration, frictionIteration, frictionIteration);
+					frictionIndex += remainder;
+					frictionEndIndexCount -= remainder;
+					nbSolved += remainder;
+					if(frictionEndIndexCount == 0)
+					{
+						frictionEndIndexCount = UnrollCount;
+						frictionIndex  = physx::shdfnd::atomicAdd(frictionConstraintIndex, UnrollCount) - UnrollCount;
+					}
+				}
+				if(nbSolved)
+				{
+					Ps::memoryBarrier();
+					Ps::atomicAdd(constraintIndex2, nbSolved);
+				}
+			}
+			++frictionIteration;
+			
+		}
+
+	}
+
+	WAIT_FOR_PROGRESS(constraintIndex2, maxProgress);
+
+	
+	PxI32* bodyListIndex = &params.bodyListIndex;
+
+	ArticulationSolverDesc* PX_RESTRICT articulationListStart = params.articulationListStart;
+
+	PxSolverBody* PX_RESTRICT bodyListStart = params.bodyListStart;
+
+	Cm::SpatialVector* PX_RESTRICT motionVelocityArray = params.motionVelocityArray;
+
+	PxI32* bodyListIndex2 = &params.bodyListIndex2;
+
+	PxI32 endIndexCount2 = SaveUnrollCount;
+	PxI32 index2 = physx::shdfnd::atomicAdd(bodyListIndex, SaveUnrollCount) - SaveUnrollCount;
+	{
+		PxI32 nbConcluded = 0;
+		while(index2 < articulationListSize)
+		{
+			const PxI32 remainder = PxMin(SaveUnrollCount, (articulationListSize - index2));
+			endIndexCount2 -= remainder;
+			for(PxI32 b = 0; b < remainder; ++b, ++index2)
+			{
+				ArticulationPImpl::saveVelocity(articulationListStart[index2]);
+			}
+			nbConcluded += remainder;
+			if(endIndexCount2 == 0)
+			{
+				index2 = physx::shdfnd::atomicAdd(bodyListIndex, SaveUnrollCount) - SaveUnrollCount;
+				endIndexCount2 = SaveUnrollCount;
+			}
+			nbConcluded += remainder;
+		}
+
+		index2 -= articulationListSize;
+
+		//save velocity
+		
+
+		while(index2 < bodyListSize)
+		{
+			const PxI32 remainder = PxMin(endIndexCount2, (bodyListSize - index2));
+			endIndexCount2 -= remainder;
+			for(PxI32 b = 0; b < remainder; ++b, ++index2)
+			{
+				Ps::prefetchLine(&bodyListStart[index2 + 8]);
+				Ps::prefetchLine(&motionVelocityArray[index2 + 8]);
+				PxSolverBody& body = bodyListStart[index2];
+				Cm::SpatialVector& motionVel = motionVelocityArray[index2];
+				motionVel.linear = body.linearVelocity;
+				motionVel.angular = body.angularState;
+				PX_ASSERT(motionVel.linear.isFinite());
+				PX_ASSERT(motionVel.angular.isFinite());
+			}
+
+			nbConcluded += remainder;
+			
+			//Branch not required because this is the last time we use this atomic variable
+			//if(index2 < articulationListSizePlusbodyListSize)
+			{
+				index2 = physx::shdfnd::atomicAdd(bodyListIndex, SaveUnrollCount) - SaveUnrollCount - articulationListSize;
+				endIndexCount2 = SaveUnrollCount;
+			}
+		}
+
+		if(nbConcluded)
+		{
+			Ps::memoryBarrier();
+			physx::shdfnd::atomicAdd(bodyListIndex2, nbConcluded);
+		}
+	}
+
+
+	WAIT_FOR_PROGRESS(bodyListIndex2, (bodyListSize + articulationListSize));
+
+	a = 0;
+	for(; a < velocityIterations-1; ++a)
+	{
+		for(PxU32 b = 0; b < nbPartitions; ++b)
+		{
+			WAIT_FOR_PROGRESS(constraintIndex2, maxProgress);
+			maxNormalIndex += headersPerPartition[b];
+			maxProgress += headersPerPartition[b];
+			
+			PxI32 nbSolved = 0;
+			while(index < maxNormalIndex)
+			{
+				const PxI32 remainder = PxMin(maxNormalIndex - index, endIndexCount);
+				SolveBlockParallel<false>(constraintList, remainder, index, batchCount, cache, contactIter, gVTableSolveBlockCoulomb, normalIteration, frictionIteration, normalIteration);
+				index += remainder;
+				endIndexCount -= remainder;
+				nbSolved += remainder;
+				if(endIndexCount == 0)
+				{
+					endIndexCount = UnrollCount;
+					index = physx::shdfnd::atomicAdd(constraintIndex, UnrollCount) - UnrollCount;
+				}
+			}
+			if(nbSolved)
+			{
+				Ps::memoryBarrier();
+				Ps::atomicAdd(constraintIndex2, nbSolved);
+			}
+		}
+		++normalIteration;
+
+		for(PxU32 b = 0; b < nbFrictionPartitions; ++b)
+		{
+			WAIT_FOR_PROGRESS(constraintIndex2, maxProgress);
+			maxFrictionIndex += frictionHeadersPerPartition[b];
+			maxProgress += frictionHeadersPerPartition[b];
+
+			PxI32 nbSolved = 0;
+			while(frictionIndex < maxFrictionIndex)
+			{
+				const PxI32 remainder = PxMin(maxFrictionIndex - frictionIndex, frictionEndIndexCount);
+				SolveBlockParallel<false>(constraintList, remainder, index, batchCount, cache, contactIter, gVTableSolveBlockCoulomb, 
+					normalIteration, frictionIteration, normalIteration);
+
+				frictionIndex += remainder;
+				frictionEndIndexCount -= remainder;
+				nbSolved += remainder;
+				if(frictionEndIndexCount == 0)
+				{
+					frictionEndIndexCount = UnrollCount;
+					frictionIndex  = physx::shdfnd::atomicAdd(frictionConstraintIndex, UnrollCount) - UnrollCount;
+				}
+			}
+			if(nbSolved)
+			{
+				Ps::memoryBarrier();
+				Ps::atomicAdd(constraintIndex2, nbSolved);
+			}
+		}
+
+		++frictionIteration;
+	}
+
+	ThresholdStreamElement* PX_RESTRICT thresholdStream = params.thresholdStream;
+	const PxU32 thresholdStreamLength = params.thresholdStreamLength;
+	PxI32* outThresholdPairs = params.outThresholdPairs;
+
+	cache.mSharedThresholdStream = thresholdStream;
+	cache.mSharedOutThresholdPairs = outThresholdPairs;
+	cache.mSharedThresholdStreamLength = thresholdStreamLength;
+
+	{
+		for(PxU32 b = 0; b < nbPartitions; ++b)
+		{
+			WAIT_FOR_PROGRESS(constraintIndex2, maxProgress);
+			maxNormalIndex += headersPerPartition[b];
+			maxProgress += headersPerPartition[b];
+			
+			PxI32 nbSolved = 0;
+			while(index < maxNormalIndex)
+			{
+				const PxI32 remainder = PxMin(maxNormalIndex - index, endIndexCount);
+
+				SolveBlockParallel<false>(constraintList, remainder, normalIteration * batchCount, batchCount, 
+					cache, contactIter, gVTableSolveWriteBackBlockCoulomb, normalIteration, frictionIteration, normalIteration);
+
+				index += remainder;
+				endIndexCount -= remainder;
+				nbSolved += remainder;
+				if(endIndexCount == 0)
+				{
+					endIndexCount = UnrollCount;
+					index = physx::shdfnd::atomicAdd(constraintIndex, UnrollCount) - UnrollCount;
+				}
+			}
+			if(nbSolved)
+			{
+				Ps::memoryBarrier();
+				Ps::atomicAdd(constraintIndex2, nbSolved);
+			}
+		}
+
+		++normalIteration;
+
+		cache.mSharedOutThresholdPairs = outThresholdPairs;
+		cache.mSharedThresholdStream = thresholdStream;
+		cache.mSharedThresholdStreamLength = thresholdStreamLength;
+
+		for(PxU32 b = 0; b < nbFrictionPartitions; ++b)
+		{
+			WAIT_FOR_PROGRESS(constraintIndex2, maxProgress);
+			maxFrictionIndex += frictionHeadersPerPartition[b];
+			maxProgress += frictionHeadersPerPartition[b];
+
+			PxI32 nbSolved = 0;
+			while(frictionIndex < maxFrictionIndex)
+			{
+				const PxI32 remainder = PxMin(maxFrictionIndex - frictionIndex, frictionEndIndexCount);
+
+				SolveBlockParallel<false>(frictionConstraintList, remainder, frictionIndex, frictionBatchCount, cache, frictionIter, 
+					gVTableSolveWriteBackBlockCoulomb, normalIteration, frictionIteration, frictionIteration);
+
+				frictionIndex += remainder;
+				frictionEndIndexCount -= remainder;
+				nbSolved += remainder;
+				if(frictionEndIndexCount == 0)
+				{
+					frictionEndIndexCount = UnrollCount;
+					frictionIndex  = physx::shdfnd::atomicAdd(frictionConstraintIndex, UnrollCount) - UnrollCount;
+				}
+			}
+			if(nbSolved)
+			{
+				Ps::memoryBarrier();
+				Ps::atomicAdd(constraintIndex2, nbSolved);
+			}
+		}
+
+		if(cache.mThresholdStreamIndex > 0)
+		{
+			//Write back to global buffer
+			PxI32 threshIndex = physx::shdfnd::atomicAdd(outThresholdPairs, PxI32(cache.mThresholdStreamIndex)) - PxI32(cache.mThresholdStreamIndex);
+			for(PxU32 b = 0; b < cache.mThresholdStreamIndex; ++b)
+			{
+				thresholdStream[b + threshIndex] = cache.mThresholdStream[b];
+			}
+			cache.mThresholdStreamIndex = 0;
+		}
+
+		++frictionIteration;
+	}
+
+	return normalIteration * batchCount + frictionIteration * frictionBatchCount;
+}
+
+
+// Runs the write-back routine of every constraint batch, one batch after another.
+//
+// constraintList        constraint descriptors; each batch header indexes into this array
+// batchHeaders          numBatches headers giving the first descriptor (mStartIndex) and the
+//                       descriptor count (mStride) of each batch
+// thresholdStream,
+// thresholdStreamLength handed to the write-back routines through the solver context
+// outThresholdPairs     always set to 0 by this function (see the note at the end)
+// atomListData          solver body data handed to the write-back routines through the context
+// writeBackTable        write-back routines, indexed by the first byte of a batch's constraint data
+void SolverCoreGeneralPF::writeBackV
+(const PxSolverConstraintDesc* PX_RESTRICT constraintList, const PxU32 /*constraintListSize*/, PxConstraintBatchHeader* batchHeaders, const PxU32 numBatches,
+ ThresholdStreamElement* PX_RESTRICT thresholdStream, const PxU32 thresholdStreamLength, PxU32& outThresholdPairs,
+ PxSolverBodyData* atomListData, WriteBackBlockMethod writeBackTable[]) const
+{
+	// Context shared by all write-back calls below; only these four fields are set here.
+	SolverContext context;
+	context.solverBodyArray = atomListData;
+	context.mThresholdStream = thresholdStream;
+	context.mThresholdStreamLength = thresholdStreamLength;
+	context.mThresholdStreamIndex = 0;
+
+	for(PxU32 batchIdx = 0; batchIdx < numBatches; ++batchIdx)
+	{
+		const PxConstraintBatchHeader& batch = batchHeaders[batchIdx];
+		const PxSolverConstraintDesc* firstDesc = constraintList + batch.mStartIndex;
+
+		// The first byte of the constraint data selects the write-back routine for the whole batch.
+		const PxU8 constraintType = *firstDesc->constraint;
+		writeBackTable[constraintType](firstDesc, batch.mStride, context);
+	}
+
+	// NOTE(review): nothing in this function accumulates a pair count and
+	// context.mThresholdStreamIndex is never read back, so the caller always
+	// receives 0 - confirm that this is intended.
+	outThresholdPairs = 0;
+}
+
+}
+
+}
+
+
+//#endif
diff --git a/PhysX_3.4/Source/LowLevelDynamics/src/DySolverControlPF.h b/PhysX_3.4/Source/LowLevelDynamics/src/DySolverControlPF.h
new file mode 100644
index 00000000..b8684cbb
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelDynamics/src/DySolverControlPF.h
@@ -0,0 +1,71 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+#ifndef DY_SOLVERCONTROLPF_H
+#define DY_SOLVERCONTROLPF_H
+
+#include "DySolverCore.h"
+#include "DySolverConstraintDesc.h"
+
+namespace physx
+{
+
+namespace Dy
+{
+
+// Solver core derived from SolverCore; it overrides each of that interface's pure virtuals.
+// NOTE(review): "PF" presumably names the friction model this core is built for - confirm.
+class SolverCoreGeneralPF : public SolverCore
+{
+public:
+	// Factory function; allocation is decided by its definition, which is not part of this header.
+	static SolverCoreGeneralPF* create();
+
+	// Implements SolverCore
+
+	virtual void destroyV();
+
+	virtual PxI32 solveVParallelAndWriteBack(SolverIslandParams& params) const;
+
+	virtual void solveV_Blocks(SolverIslandParams& params) const;
+
+	virtual void writeBackV(
+		const PxSolverConstraintDesc* PX_RESTRICT constraintList,
+		const PxU32 constraintListSize,
+		PxConstraintBatchHeader* contactConstraintBatches,
+		const PxU32 numBatches,
+		ThresholdStreamElement* PX_RESTRICT thresholdStream,
+		const PxU32 thresholdStreamLength,
+		PxU32& outThresholdPairs,
+		PxSolverBodyData* atomListData,
+		WriteBackBlockMethod writeBackTable[]) const;
+
+	//~Implements SolverCore
+};
+
+}
+
+}
+
+#endif //DY_SOLVERCONTROLPF_H
diff --git a/PhysX_3.4/Source/LowLevelDynamics/src/DySolverCore.h b/PhysX_3.4/Source/LowLevelDynamics/src/DySolverCore.h
new file mode 100644
index 00000000..a6f579f9
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelDynamics/src/DySolverCore.h
@@ -0,0 +1,242 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+#ifndef DY_SOLVERCORE_H
+#define DY_SOLVERCORE_H
+
+#include "PxvConfig.h"
+#include "PsArray.h"
+#include "PsThread.h"
+
+
+namespace physx
+{
+
+struct PxSolverBody;
+struct PxSolverBodyData;
+struct PxSolverConstraintDesc;
+struct PxConstraintBatchHeader;
+
+namespace Dy
+{
+struct ThresholdStreamElement;
+	
+
+struct ArticulationSolverDesc;
+class Articulation;
+struct SolverContext;
+
+// Function-pointer signatures for per-constraint and per-batch solver routines.
+// The three *BlockMethod types below share one signature: a pointer to the first
+// descriptor of a batch, the number of descriptors in it, and the solver context.
+
+// Writes back a single constraint; receives the solver body data of both bodies.
+typedef void (*WriteBackMethod)(const PxSolverConstraintDesc& desc, SolverContext& cache, PxSolverBodyData& sbd0, PxSolverBodyData& sbd1);
+// Solves a single constraint.
+typedef void (*SolveMethod)(const PxSolverConstraintDesc& desc, SolverContext& cache);
+// Solves constraintCount constraints starting at desc.
+typedef void (*SolveBlockMethod)(const PxSolverConstraintDesc* desc, const PxU32 constraintCount, SolverContext& cache);
+// Same signature as SolveBlockMethod; presumably for routines that solve and write back in one pass.
+typedef void (*SolveWriteBackBlockMethod)(const PxSolverConstraintDesc* desc, const PxU32 constraintCount, SolverContext& cache);
+// Writes back constraintCount constraints starting at desc.
+typedef void (*WriteBackBlockMethod)(const PxSolverConstraintDesc* desc, const PxU32 constraintCount, SolverContext& cache);
+
+#define PX_PROFILE_SOLVE_STALLS 0
+#if PX_PROFILE_SOLVE_STALLS
+#if PX_WINDOWS
+#include <windows.h>
+
+
+// Returns the current QueryPerformanceCounter() reading.
+// (__rdtsc() was the commented-out alternative in an earlier form of this function.)
+PX_FORCE_INLINE PxU64 readTimer()
+{
+	LARGE_INTEGER ticks;
+	QueryPerformanceCounter(&ticks);
+
+	const PxU64 reading = ticks.QuadPart;
+	return reading;
+}
+
+#endif
+#endif
+
+
+// When non-zero, WaitForProgressCount() yields the thread after a bounded number of
+// unsuccessful polls; when zero it busy-waits without ever yielding.
+#define YIELD_THREADS 1
+
+#if YIELD_THREADS
+
+// Poll budget WaitForProgressCount() starts with, i.e. before its first Ps::Thread::yield().
+#define ATTEMPTS_BEFORE_BACKOFF 30000
+// Poll budget WaitForProgressCount() re-arms with after each yield.
+#define ATTEMPTS_BEFORE_RETEST 10000
+
+#endif
+
+// Returns once *pGlobalIndex is no longer smaller than targetIndex.
+//
+// With YIELD_THREADS enabled the counter is polled in a spin loop; the first
+// ATTEMPTS_BEFORE_BACKOFF unsuccessful polls are followed by a Ps::Thread::yield(),
+// and every further ATTEMPTS_BEFORE_RETEST unsuccessful polls by another one.
+// With YIELD_THREADS disabled the function spins without yielding.
+//
+// pGlobalIndex   counter to poll
+// targetIndex    value the counter has to reach
+PX_INLINE void WaitForProgressCount(volatile PxI32* pGlobalIndex, const PxI32 targetIndex)
+{
+#if YIELD_THREADS
+	// Nothing to wait for if the target has already been reached.
+	if(*pGlobalIndex >= targetIndex)
+		return;
+
+	PxU32 spinsLeft = ATTEMPTS_BEFORE_BACKOFF;
+	while(*pGlobalIndex < targetIndex)
+	{
+		if(--spinsLeft == 0)
+		{
+			// Spin budget used up: give other threads a chance, then spin again with the shorter budget.
+			Ps::Thread::yield();
+			spinsLeft = ATTEMPTS_BEFORE_RETEST;
+		}
+	}
+#else
+	while(*pGlobalIndex < targetIndex)
+	{
+	}
+#endif
+}
+
+
+#if PX_PROFILE_SOLVE_STALLS
+// Profiling variant: returns once *pGlobalIndex is no longer smaller than targetIndex and
+// adds the time spent spinning (measured with readTimer()) to stallTime. Time spent inside
+// Ps::Thread::yield() is not included.
+//
+// NOTE(review): after a yield this overload re-arms the spin budget with
+// ATTEMPTS_BEFORE_BACKOFF, whereas the two-argument overload uses
+// ATTEMPTS_BEFORE_RETEST - confirm the difference is intended.
+PX_INLINE void WaitForProgressCount(volatile PxI32* pGlobalIndex, const PxI32 targetIndex, PxU64& stallTime)
+{
+	// Nothing to wait for (and nothing to time) if the target has already been reached.
+	if(*pGlobalIndex >= targetIndex)
+		return;
+
+	PxU32 spinsLeft = ATTEMPTS_BEFORE_BACKOFF;
+	for(;;)
+	{
+		const PxU64 spinStart = readTimer();
+
+		bool reached = true;
+		while(*pGlobalIndex < targetIndex)
+		{
+			if(--spinsLeft == 0)
+			{
+				reached = false;
+				break;
+			}
+		}
+
+		stallTime += (readTimer() - spinStart);
+
+		if(reached)
+			break;
+
+		// Spin budget used up: yield, then start a fresh timed spin.
+		Ps::Thread::yield();
+		spinsLeft = ATTEMPTS_BEFORE_BACKOFF;
+	}
+}
+
+// WAIT_FOR_PROGRESS(pGlobalIndex, targetIndex)
+// Tests the counter inline and only calls WaitForProgressCount() when the target has not
+// been reached yet. The profiling flavour additionally passes a variable named stallCount,
+// which must be in scope at the point of use.
+//
+// Both arguments are evaluated more than once, so they must be free of side effects.
+// They are parenthesised in the expansion so that compound expressions
+// (e.g. "ptr + 1" or "a ? b : c") expand as intended.
+// The expansion is a bare if-statement: do not follow a use of the macro with "else".
+#define WAIT_FOR_PROGRESS(pGlobalIndex, targetIndex) if(*(pGlobalIndex) < (targetIndex)) WaitForProgressCount((pGlobalIndex), (targetIndex), stallCount)
+#else
+#define WAIT_FOR_PROGRESS(pGlobalIndex, targetIndex) if(*(pGlobalIndex) < (targetIndex)) WaitForProgressCount((pGlobalIndex), (targetIndex))
+#endif
+// Same as the non-profiling WAIT_FOR_PROGRESS, regardless of PX_PROFILE_SOLVE_STALLS.
+#define WAIT_FOR_PROGRESS_NO_TIMER(pGlobalIndex, targetIndex) if(*(pGlobalIndex) < (targetIndex)) WaitForProgressCount((pGlobalIndex), (targetIndex))
+
+
+// Parameter block passed by reference to SolverCore::solveVParallelAndWriteBack() and
+// SolverCore::solveV_Blocks(). Besides the input arrays it carries PxI32 counters that
+// the parallel solve updates with atomic adds, so a single instance is presumably shared
+// by every thread working on the same island - confirm against the callers.
+struct SolverIslandParams
+{
+	//Default friction model params
+	PxU32 positionIterations;
+	PxU32 velocityIterations;
+	// Solver bodies; the parallel solve copies their linearVelocity/angularState into
+	// motionVelocityArray at the same index.
+	PxSolverBody* PX_RESTRICT bodyListStart;
+	PxSolverBodyData* PX_RESTRICT bodyDataList;
+	PxU32 bodyListSize;
+	PxU32 solverBodyOffset;
+	ArticulationSolverDesc* PX_RESTRICT articulationListStart; 
+	PxU32 articulationListSize;
+	PxSolverConstraintDesc* PX_RESTRICT constraintList;
+	PxConstraintBatchHeader* constraintBatchHeaders;
+	PxU32 numConstraintHeaders;
+	// Per-partition header counts (nbPartitions entries); the parallel solve adds them to
+	// its progress target one partition at a time.
+	PxU32* headersPerPartition;
+	PxU32 nbPartitions;
+	Cm::SpatialVector* PX_RESTRICT motionVelocityArray;
+	PxU32 batchSize;
+	PxsBodyCore*const* bodyArray;
+	PxsRigidBody** PX_RESTRICT rigidBodies;
+
+	//Shared state progress counters
+	// NOTE(review): constraintIndex presumably hands out chunks of constraint work and
+	// constraintIndex2 presumably counts completed work - confirm in the solver cores.
+	PxI32 constraintIndex;
+	PxI32 constraintIndex2;
+	// Advanced with atomicAdd to hand out chunks of the articulation + body list.
+	PxI32 bodyListIndex;
+	// Incremented with atomicAdd by the number of list entries a thread has finished;
+	// waited on until it reaches bodyListSize + articulationListSize.
+	PxI32 bodyListIndex2;
+	PxI32 bodyIntegrationListIndex;
+	PxI32 numObjectsIntegrated;
+
+
+	//Additional 1d/2d friction model params
+	PxSolverConstraintDesc* PX_RESTRICT frictionConstraintList;
+	
+	PxConstraintBatchHeader* frictionConstraintBatches;
+	PxU32 numFrictionConstraintHeaders;
+	// Per-partition friction header counts (nbFrictionPartitions entries).
+	PxU32* frictionHeadersPerPartition;
+	PxU32 nbFrictionPartitions;
+
+	//Additional Shared state progress counters
+	PxI32 frictionConstraintIndex;
+
+	//Write-back threshold information
+	// Destination buffer for threshold stream elements produced during the final solve pass.
+	ThresholdStreamElement* PX_RESTRICT thresholdStream;
+	PxU32 thresholdStreamLength;
+
+	// Element count of thresholdStream; threads atomically add to it to reserve a range
+	// of thresholdStream before copying their locally buffered elements.
+	PxI32* outThresholdPairs;
+};
+
+
+/*!
+Abstract interface implemented by the constraint solver cores.
+
+All operations are pure virtual; instances are released through destroyV().
+*/
+class SolverCore
+{
+public:
+	// Releases this solver core.
+	virtual void destroyV() = 0;
+
+	virtual ~SolverCore() {}
+
+	/*
+	Parallel solve followed by write-back.
+
+	Carried over from the original description of this method:
+	- solves the dual problem by Gauss-Seidel iteration until convergence stops
+	- only the regular velocity vector is used for storing results; the initial state is
+	  backed up and restored, and the solution forces are saved in a vector
+	- no state should be stored, so the function is safe to call from multiple threads
+
+	Returns the total number of constraints that should be solved across all threads.
+	The value is used for synchronization outside of this method.
+	*/
+	virtual PxI32 solveVParallelAndWriteBack(SolverIslandParams& params) const = 0;
+
+	// Solve entry point taking the same parameter block and returning nothing.
+	virtual void solveV_Blocks(SolverIslandParams& params) const = 0;
+
+	// Write-back pass over numConstraintBatches batches of constraintList, using the
+	// routines supplied in writeBackTable.
+	virtual void writeBackV(
+		const PxSolverConstraintDesc* PX_RESTRICT constraintList,
+		const PxU32 constraintListSize,
+		PxConstraintBatchHeader* contactConstraintBatches,
+		const PxU32 numConstraintBatches,
+		ThresholdStreamElement* PX_RESTRICT thresholdStream,
+		const PxU32 thresholdStreamLength,
+		PxU32& outThresholdPairs,
+		PxSolverBodyData* atomListData,
+		WriteBackBlockMethod writeBackTable[]) const = 0;
+};
+
+}
+
+}
+
+#endif //DY_SOLVERCORE_H
diff --git a/PhysX_3.4/Source/LowLevelDynamics/src/DySolverExt.h b/PhysX_3.4/Source/LowLevelDynamics/src/DySolverExt.h
new file mode 100644
index 00000000..18fd5bcc
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelDynamics/src/DySolverExt.h
@@ -0,0 +1,85 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+#ifndef DY_SOLVEREXTBODY_H
+#define DY_SOLVEREXTBODY_H
+
+#include "foundation/PxVec3.h"
+#include "foundation/PxTransform.h"
+#include "CmPhysXCommon.h"
+#include "CmSpatialVector.h"
+
+namespace physx
+{
+
+class PxsRigidBody;
+struct PxsBodyCore;
+struct PxSolverBody;
+struct PxSolverBodyData;
+
+
+namespace Dy
+{
+
+
+struct FsData;
+struct SolverConstraint1D;
+
+// Lightweight handle to one side of a constraint: either a plain solver body or an
+// articulation (FsData), plus the matching body data and a link index.
+// NOTE(review): which union member is the valid one is presumably decided by mLinkIndex -
+// confirm against the member function definitions.
+class SolverExtBody
+{
+public:
+	// Both members alias the same pointer; the constructor always stores through mBody.
+	union
+	{
+		const FsData* mFsData;
+		const PxSolverBody* mBody;
+	};
+	const PxSolverBodyData* mBodyData;
+
+	PxU16 mLinkIndex;
+
+	// Stores the three arguments as given; the untyped pointers are only cast, never dereferenced here.
+	SolverExtBody(const void* bodyOrArticulation, const void* bodyData, PxU16 linkIndex)
+		: mBody(static_cast<const PxSolverBody*>(bodyOrArticulation))
+		, mBodyData(static_cast<const PxSolverBodyData*>(bodyData))
+		, mLinkIndex(linkIndex)
+	{
+	}
+
+	// Writes the linear/angular velocity change for the given impulses into linDeltaV/angDeltaV.
+	void getResponse(const PxVec3& linImpulse, const PxVec3& angImpulse,
+					 PxVec3& linDeltaV, PxVec3& angDeltaV, PxReal dominance) const;
+
+	PxReal projectVelocity(const PxVec3& linear, const PxVec3& angular) const;
+
+	PxVec3 getLinVel() const;
+
+	PxVec3 getAngVel() const;
+};
+
+}
+
+}
+
+#endif //DY_SOLVEREXTBODY_H
diff --git a/PhysX_3.4/Source/LowLevelDynamics/src/DySolverPFConstraints.cpp b/PhysX_3.4/Source/LowLevelDynamics/src/DySolverPFConstraints.cpp
new file mode 100644
index 00000000..e5eb3328
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelDynamics/src/DySolverPFConstraints.cpp
@@ -0,0 +1,868 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+#include "foundation/PxPreprocessor.h"
+#include "PsVecMath.h"
+
+#ifdef PX_SUPPORT_SIMD
+
+#include "CmPhysXCommon.h"
+#include "DySolverBody.h"
+#include "DySolverContact.h"
+#include "DySolverContactPF.h"
+#include "DySolverConstraint1D.h"
+#include "DySolverConstraintDesc.h"
+#include "DyThresholdTable.h"
+#include "DySolverContext.h"
+#include "PsUtilities.h"
+#include "DyConstraint.h"
+#include "PsAtomic.h"
+#include "DyThresholdTable.h"
+#include "DySolverConstraintsShared.h"
+
+namespace physx
+{
+
+namespace Dy
+{
+
+// Solves the normal-contact part of one contact constraint between two solver bodies.
+//
+// desc.constraint is walked as a sequence of SolverContactCoulombHeader records, each
+// followed by numNormalConstr SolverContactPoint entries. The walk stops frictionOffset
+// bytes (taken from the first header) past the start of the data. Each record is handed
+// to solveDynamicContacts() together with local copies of both bodies' velocities; the
+// copies are stored back to desc.bodyA / desc.bodyB after the last record.
+void solveContactCoulomb(const PxSolverConstraintDesc& desc, SolverContext& /*cache*/)
+{
+	PxSolverBody& bodyA = *desc.bodyA;
+	PxSolverBody& bodyB = *desc.bodyB;
+
+	// Local velocity copies, written back once at the end.
+	Vec3V linVelA = V3LoadA(bodyA.linearVelocity);
+	Vec3V linVelB = V3LoadA(bodyB.linearVelocity);
+	Vec3V angStateA = V3LoadA(bodyA.angularState);
+	Vec3V angStateB = V3LoadA(bodyB.angularState);
+
+	// The contact section ends where the first header says the friction section starts.
+	const SolverContactCoulombHeader* PX_RESTRICT leadHeader = reinterpret_cast<SolverContactCoulombHeader*>(desc.constraint);
+	const PxU8* PX_RESTRICT streamEnd = desc.constraint + leadHeader->frictionOffset;
+
+	PxU8* PX_RESTRICT cursor = desc.constraint;
+
+	while(cursor < streamEnd)
+	{
+		SolverContactCoulombHeader* PX_RESTRICT header = reinterpret_cast<SolverContactCoulombHeader*>(cursor);
+		const PxU32 pointCount = header->numNormalConstr;
+
+		const Vec3V contactNormal = header->getNormal();
+		const FloatV invMassDomA = FLoad(header->dominance0);
+		const FloatV invMassDomB = FLoad(header->dominance1);
+		const FloatV angDomA = FLoad(header->angDom0);
+		const FloatV angDomB = FLoad(header->angDom1);
+
+		// The contact points follow the header directly.
+		SolverContactPoint* PX_RESTRICT points = reinterpret_cast<SolverContactPoint*>(cursor + sizeof(SolverContactCoulombHeader));
+
+		// Applied impulses are stored right behind the SolverFrictionHeader located
+		// frictionOffset bytes after this header.
+		PxF32* impulses = reinterpret_cast<PxF32*>(reinterpret_cast<PxU8*>(header) + header->frictionOffset + sizeof(SolverFrictionHeader));
+		Ps::prefetchLine(impulses);
+
+		solveDynamicContacts(points, pointCount, contactNormal, invMassDomA, invMassDomB,
+			angDomA, angDomB, linVelA, angStateA, linVelB, angStateB, impulses);
+
+		// Step over this header and its contact points.
+		cursor += sizeof(SolverContactCoulombHeader) + pointCount * sizeof(SolverContactPoint);
+	}
+
+	// Publish the updated velocities.
+	V3StoreA(linVelA, bodyA.linearVelocity);
+	V3StoreA(linVelB, bodyB.linearVelocity);
+	V3StoreA(angStateA, bodyA.angularState);
+	V3StoreA(angStateB, bodyB.angularState);
+
+	PX_ASSERT(cursor == streamEnd);
+}
+
+// Solves the friction rows of one contact constraint between two solver bodies.
+//
+// desc.constraint is walked over getConstraintLength(desc) bytes as a sequence of
+// SolverFrictionHeader records. Each header is followed by an applied-impulse array
+// (one PxF32 per contact point, padded to getAppliedForcePaddingSize() bytes) and by
+// numFrictionConstr SolverContactFriction rows. Every row's force is updated and
+// clamped to +/- staticFriction * applied impulse of its contact point, and the
+// resulting impulse change is applied to local copies of both bodies' velocities,
+// which are stored back to desc.bodyA / desc.bodyB at the end.
+void solveFriction(const PxSolverConstraintDesc& desc, SolverContext& /*cache*/)
+{
+	PxSolverBody& b0 = *desc.bodyA;
+	PxSolverBody& b1 = *desc.bodyB;
+
+	// Local velocity copies; rows are applied sequentially, each seeing the previous row's result.
+	Vec3V linVel0 = V3LoadA(b0.linearVelocity);
+	Vec3V linVel1 = V3LoadA(b1.linearVelocity);
+	Vec3V angState0 = V3LoadA(b0.angularState);
+	Vec3V angState1 = V3LoadA(b1.angularState);
+
+	PxU8* PX_RESTRICT ptr = desc.constraint;
+	PxU8* PX_RESTRICT currPtr = ptr;
+
+	const PxU8* PX_RESTRICT last = ptr + getConstraintLength(desc);
+
+
+	while(currPtr < last)
+	{
+		const SolverFrictionHeader* PX_RESTRICT frictionHeader = reinterpret_cast<SolverFrictionHeader*>(currPtr);
+		currPtr += sizeof(SolverFrictionHeader);
+		// Applied impulses, indexed by contact point (index j below).
+		PxF32* appliedImpulse = reinterpret_cast<PxF32*>(currPtr);
+		currPtr += frictionHeader->getAppliedForcePaddingSize();
+
+		SolverContactFriction* PX_RESTRICT frictions = reinterpret_cast<SolverContactFriction*>(currPtr);
+		const PxU32 numFrictionConstr = frictionHeader->numFrictionConstr;
+		const PxU32 numNormalConstr = frictionHeader->numNormalConstr;
+
+		// NOTE(review): assumes numNormalConstr is non-zero and that numFrictionConstr is a
+		// multiple of it whenever friction rows are present - confirm against the code that
+		// builds these headers.
+		const PxU32 numFrictionPerPoint = numFrictionConstr/numNormalConstr;
+
+		currPtr += numFrictionConstr * sizeof(SolverContactFriction);
+		const FloatV staticFriction = frictionHeader->getStaticFriction();
+
+		const FloatV invMass0D0 = FLoad(frictionHeader->invMass0D0);
+		const FloatV invMass1D1 = FLoad(frictionHeader->invMass1D1);
+
+		
+		const FloatV angD0 = FLoad(frictionHeader->angDom0);
+		const FloatV angD1 = FLoad(frictionHeader->angDom1);
+
+		// i indexes friction rows, j the contact point that the rows belong to;
+		// each contact point owns numFrictionPerPoint consecutive rows.
+		for(PxU32 i=0, j = 0;i<numFrictionConstr;j++)
+		{
+			for(PxU32 p = 0; p < numFrictionPerPoint; p++, i++)
+			{
+		
+				SolverContactFriction& f = frictions[i];
+				Ps::prefetchLine(&frictions[i], 128);
+
+				// Friction direction and the two angular terms are packed in the xyz parts;
+				// the w components carry the applied force and the velocity multiplier.
+				const Vec3V t0 = Vec3V_From_Vec4V(f.normalXYZ_appliedForceW);
+				const Vec3V raXt0 = Vec3V_From_Vec4V(f.raXnXYZ_velMultiplierW);
+				const Vec3V rbXt0 = Vec3V_From_Vec4V(f.rbXnXYZ_biasW);
+
+				const FloatV appliedForce = V4GetW(f.normalXYZ_appliedForceW);
+				const FloatV velMultiplier = V4GetW(f.raXnXYZ_velMultiplierW);
+
+				const FloatV targetVel = FLoad(f.targetVel);
+
+				// Friction limit: static friction coefficient times the impulse stored for contact point j.
+				const FloatV normalImpulse = FLoad(appliedImpulse[j]);
+				const FloatV maxFriction = FMul(staticFriction, normalImpulse);
+				const FloatV nMaxFriction = FNeg(maxFriction);
+
+				//Compute the relative velocity of the two bodies along the friction direction t0.
+
+				const FloatV t0Vel1 = V3Dot(t0, linVel0);
+				const FloatV t0Vel2 = V3Dot(raXt0, angState0);
+				const FloatV t0Vel3 = V3Dot(t0, linVel1);
+				const FloatV t0Vel4 = V3Dot(rbXt0, angState1);
+
+
+				const FloatV t0Vel = FSub(FAdd(t0Vel1, t0Vel2), FAdd(t0Vel3, t0Vel4));
+
+				const Vec3V delLinVel0 = V3Scale(t0, invMass0D0);
+				const Vec3V delLinVel1 = V3Scale(t0, invMass1D1);
+
+				// still lots to do here: using loop pipelining we can interweave this code with the
+				// above - the code here has a lot of stalls that we would thereby eliminate
+				
+				// New force from the current force, target velocity and relative velocity,
+				// clamped to the friction limit; deltaF is the change actually applied.
+				// (presumably tmp = appliedForce - targetVel * velMultiplier - confirm FNegScaleSub)
+				const FloatV tmp = FNegScaleSub(targetVel,velMultiplier,appliedForce);
+				FloatV newForce = FScaleAdd(t0Vel, velMultiplier, tmp);
+				newForce = FClamp(newForce, nMaxFriction, maxFriction);
+				FloatV deltaF = FSub(newForce, appliedForce);
+
+				// Apply the impulse change to both bodies' local velocity copies.
+				linVel0 = V3ScaleAdd(delLinVel0, deltaF, linVel0);
+				linVel1 = V3NegScaleSub(delLinVel1, deltaF, linVel1);
+				angState0 = V3ScaleAdd(raXt0, FMul(deltaF, angD0), angState0);
+				angState1 = V3NegScaleSub(rbXt0, FMul(deltaF, angD1), angState1);
+
+				f.setAppliedForce(newForce);
+			}
+		}
+	}
+
+	// Write back
+	V3StoreA(linVel0, b0.linearVelocity);
+	V3StoreA(linVel1, b1.linearVelocity);
+	V3StoreA(angState0, b0.angularState);
+	V3StoreA(angState1, b1.angularState);
+
+
+	PX_ASSERT(currPtr == last);
+}
+
+// Variant of solveContactCoulomb() that only reads and updates desc.bodyA; desc.bodyB is
+// never touched.
+//
+// desc.constraint is walked as a sequence of SolverContactCoulombHeader records, each
+// followed by numNormalConstr SolverContactPoint entries, up to frictionOffset bytes
+// (taken from the first header) past the start. Each record is handed to
+// solveStaticContacts() together with local copies of body A's velocities, which are
+// stored back after the last record.
+void solveContactCoulomb_BStatic(const PxSolverConstraintDesc& desc, SolverContext& /*cache*/)
+{
+	PxSolverBody& bodyA = *desc.bodyA;
+
+	// Local velocity copies, written back once at the end.
+	Vec3V linVelA = V3LoadA(bodyA.linearVelocity);
+	Vec3V angStateA = V3LoadA(bodyA.angularState);
+
+	// The contact section ends where the first header says the friction section starts.
+	const SolverContactCoulombHeader* leadHeader = reinterpret_cast<SolverContactCoulombHeader*>(desc.constraint);
+	const PxU8* PX_RESTRICT streamEnd = desc.constraint + leadHeader->frictionOffset;
+
+	PxU8* PX_RESTRICT cursor = desc.constraint;
+
+	while(cursor < streamEnd)
+	{
+		SolverContactCoulombHeader* PX_RESTRICT header = reinterpret_cast<SolverContactCoulombHeader*>(cursor);
+		const PxU32 pointCount = header->numNormalConstr;
+
+		// The contact points follow the header directly.
+		SolverContactPoint* PX_RESTRICT points = reinterpret_cast<SolverContactPoint*>(cursor + sizeof(SolverContactCoulombHeader));
+		Ps::prefetchLine(points);
+
+		// Applied impulses are stored right behind the SolverFrictionHeader located
+		// frictionOffset bytes after this header.
+		PxF32* impulses = reinterpret_cast<PxF32*>(reinterpret_cast<PxU8*>(header) + header->frictionOffset + sizeof(SolverFrictionHeader));
+		Ps::prefetchLine(impulses);
+
+		const Vec3V contactNormal = header->getNormal();
+		const FloatV invMassDomA = FLoad(header->dominance0);
+		const FloatV angDomA = FLoad(header->angDom0);
+
+		solveStaticContacts(points, pointCount, contactNormal, invMassDomA,
+			angDomA, linVelA, angStateA, impulses);
+
+		// Step over this header and its contact points.
+		cursor += sizeof(SolverContactCoulombHeader) + pointCount * sizeof(SolverContactPoint);
+	}
+
+	// Publish the updated velocities.
+	V3StoreA(linVelA, bodyA.linearVelocity);
+	V3StoreA(angStateA, bodyA.angularState);
+
+	PX_ASSERT(cursor == streamEnd);
+}
+
+// Friction iteration for one Coulomb friction constraint in which only bodyA's velocities are read and updated.
+void solveFriction_BStatic(const PxSolverConstraintDesc& desc, SolverContext& /*cache*/)
+{
+	PxSolverBody& b0 = *desc.bodyA;
+
+	Vec3V linVel0 = V3LoadA(b0.linearVelocity);
+	Vec3V angState0 = V3LoadA(b0.angularState);
+
+	PxU8* PX_RESTRICT currPtr = desc.constraint;
+
+	const PxU8* PX_RESTRICT last = currPtr + getConstraintLength(desc);
+
+	while(currPtr < last)
+	{
+		// Each record is a SolverFrictionHeader, then the applied-impulse floats, then the friction rows.
+		const SolverFrictionHeader* PX_RESTRICT frictionHeader = reinterpret_cast<SolverFrictionHeader*>(currPtr);
+		const PxU32 numFrictionConstr = frictionHeader->numFrictionConstr;
+		const PxU32 numNormalConstr = frictionHeader->numNormalConstr;
+		const PxU32 numFrictionPerPoint = numFrictionConstr/numNormalConstr;
+		currPtr +=sizeof(SolverFrictionHeader);
+		PxF32* appliedImpulse = reinterpret_cast<PxF32*>(currPtr);
+		currPtr +=frictionHeader->getAppliedForcePaddingSize();
+
+		SolverContactFriction* PX_RESTRICT frictions = reinterpret_cast<SolverContactFriction*>(currPtr);
+		currPtr += numFrictionConstr * sizeof(SolverContactFriction);
+
+		const FloatV invMass0 = FLoad(frictionHeader->invMass0D0);
+		const FloatV angD0 = FLoad(frictionHeader->angDom0);
+		// angDom1 is not loaded: no velocity of a second body is updated in this function.
+
+		const FloatV staticFriction = frictionHeader->getStaticFriction();
+		// i walks friction rows, j contact points. NOTE(review): assumes numNormalConstr != 0 and an even split - confirm in prep.
+		for(PxU32 i=0, j = 0;i<numFrictionConstr;j++)
+		{
+			for(PxU32 p = 0; p < numFrictionPerPoint; p++, i++)
+			{
+				SolverContactFriction& f = frictions[i];
+				Ps::prefetchLine(&frictions[i+1]);
+
+				const Vec3V t0 = Vec3V_From_Vec4V(f.normalXYZ_appliedForceW);
+				const Vec3V raXt0 = Vec3V_From_Vec4V(f.raXnXYZ_velMultiplierW);
+
+				const FloatV appliedForce = V4GetW(f.normalXYZ_appliedForceW);
+				const FloatV velMultiplier = V4GetW(f.raXnXYZ_velMultiplierW);
+
+				const FloatV targetVel = FLoad(f.targetVel);
+
+				// The friction limit is staticFriction times the normal impulse stored for contact j of this record.
+				const FloatV normalImpulse = FLoad(appliedImpulse[j]);
+				const FloatV maxFriction = FMul(staticFriction, normalImpulse);
+				const FloatV nMaxFriction = FNeg(maxFriction);
+
+				// Velocity of body 0 along this friction row: linear part plus angular part.
+
+				const FloatV t0Vel1 = V3Dot(t0, linVel0);
+				const FloatV t0Vel2 = V3Dot(raXt0, angState0);
+
+				const FloatV t0Vel = FAdd(t0Vel1, t0Vel2);
+
+				const Vec3V delangState0 = V3Scale(raXt0, angD0);
+				const Vec3V delLinVel0 = V3Scale(t0, invMass0);
+
+				// still lots to do here: using loop pipelining we can interweave this code with the
+				// above - the code here has a lot of stalls that we would thereby eliminate
+
+				const FloatV tmp = FNegScaleSub(targetVel,velMultiplier,appliedForce);
+				FloatV newForce = FScaleAdd(t0Vel, velMultiplier, tmp);
+				newForce = FClamp(newForce, nMaxFriction, maxFriction);
+				const FloatV deltaF = FSub(newForce, appliedForce);
+
+				linVel0 = V3ScaleAdd(delLinVel0, deltaF, linVel0);
+				angState0 = V3ScaleAdd(delangState0, deltaF, angState0);
+
+				f.setAppliedForce(newForce);
+			}
+		}
+	}
+
+	// Write back
+	V3StoreA(linVel0, b0.linearVelocity);
+	V3StoreA(angState0, b0.angularState);
+
+	PX_ASSERT(currPtr == last);
+}
+
+
+// Conclude pass for a Coulomb contact constraint: every contact point stored ahead of the friction
+// section has its biasedErr overwritten with its unbiasedErr. Body velocities are not touched.
+void concludeContactCoulomb(const PxSolverConstraintDesc& desc, SolverContext& /*cache*/)
+{
+	PxU8* PX_RESTRICT cursor = desc.constraint;
+
+	// The first header's frictionOffset marks where the contact section stops.
+	const SolverContactCoulombHeader* PX_RESTRICT leadHeader = reinterpret_cast<const SolverContactCoulombHeader*>(cursor);
+	PxU8* PX_RESTRICT contactsEnd = desc.constraint + leadHeader->frictionOffset;
+
+	while(cursor < contactsEnd)
+	{
+		const SolverContactCoulombHeader* PX_RESTRICT patch = reinterpret_cast<const SolverContactCoulombHeader*>(cursor);
+		cursor += sizeof(SolverContactCoulombHeader);
+
+		Ps::prefetchLine(cursor, 128);
+		Ps::prefetchLine(cursor, 256);
+		Ps::prefetchLine(cursor, 384);
+
+		// Ext contact points have their own size; step by whichever size this patch's type selects.
+		PxU32 stride = sizeof(SolverContactPoint);
+		if(patch->type == DY_SC_TYPE_EXT_CONTACT)
+			stride = sizeof(SolverContactPointExt);
+		for(PxU32 remaining = patch->numNormalConstr; remaining != 0; --remaining, cursor += stride)
+		{
+			SolverContactPoint* point = reinterpret_cast<SolverContactPoint*>(cursor);
+			point->biasedErr = point->unbiasedErr;
+		}
+	}
+	PX_ASSERT(cursor == contactsEnd);
+}
+
+// Copies the applied normal impulses of a Coulomb contact constraint into desc.writeBack (when it is
+// non-NULL) and, if the summed impulse qualifies, appends a ThresholdStreamElement to the cache.
+void  writeBackContactCoulomb(const PxSolverConstraintDesc& desc, SolverContext& cache,
+					  PxSolverBodyData& bd0, PxSolverBodyData& bd1)
+{
+	PxU8* PX_RESTRICT cursor = desc.constraint;
+	PxReal* PX_RESTRICT forceOut = reinterpret_cast<PxReal*>(desc.writeBack);
+
+	// The first header bounds the contact section and fixes the contact point size for every patch.
+	const SolverContactCoulombHeader* PX_RESTRICT leadHeader = reinterpret_cast<const SolverContactCoulombHeader*>(cursor);
+	PxU8* PX_RESTRICT contactsEnd = desc.constraint + leadHeader->frictionOffset;
+	const PxU32 stride = leadHeader->type == DY_SC_TYPE_EXT_CONTACT ? sizeof(SolverContactPointExt) : sizeof(SolverContactPoint);
+
+	PxReal impulseSum = 0.f;
+	bool thresholdsEnabled = false;
+
+	while(cursor < contactsEnd)
+	{
+		const SolverContactCoulombHeader* PX_RESTRICT patch = reinterpret_cast<const SolverContactCoulombHeader*>(cursor);
+		cursor += sizeof(SolverContactCoulombHeader);
+		const PxU32 pointCount = patch->numNormalConstr;
+		// Assigned, not OR-ed: the last patch visited decides whether thresholds are reported.
+		thresholdsEnabled = patch->flags & SolverContactHeader::eHAS_FORCE_THRESHOLDS;
+
+		Ps::prefetchLine(cursor, 256);
+		Ps::prefetchLine(cursor, 384);
+
+		if(forceOut != NULL)
+		{
+			// The impulses sit in the float array right after this patch's SolverFrictionHeader.
+			const PxU8* patchBase = reinterpret_cast<const PxU8*>(patch);
+			const PxF32* impulses = reinterpret_cast<const PxF32*>(patchBase + patch->frictionOffset + sizeof(SolverFrictionHeader));
+			for(PxU32 k = 0; k != pointCount; ++k)
+			{
+				const PxF32 imp = impulses[k];
+				*forceOut++ = imp;
+				impulseSum += imp;
+			}
+		}
+		cursor += pointCount * stride;
+	}
+	PX_ASSERT(cursor == contactsEnd);
+
+	// Report only when both link indices are NO_LINK, the sum is non-zero and a threshold is below PX_MAX_REAL.
+	if(!thresholdsEnabled || desc.linkIndexA != PxSolverConstraintDesc::NO_LINK || desc.linkIndexB != PxSolverConstraintDesc::NO_LINK)
+		return;
+	if(!(impulseSum != 0) || !(bd0.reportThreshold < PX_MAX_REAL || bd1.reportThreshold < PX_MAX_REAL))
+		return;
+
+	ThresholdStreamElement entry;
+	entry.normalForce = impulseSum;
+	entry.threshold = PxMin<float>(bd0.reportThreshold, bd1.reportThreshold);
+	entry.nodeIndexA = bd0.nodeIndex;
+	entry.nodeIndexB = bd1.nodeIndex;
+	entry.shapeInteraction = leadHeader->shapeInteraction;
+	Ps::order(entry.nodeIndexA, entry.nodeIndexB);
+	PX_ASSERT(entry.nodeIndexA < entry.nodeIndexB);
+	PX_ASSERT(cache.mThresholdStreamIndex<cache.mThresholdStreamLength);
+	cache.mThresholdStream[cache.mThresholdStreamIndex++] = entry;
+}
+
+
+// Runs solveFriction on each of the constraintCount descriptors in desc, in array order.
+void solveFrictionBlock(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache)
+{
+	const PxSolverConstraintDesc* const descEnd = desc + constraintCount;
+	for(const PxSolverConstraintDesc* cur = desc; cur != descEnd; ++cur)
+		solveFriction(*cur, cache);
+}
+
+
+// Write-back variant of the friction block solve: one solveFriction call per descriptor and nothing else.
+void solveFrictionBlockWriteBack(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache)
+{
+	const PxSolverConstraintDesc* const descEnd = desc + constraintCount;
+	for(const PxSolverConstraintDesc* cur = desc; cur != descEnd; ++cur)
+		solveFriction(*cur, cache);
+}
+
+// Runs solveFriction_BStatic on each of the constraintCount descriptors in desc, in array order.
+void solveFriction_BStaticBlock(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache)
+{
+	const PxSolverConstraintDesc* const descEnd = desc + constraintCount;
+	for(const PxSolverConstraintDesc* cur = desc; cur != descEnd; ++cur)
+		solveFriction_BStatic(*cur, cache);
+}
+
+
+// Conclude variant of the one-body friction block solve: only solveFriction_BStatic runs per descriptor.
+void solveFriction_BStaticConcludeBlock(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache)
+{
+	const PxSolverConstraintDesc* const descEnd = desc + constraintCount;
+	for(const PxSolverConstraintDesc* cur = desc; cur != descEnd; ++cur)
+		solveFriction_BStatic(*cur, cache);
+}
+
+// Write-back variant of the one-body friction block solve: only solveFriction_BStatic runs per descriptor.
+void solveFriction_BStaticBlockWriteBack(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache)
+{
+	const PxSolverConstraintDesc* const descEnd = desc + constraintCount;
+	for(const PxSolverConstraintDesc* cur = desc; cur != descEnd; ++cur)
+		solveFriction_BStatic(*cur, cache);
+}
+
+
+// Runs solveContactCoulomb on each of the constraintCount descriptors in desc, in array order.
+void solveContactCoulombBlock(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache)
+{
+	const PxSolverConstraintDesc* const descEnd = desc + constraintCount;
+	for(const PxSolverConstraintDesc* cur = desc; cur != descEnd; ++cur)
+		solveContactCoulomb(*cur, cache);
+}
+
+void solveContactCoulombConcludeBlock(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache)
+{
+	for(PxU32 left = constraintCount; left != 0; --left, ++desc)
+	{
+		solveContactCoulomb(*desc, cache);		// solver iteration for this descriptor ...
+		concludeContactCoulomb(*desc, cache);	// ... immediately followed by its conclude pass
+	}
+}
+
+// Solves every descriptor, writes back its contact forces / threshold element, and then flushes the
+// local threshold stream into the shared one if it holds more than (mThresholdStreamLength - 4) entries.
+void solveContactCoulombBlockWriteBack(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache)
+{
+	for(const PxSolverConstraintDesc* cur = desc, *end = desc + constraintCount; cur != end; ++cur)
+	{
+		PxSolverBodyData& dataA = cache.solverBodyArray[cur->bodyADataIndex];
+		PxSolverBodyData& dataB = cache.solverBodyArray[cur->bodyBDataIndex];
+		solveContactCoulomb(*cur, cache);
+		writeBackContactCoulomb(*cur, cache, dataA, dataB);
+	}
+
+	if(!(cache.mThresholdStreamIndex > (cache.mThresholdStreamLength - 4)))
+		return;
+	// Reserve shared slots; the subtraction yields the first one (assumes atomicAdd returns the post-add value).
+	const PxI32 pending = PxI32(cache.mThresholdStreamIndex);
+	const PxI32 base = physx::shdfnd::atomicAdd(cache.mSharedOutThresholdPairs, pending) - pending;
+	for(PxU32 k = 0; k < cache.mThresholdStreamIndex; ++k)
+		cache.mSharedThresholdStream[k + base] = cache.mThresholdStream[k];
+	cache.mThresholdStreamIndex = 0;
+}
+
+// Runs solveContactCoulomb_BStatic on each of the constraintCount descriptors in desc, in array order.
+void solveContactCoulomb_BStaticBlock(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache)
+{
+	const PxSolverConstraintDesc* const descEnd = desc + constraintCount;
+	for(const PxSolverConstraintDesc* cur = desc; cur != descEnd; ++cur)
+		solveContactCoulomb_BStatic(*cur, cache);
+}
+
+void solveContactCoulomb_BStaticConcludeBlock(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache)
+{
+	for(PxU32 left = constraintCount; left != 0; --left, ++desc)
+	{
+		solveContactCoulomb_BStatic(*desc, cache);	// solver iteration for this descriptor ...
+		concludeContactCoulomb(*desc, cache);		// ... immediately followed by its conclude pass
+	}
+}
+
+// Solves every descriptor with solveContactCoulomb_BStatic, writes back its contact forces / threshold
+// element, then flushes the local threshold stream if it holds more than (mThresholdStreamLength - 4) entries.
+void solveContactCoulomb_BStaticBlockWriteBack(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache)
+{
+	for(const PxSolverConstraintDesc* cur = desc, *end = desc + constraintCount; cur != end; ++cur)
+	{
+		PxSolverBodyData& dataA = cache.solverBodyArray[cur->bodyADataIndex];
+		PxSolverBodyData& dataB = cache.solverBodyArray[cur->bodyBDataIndex];
+		solveContactCoulomb_BStatic(*cur, cache);
+		writeBackContactCoulomb(*cur, cache, dataA, dataB);
+	}
+
+	if(!(cache.mThresholdStreamIndex > (cache.mThresholdStreamLength - 4)))
+		return;
+
+	// Reserve shared slots; the subtraction yields the first one (assumes atomicAdd returns the post-add value).
+	const PxI32 pending = PxI32(cache.mThresholdStreamIndex);
+	const PxI32 base = physx::shdfnd::atomicAdd(cache.mSharedOutThresholdPairs, pending) - pending;
+	for(PxU32 k = 0; k < cache.mThresholdStreamIndex; ++k)
+		cache.mSharedThresholdStream[k + base] = cache.mThresholdStream[k];
+	cache.mThresholdStreamIndex = 0;
+}
+
+// Contact iteration for one Coulomb contact constraint in which either side may be an articulation link.
+// A side without a link has its velocities stored back directly; a link receives the accumulated impulse.
+void solveExtContactCoulomb(const PxSolverConstraintDesc& desc, SolverContext& /*cache*/)
+{
+	// Fetch the starting velocities of both sides: straight from the solver body when the side has no
+	// articulation link, otherwise from PxcFsGetVelocity for that link.
+	Vec3V linVel0, angVel0, linVel1, angVel1;
+
+	if(desc.linkIndexA == PxSolverConstraintDesc::NO_LINK)
+	{
+		linVel0 = V3LoadA(desc.bodyA->linearVelocity);
+		angVel0 = V3LoadA(desc.bodyA->angularState);
+	}
+	else
+	{
+		Cm::SpatialVectorV v = PxcFsGetVelocity(*desc.articulationA, desc.linkIndexA);
+		linVel0 = v.linear;
+		angVel0 = v.angular;
+	}
+
+	if(desc.linkIndexB == PxSolverConstraintDesc::NO_LINK)
+	{
+		linVel1 = V3LoadA(desc.bodyB->linearVelocity);
+		angVel1 = V3LoadA(desc.bodyB->angularState);
+	}
+	else
+	{
+		Cm::SpatialVectorV v = PxcFsGetVelocity(*desc.articulationB, desc.linkIndexB);
+		linVel1 = v.linear;
+		angVel1 = v.angular;
+	}
+
+	// Only the contact section is walked here; it ends at the first header's frictionOffset.
+
+	PxU8* PX_RESTRICT currPtr = desc.constraint;
+
+	const SolverContactCoulombHeader* PX_RESTRICT firstHeader = reinterpret_cast<SolverContactCoulombHeader*>(currPtr);
+
+	const PxU8* PX_RESTRICT last = desc.constraint + firstHeader->frictionOffset;
+
+	//hopefully pointer aliasing doesn't bite.
+	// Impulse totals over all patches; they are only consumed below for a side that is an articulation link.
+	Vec3V linImpulse0 = V3Zero(), linImpulse1 = V3Zero(), angImpulse0 = V3Zero(), angImpulse1 = V3Zero();
+
+	while(currPtr < last)
+	{
+		const SolverContactCoulombHeader* PX_RESTRICT hdr = reinterpret_cast<SolverContactCoulombHeader*>(currPtr);
+		currPtr += sizeof(SolverContactCoulombHeader);
+
+		const PxU32 numNormalConstr = hdr->numNormalConstr;
+
+		PxF32* appliedImpulse = reinterpret_cast<PxF32*>(const_cast<PxU8*>(((reinterpret_cast<const PxU8*>(hdr)) + hdr->frictionOffset + sizeof(SolverFrictionHeader))));
+		Ps::prefetchLine(appliedImpulse);
+
+		SolverContactPointExt* PX_RESTRICT contacts = reinterpret_cast<SolverContactPointExt*>(currPtr);
+		Ps::prefetchLine(contacts);
+		currPtr += numNormalConstr * sizeof(SolverContactPointExt);
+
+		Vec3V li0 = V3Zero(), li1 = V3Zero(), ai0 = V3Zero(), ai1 = V3Zero();
+
+		const Vec3V normal = hdr->getNormal();
+
+		solveExtContacts(contacts, numNormalConstr, normal, linVel0, angVel0, linVel1, angVel1, li0, ai0, li1, ai1, appliedImpulse);
+		// Fold this patch's impulses into the totals, scaled by the header's dominance terms.
+		linImpulse0 = V3ScaleAdd(li0, FLoad(hdr->dominance0), linImpulse0);		
+		angImpulse0 = V3ScaleAdd(ai0, FLoad(hdr->angDom0), angImpulse0);
+		linImpulse1 = V3NegScaleSub(li1, FLoad(hdr->dominance1), linImpulse1);	
+		angImpulse1 = V3NegScaleSub(ai1, FLoad(hdr->angDom1), angImpulse1);
+	}
+
+	if(desc.linkIndexA == PxSolverConstraintDesc::NO_LINK)
+	{
+		V3StoreA(linVel0, desc.bodyA->linearVelocity);
+		V3StoreA(angVel0, desc.bodyA->angularState);
+	}
+	else
+		PxcFsApplyImpulse(*desc.articulationA, desc.linkIndexA, linImpulse0, angImpulse0);
+
+	if(desc.linkIndexB == PxSolverConstraintDesc::NO_LINK)
+	{
+		V3StoreA(linVel1, desc.bodyB->linearVelocity);
+		V3StoreA(angVel1, desc.bodyB->angularState);
+	}
+	else
+		PxcFsApplyImpulse(*desc.articulationB, desc.linkIndexB, linImpulse1, angImpulse1);
+
+	PX_ASSERT(currPtr == last);
+}
+
+// Friction iteration for one Coulomb friction constraint in which either side may be an articulation link.
+// A side without a link has its velocities stored back directly; a link receives the accumulated impulse.
+void solveExtFriction(const PxSolverConstraintDesc& desc, SolverContext& /*cache*/)
+{
+	Vec3V linVel0, angVel0, linVel1, angVel1;
+
+	if(desc.linkIndexA == PxSolverConstraintDesc::NO_LINK)
+	{
+		linVel0 = V3LoadA(desc.bodyA->linearVelocity);
+		angVel0 = V3LoadA(desc.bodyA->angularState);
+	}
+	else
+	{
+		Cm::SpatialVectorV v = PxcFsGetVelocity(*desc.articulationA, desc.linkIndexA);
+		linVel0 = v.linear;
+		angVel0 = v.angular;
+	}
+
+	if(desc.linkIndexB == PxSolverConstraintDesc::NO_LINK)
+	{
+		linVel1 = V3LoadA(desc.bodyB->linearVelocity);
+		angVel1 = V3LoadA(desc.bodyB->angularState);
+	}
+	else
+	{
+		Cm::SpatialVectorV v = PxcFsGetVelocity(*desc.articulationB, desc.linkIndexB);
+		linVel1 = v.linear;
+		angVel1 = v.angular;
+	}
+
+	//hopefully pointer aliasing doesn't bite.
+	PxU8* PX_RESTRICT currPtr = desc.constraint;
+
+	const PxU8* PX_RESTRICT last = currPtr + desc.constraintLengthOver16*16;
+
+	Vec3V linImpulse0 = V3Zero(), linImpulse1 = V3Zero(), angImpulse0 = V3Zero(), angImpulse1 = V3Zero();
+
+	while(currPtr < last)
+	{
+		// Each record is a SolverFrictionHeader, then the applied-impulse floats, then the ext friction rows.
+		const SolverFrictionHeader* PX_RESTRICT frictionHeader = reinterpret_cast<SolverFrictionHeader*>(currPtr);
+		currPtr += sizeof(SolverFrictionHeader);
+		PxF32* appliedImpulse = reinterpret_cast<PxF32*>(currPtr);
+		currPtr += frictionHeader->getAppliedForcePaddingSize();
+
+		SolverContactFrictionExt* PX_RESTRICT frictions = reinterpret_cast<SolverContactFrictionExt*>(currPtr);
+		const PxU32 numFrictionConstr = frictionHeader->numFrictionConstr;
+
+		currPtr += numFrictionConstr * sizeof(SolverContactFrictionExt);
+		const FloatV staticFriction = frictionHeader->getStaticFriction();
+
+		// Per-record impulse sums; they are scaled by the header's mass/dominance terms after the row loop.
+		Vec3V li0 = V3Zero(), li1 = V3Zero(), ai0 = V3Zero(), ai1 = V3Zero();
+
+		PxU32 numNormalConstr = frictionHeader->numNormalConstr;
+		PxU32 nbFrictionsPerPoint = numFrictionConstr/numNormalConstr;
+
+		// i indexes friction rows and j contact points; each contact owns nbFrictionsPerPoint consecutive
+		// rows, so the inner loop advances i while the outer loop only advances j.
+		// NOTE(review): assumes numNormalConstr != 0 and that it divides numFrictionConstr - confirm in prep.
+		for(PxU32 i = 0, j = 0; i < numFrictionConstr; j++)
+		{
+			for(PxU32 p=0;p<nbFrictionsPerPoint;p++, i++)
+			{
+				SolverContactFrictionExt& f = frictions[i];
+				Ps::prefetchLine(&frictions[i+1]);
+
+
+				const Vec3V t0 = Vec3V_From_Vec4V(f.normalXYZ_appliedForceW);
+				const Vec3V raXt0 = Vec3V_From_Vec4V(f.raXnXYZ_velMultiplierW);
+				const Vec3V rbXt0 = Vec3V_From_Vec4V(f.rbXnXYZ_biasW);
+				// The w lanes of the first two vectors carry the applied force and the velocity multiplier.
+				const FloatV appliedForce = V4GetW(f.normalXYZ_appliedForceW);
+				const FloatV velMultiplier = V4GetW(f.raXnXYZ_velMultiplierW);
+				const FloatV targetVel = FLoad(f.targetVel);
+
+				const FloatV normalImpulse = FLoad(appliedImpulse[j]);//contacts[f.contactIndex].getAppliedForce();
+				const FloatV maxFriction = FMul(staticFriction, normalImpulse);
+				const FloatV nMaxFriction = FNeg(maxFriction);
+
+				// Velocity of side 0 minus side 1 along this friction row, with the row's targetVel added on.
+
+				Vec3V rVel = V3MulAdd(linVel0, t0, V3Mul(angVel0, raXt0));
+				rVel = V3Sub(rVel, V3MulAdd(linVel1, t0, V3Mul(angVel1, rbXt0)));
+				const FloatV t0Vel = FAdd(V3SumElems(rVel), targetVel);
+				// Clamp the accumulated force to +/- staticFriction * normalImpulse and apply only the change.
+				FloatV deltaF = FNeg(FMul(t0Vel, velMultiplier));
+				FloatV newForce = FAdd(appliedForce, deltaF);
+				newForce = FClamp(newForce, nMaxFriction, maxFriction);
+				deltaF = FSub(newForce, appliedForce);
+
+				linVel0 = V3ScaleAdd(f.linDeltaVA, deltaF, linVel0);	
+				angVel0 = V3ScaleAdd(f.angDeltaVA, deltaF, angVel0);
+				linVel1 = V3ScaleAdd(f.linDeltaVB, deltaF, linVel1);	
+				angVel1 = V3ScaleAdd(f.angDeltaVB, deltaF, angVel1);
+
+				li0 = V3ScaleAdd(t0, deltaF, li0);	ai0 = V3ScaleAdd(raXt0, deltaF, ai0);
+				li1 = V3ScaleAdd(t0, deltaF, li1);	ai1 = V3ScaleAdd(rbXt0, deltaF, ai1);
+
+				f.setAppliedForce(newForce);
+			}
+		}
+
+		// Side 1 goes through V3NegScaleSub - presumably so its impulse enters negated; confirm in PsVecMath.
+		linImpulse0 = V3ScaleAdd(li0, FLoad(frictionHeader->invMass0D0), linImpulse0);		
+		angImpulse0 = V3ScaleAdd(ai0, FLoad(frictionHeader->angDom0), angImpulse0);
+		linImpulse1 = V3NegScaleSub(li1, FLoad(frictionHeader->invMass1D1), linImpulse1);	
+		angImpulse1 = V3NegScaleSub(ai1, FLoad(frictionHeader->angDom1), angImpulse1);
+	}
+
+	if(desc.linkIndexA == PxSolverConstraintDesc::NO_LINK)
+	{
+		V3StoreA(linVel0, desc.bodyA->linearVelocity);
+		V3StoreA(angVel0, desc.bodyA->angularState);
+	}
+	else
+		PxcFsApplyImpulse(*desc.articulationA, desc.linkIndexA, linImpulse0, angImpulse0);
+
+	if(desc.linkIndexB == PxSolverConstraintDesc::NO_LINK)
+	{
+		V3StoreA(linVel1, desc.bodyB->linearVelocity);
+		V3StoreA(angVel1, desc.bodyB->angularState);
+	}
+	else
+		PxcFsApplyImpulse(*desc.articulationB, desc.linkIndexB, linImpulse1, angImpulse1);
+
+	PX_ASSERT(currPtr == last);
+}
+
+// Runs solveExtFriction on each of the constraintCount descriptors in desc, in array order.
+void solveExtFrictionBlock(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache)
+{
+	const PxSolverConstraintDesc* const descEnd = desc + constraintCount;
+	for(const PxSolverConstraintDesc* cur = desc; cur != descEnd; ++cur)
+		solveExtFriction(*cur, cache);
+}
+
+// Conclude variant of the ext friction block solve: only solveExtFriction runs per descriptor.
+void solveExtFrictionConcludeBlock(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache)
+{
+	const PxSolverConstraintDesc* const descEnd = desc + constraintCount;
+	for(const PxSolverConstraintDesc* cur = desc; cur != descEnd; ++cur)
+		solveExtFriction(*cur, cache);
+}
+
+// Write-back variant of the ext friction block solve: only solveExtFriction runs per descriptor.
+void solveExtFrictionBlockWriteBack(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache)
+{
+	const PxSolverConstraintDesc* const descEnd = desc + constraintCount;
+	for(const PxSolverConstraintDesc* cur = desc; cur != descEnd; ++cur)
+		solveExtFriction(*cur, cache);
+}
+
+
+void solveConcludeExtContactCoulomb		(const PxSolverConstraintDesc& desc, SolverContext& cache)
+{
+	solveExtContactCoulomb(desc, cache);	// contact solve; either side may be an articulation link
+	concludeContactCoulomb(desc, cache);	// then overwrite each contact's biasedErr with its unbiasedErr
+}
+
+// Runs solveExtContactCoulomb on each of the constraintCount descriptors in desc, in array order.
+void solveExtContactCoulombBlock(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache)
+{
+	const PxSolverConstraintDesc* const descEnd = desc + constraintCount;
+	for(const PxSolverConstraintDesc* cur = desc; cur != descEnd; ++cur)
+		solveExtContactCoulomb(*cur, cache);
+}
+
+void solveExtContactCoulombConcludeBlock(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache)
+{
+	for(PxU32 left = constraintCount; left != 0; --left, ++desc)
+	{
+		solveExtContactCoulomb(*desc, cache);	// solver iteration for this descriptor ...
+		concludeContactCoulomb(*desc, cache);	// ... immediately followed by its conclude pass
+	}
+}
+
+// Solves every descriptor with solveExtContactCoulomb and writes back its contact forces; afterwards any
+// threshold elements collected in the cache (index above 0) are flushed to the shared stream.
+void solveExtContactCoulombBlockWriteBack(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache)
+{
+	for(const PxSolverConstraintDesc* cur = desc, *end = desc + constraintCount; cur != end; ++cur)
+	{
+		// A side whose link index is not NO_LINK is given entry 0 of solverBodyArray instead of a per-body entry.
+		PxSolverBodyData& dataA = cache.solverBodyArray[cur->linkIndexA == PxSolverConstraintDesc::NO_LINK ? cur->bodyADataIndex : 0];
+		PxSolverBodyData& dataB = cache.solverBodyArray[cur->linkIndexB == PxSolverConstraintDesc::NO_LINK ? cur->bodyBDataIndex : 0];
+		solveExtContactCoulomb(*cur, cache);
+		writeBackContactCoulomb(*cur, cache, dataA, dataB);
+	}
+
+	if(!(cache.mThresholdStreamIndex > 0))
+		return;
+
+	const PxI32 pending = PxI32(cache.mThresholdStreamIndex);
+	const PxI32 base = physx::shdfnd::atomicAdd(cache.mSharedOutThresholdPairs, pending) - pending;
+	for(PxU32 k = 0; k < cache.mThresholdStreamIndex; ++k)
+		cache.mSharedThresholdStream[k + base] = cache.mThresholdStream[k];
+	cache.mThresholdStreamIndex = 0;
+}
+
+
+void solveConcludeContactCoulomb			(const PxSolverConstraintDesc& desc, SolverContext& cache)
+{
+	solveContactCoulomb(desc, cache);		// run the contact solve for this descriptor first
+	concludeContactCoulomb(desc, cache);	// then overwrite each contact's biasedErr with its unbiasedErr
+}
+
+
+void solveConcludeContactCoulomb_BStatic	(const PxSolverConstraintDesc& desc, SolverContext& cache)
+{
+	solveContactCoulomb_BStatic(desc, cache);	// run the one-body (B static) contact solve first
+	concludeContactCoulomb(desc, cache);		// then overwrite each contact's biasedErr with its unbiasedErr
+}
+
+
+
+}
+
+}
+
+#endif //PX_SUPPORT_SIMD
diff --git a/PhysX_3.4/Source/LowLevelDynamics/src/DySolverPFConstraintsBlock.cpp b/PhysX_3.4/Source/LowLevelDynamics/src/DySolverPFConstraintsBlock.cpp
new file mode 100644
index 00000000..c6d7288e
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelDynamics/src/DySolverPFConstraintsBlock.cpp
@@ -0,0 +1,985 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+#include "foundation/PxPreprocessor.h"
+#include "PsVecMath.h"
+#include "PsFPU.h"
+#include "CmPhysXCommon.h"
+#include "DySolverBody.h"
+#include "DySolverContactPF4.h"
+#include "DySolverConstraint1D.h"
+#include "DySolverConstraintDesc.h"
+#include "DyThresholdTable.h"
+#include "DySolverContext.h"
+#include "PsUtilities.h"
+#include "DyConstraint.h"
+#include "PsAtomic.h"
+#include "DySolverContact.h"
+
+namespace physx
+{
+
+namespace Dy
+{
+
+// Solves the contact rows of four Coulomb contact constraints at once, one constraint per SIMD lane.
+// Both bodies of desc[0..3] are updated; the constraint data itself is read through desc[0].constraint only.
+static void solveContactCoulomb4_Block(const PxSolverConstraintDesc* PX_RESTRICT desc, SolverContext& /*cache*/)
+{
+	PxSolverBody& b00 = *desc[0].bodyA;
+	PxSolverBody& b01 = *desc[0].bodyB;
+	PxSolverBody& b10 = *desc[1].bodyA;
+	PxSolverBody& b11 = *desc[1].bodyB;
+	PxSolverBody& b20 = *desc[2].bodyA;
+	PxSolverBody& b21 = *desc[2].bodyB;
+	PxSolverBody& b30 = *desc[3].bodyA;
+	PxSolverBody& b31 = *desc[3].bodyB;
+
+	//We'll need this.
+	const Vec4V vZero	= V4Zero();
+
+	Vec4V linVel00 = V4LoadA(&b00.linearVelocity.x);
+	Vec4V linVel01 = V4LoadA(&b01.linearVelocity.x);
+	Vec4V angState00 = V4LoadA(&b00.angularState.x);
+	Vec4V angState01 = V4LoadA(&b01.angularState.x);
+
+	Vec4V linVel10 = V4LoadA(&b10.linearVelocity.x);
+	Vec4V linVel11 = V4LoadA(&b11.linearVelocity.x);
+	Vec4V angState10 = V4LoadA(&b10.angularState.x);
+	Vec4V angState11 = V4LoadA(&b11.angularState.x);
+
+	Vec4V linVel20 = V4LoadA(&b20.linearVelocity.x);
+	Vec4V linVel21 = V4LoadA(&b21.linearVelocity.x);
+	Vec4V angState20 = V4LoadA(&b20.angularState.x);
+	Vec4V angState21 = V4LoadA(&b21.angularState.x);
+
+	Vec4V linVel30 = V4LoadA(&b30.linearVelocity.x);
+	Vec4V linVel31 = V4LoadA(&b31.linearVelocity.x);
+	Vec4V angState30 = V4LoadA(&b30.angularState.x);
+	Vec4V angState31 = V4LoadA(&b31.angularState.x);
+
+	Vec4V linVel0T0, linVel0T1, linVel0T2, linVel0T3;
+	Vec4V linVel1T0, linVel1T1, linVel1T2, linVel1T3;
+	Vec4V angState0T0, angState0T1, angState0T2, angState0T3;
+	Vec4V angState1T0, angState1T1, angState1T2, angState1T3;
+	// Transpose to component-major form: the T0/T1/T2/T3 vectors hold the x/y/z/w of all four descriptors' bodies.
+	PX_TRANSPOSE_44(linVel00, linVel10, linVel20, linVel30, linVel0T0, linVel0T1, linVel0T2, linVel0T3);
+	PX_TRANSPOSE_44(linVel01, linVel11, linVel21, linVel31, linVel1T0, linVel1T1, linVel1T2, linVel1T3);
+	PX_TRANSPOSE_44(angState00, angState10, angState20, angState30, angState0T0, angState0T1, angState0T2, angState0T3);
+	PX_TRANSPOSE_44(angState01, angState11, angState21, angState31, angState1T0, angState1T1, angState1T2, angState1T3);
+
+
+
+
+	//hopefully pointer aliasing doesn't bite.
+	PxU8* PX_RESTRICT currPtr = desc[0].constraint;
+
+	SolverContactCoulombHeader4* PX_RESTRICT firstHeader = reinterpret_cast<SolverContactCoulombHeader4*>(currPtr);
+
+	const PxU8* PX_RESTRICT last = desc[0].constraint + firstHeader->frictionOffset;
+
+	//const PxU8* PX_RESTRICT endPtr = desc[0].constraint + getConstraintLength(desc[0]);
+
+
+	//TODO - can I avoid this many tests???
+	while(currPtr < last)
+	{
+		// One header serves all four constraints; per-constraint values sit in the lanes of its Vec4V fields.
+		SolverContactCoulombHeader4* PX_RESTRICT hdr = reinterpret_cast<SolverContactCoulombHeader4*>(currPtr);
+		// Applied forces are mirrored into the buffer located frictionOffset + sizeof(SolverFrictionHeader4) past this header.
+		Vec4V* appliedForceBuffer = reinterpret_cast<Vec4V*>(currPtr + hdr->frictionOffset + sizeof(SolverFrictionHeader4));
+
+		//PX_ASSERT((PxU8*)appliedForceBuffer < endPtr);
+
+		currPtr = reinterpret_cast<PxU8*>(hdr + 1);
+
+		const PxU32 numNormalConstr = hdr->numNormalConstr;
+
+		SolverContact4Dynamic* PX_RESTRICT contacts = reinterpret_cast<SolverContact4Dynamic*>(currPtr);
+		//const Vec4V dominance1 = V4Neg(__dominance1);
+
+		currPtr = reinterpret_cast<PxU8*>(contacts + numNormalConstr);
+
+		const Vec4V invMass0D0 = hdr->invMassADom;
+		const Vec4V invMass1D1 = hdr->invMassBDom;
+		const Vec4V angD0 = hdr->angD0;
+		const Vec4V angD1 = hdr->angD1;
+
+		const Vec4V normalT0 = hdr->normalX;
+		const Vec4V normalT1 = hdr->normalY;
+		const Vec4V normalT2 = hdr->normalZ;
+		// Linear part of the normal velocity (linear velocity dotted with the normal) for side 0 and side 1.
+		const Vec4V __normalVel1 = V4Mul(linVel0T0, normalT0);
+		const Vec4V __normalVel3 = V4Mul(linVel1T0, normalT0);
+		const Vec4V _normalVel1 = V4MulAdd(linVel0T1, normalT1, __normalVel1);
+		const Vec4V _normalVel3 = V4MulAdd(linVel1T1, normalT1, __normalVel3);
+
+		Vec4V normalVel1 = V4MulAdd(linVel0T2, normalT2, _normalVel1);
+		Vec4V normalVel3 = V4MulAdd(linVel1T2, normalT2, _normalVel3);
+
+		Vec4V accumDeltaF = vZero;
+
+		for(PxU32 i=0;i<numNormalConstr;i++)
+		{
+			SolverContact4Dynamic& c = contacts[i];
+			Ps::prefetchLine((&contacts[i+1]));
+			Ps::prefetchLine((&contacts[i+1]), 128);
+			Ps::prefetchLine((&contacts[i+1]), 256);
+			Ps::prefetchLine((&contacts[i+1]), 384);
+
+			const Vec4V appliedForce = c.appliedForce;
+			const Vec4V velMultiplier = c.velMultiplier;
+
+			const Vec4V targetVel = c.targetVelocity;
+			const Vec4V scaledBias = c.scaledBias;
+			const Vec4V maxImpulse = c.maxImpulse;
+
+			const Vec4V raXnT0 = c.raXnX;
+			const Vec4V raXnT1 = c.raXnY;
+			const Vec4V raXnT2 = c.raXnZ;
+			const Vec4V rbXnT0 = c.rbXnX;
+			const Vec4V rbXnT1 = c.rbXnY;
+			const Vec4V rbXnT2 = c.rbXnZ;
+
+			// Angular part of the normal velocity: raXn / rbXn dotted with the angular state, one component at a time.
+			const Vec4V __normalVel2 = V4Mul(raXnT0, angState0T0);
+			const Vec4V __normalVel4 = V4Mul(rbXnT0, angState1T0);
+
+
+			const Vec4V _normalVel2 = V4MulAdd(raXnT1, angState0T1, __normalVel2);
+			const Vec4V _normalVel4 = V4MulAdd(rbXnT1, angState1T1, __normalVel4);
+
+
+			const Vec4V normalVel2 = V4MulAdd(raXnT2, angState0T2, _normalVel2);
+			const Vec4V normalVel4 = V4MulAdd(rbXnT2, angState1T2, _normalVel4);
+
+			const Vec4V biasedErr = V4MulAdd(targetVel, velMultiplier, V4Neg(scaledBias));
+
+			//Linear component - normal * invMass_dom
+
+			const Vec4V _normalVel(V4Add(normalVel1, normalVel2));
+			const Vec4V __normalVel(V4Add(normalVel3, normalVel4));
+
+			const Vec4V normalVel = V4Sub(_normalVel, __normalVel );
+			// The max/min below keep the accumulated applied force within [0, maxImpulse] in every lane.
+			const Vec4V _deltaF = V4NegMulSub(normalVel, velMultiplier, biasedErr);
+			const Vec4V nAppliedForce = V4Neg(appliedForce);
+			const Vec4V _deltaF2 = V4Max(_deltaF, nAppliedForce);
+			const Vec4V _newAppliedForce(V4Add(appliedForce, _deltaF2));
+			const Vec4V newAppliedForce = V4Min(_newAppliedForce, maxImpulse);
+			const Vec4V deltaF = V4Sub(newAppliedForce, appliedForce);
+			// Update the cached linear normal velocities in place (presumably relies on a unit-length normal - confirm).
+			normalVel1 = V4MulAdd(invMass0D0, deltaF, normalVel1);
+			normalVel3 = V4NegMulSub(invMass1D1, deltaF, normalVel3);
+
+			accumDeltaF = V4Add(deltaF, accumDeltaF);
+
+			const Vec4V deltaFAng0 = V4Mul(angD0, deltaF);
+			const Vec4V deltaFAng1 = V4Mul(angD1, deltaF);
+
+			angState0T0 = V4MulAdd(raXnT0, deltaFAng0, angState0T0);
+			angState1T0 = V4NegMulSub(rbXnT0, deltaFAng1, angState1T0);
+
+			angState0T1 = V4MulAdd(raXnT1, deltaFAng0, angState0T1);
+			angState1T1 = V4NegMulSub(rbXnT1, deltaFAng1, angState1T1);
+
+			angState0T2 = V4MulAdd(raXnT2, deltaFAng0, angState0T2);
+			angState1T2 = V4NegMulSub(rbXnT2, deltaFAng1, angState1T2);
+
+			c.appliedForce = newAppliedForce;
+			appliedForceBuffer[i] = newAppliedForce;
+		}
+		// The linear velocities themselves are updated once per patch from the accumulated deltaF.
+		const Vec4V accumDeltaF0 = V4Mul(accumDeltaF, invMass0D0);
+		const Vec4V accumDeltaF1 = V4Mul(accumDeltaF, invMass1D1);
+
+		linVel0T0 = V4MulAdd(normalT0, accumDeltaF0, linVel0T0);
+		linVel1T0 = V4NegMulSub(normalT0, accumDeltaF1, linVel1T0);
+		linVel0T1 = V4MulAdd(normalT1, accumDeltaF0, linVel0T1);
+		linVel1T1 = V4NegMulSub(normalT1, accumDeltaF1, linVel1T1);
+		linVel0T2 = V4MulAdd(normalT2, accumDeltaF0, linVel0T2);
+		linVel1T2 = V4NegMulSub(normalT2, accumDeltaF1, linVel1T2);
+	}
+
+	PX_ASSERT(currPtr == last);
+
+
+	//KS - we need to use PX_TRANSPOSE_44 here instead of the 34_43 variants because the W components are being used to 
+	//store the bodies' progress counters.
+
+	PX_TRANSPOSE_44(linVel0T0, linVel0T1, linVel0T2, linVel0T3, linVel00, linVel10, linVel20, linVel30);
+	PX_TRANSPOSE_44(linVel1T0, linVel1T1, linVel1T2, linVel1T3, linVel01, linVel11, linVel21, linVel31);
+	PX_TRANSPOSE_44(angState0T0, angState0T1, angState0T2, angState0T3, angState00, angState10, angState20, angState30);
+	PX_TRANSPOSE_44(angState1T0, angState1T1, angState1T2, angState1T3, angState01, angState11, angState21, angState31);
+
+
+	// Write back
+	V4StoreA(linVel00, &b00.linearVelocity.x);
+	V4StoreA(linVel10, &b10.linearVelocity.x);
+	V4StoreA(linVel20, &b20.linearVelocity.x);
+	V4StoreA(linVel30, &b30.linearVelocity.x);
+
+	V4StoreA(linVel01, &b01.linearVelocity.x);
+	V4StoreA(linVel11, &b11.linearVelocity.x);
+	V4StoreA(linVel21, &b21.linearVelocity.x);
+	V4StoreA(linVel31, &b31.linearVelocity.x);
+
+	V4StoreA(angState00, &b00.angularState.x);
+	V4StoreA(angState10, &b10.angularState.x);
+	V4StoreA(angState20, &b20.angularState.x);
+	V4StoreA(angState30, &b30.angularState.x);
+
+	V4StoreA(angState01, &b01.angularState.x);
+	V4StoreA(angState11, &b11.angularState.x);
+	V4StoreA(angState21, &b21.angularState.x);
+	V4StoreA(angState31, &b31.angularState.x);
+}
+// Velocity-solves the normal contact rows of a batch of four constraints (desc[0..3]) where only bodyA of each
+// constraint is read and updated. Body state is transposed so that each Vec4V lane handles one constraint.
+static void solveContactCoulomb4_StaticBlock(const PxSolverConstraintDesc* PX_RESTRICT desc, SolverContext& /*cache*/)
+{
+	PxSolverBody& b00 = *desc[0].bodyA;
+	PxSolverBody& b10 = *desc[1].bodyA;
+	PxSolverBody& b20 = *desc[2].bodyA;
+	PxSolverBody& b30 = *desc[3].bodyA;
+
+	//We'll need this: it is the starting value of the per-header impulse accumulator (accumDeltaF).
+	const Vec4V vZero	= V4Zero();
+	// Load each body's linear velocity and angular state as one Vec4V per body.
+	Vec4V linVel00 = V4LoadA(&b00.linearVelocity.x);
+	Vec4V angState00 = V4LoadA(&b00.angularState.x);
+
+	Vec4V linVel10 = V4LoadA(&b10.linearVelocity.x);
+	Vec4V angState10 = V4LoadA(&b10.angularState.x);
+
+	Vec4V linVel20 = V4LoadA(&b20.linearVelocity.x);
+	Vec4V angState20 = V4LoadA(&b20.angularState.x);
+
+	Vec4V linVel30 = V4LoadA(&b30.linearVelocity.x);
+	Vec4V angState30 = V4LoadA(&b30.angularState.x);
+
+	// Destinations of the transposes below; the T3 vectors are only carried through to the final transpose back.
+	Vec4V linVel0T0, linVel0T1, linVel0T2, linVel0T3;
+	Vec4V angState0T0, angState0T1, angState0T2, angState0T3;
+
+	// After transposing, T0/T1/T2 are combined with the x/y/z constraint vectors, i.e. one body per lane.
+	PX_TRANSPOSE_44(linVel00, linVel10, linVel20, linVel30, linVel0T0, linVel0T1, linVel0T2, linVel0T3);
+	PX_TRANSPOSE_44(angState00, angState10, angState20, angState30, angState0T0, angState0T1, angState0T2, angState0T3);
+	
+
+	//hopefully pointer aliasing doesn't bite.
+	PxU8* PX_RESTRICT currPtr = desc[0].constraint;
+
+	SolverContactCoulombHeader4* PX_RESTRICT firstHeader = reinterpret_cast<SolverContactCoulombHeader4*>(currPtr);
+	// The first header's frictionOffset bounds the contact data walked by the loop below.
+	const PxU8* PX_RESTRICT last = desc[0].constraint + firstHeader->frictionOffset;
+
+
+	//TODO - can I avoid this many tests???
+	while(currPtr < last)
+	{
+		// Layout consumed per iteration: one SolverContactCoulombHeader4, then numNormalConstr SolverContact4Base entries.
+		SolverContactCoulombHeader4* PX_RESTRICT hdr = reinterpret_cast<SolverContactCoulombHeader4*>(currPtr);
+		// Vec4V array that starts sizeof(SolverFrictionHeader4) bytes beyond (this header + frictionOffset); it gets a copy of every new applied force.
+		Vec4V* appliedForceBuffer = reinterpret_cast<Vec4V*>(currPtr + hdr->frictionOffset + sizeof(SolverFrictionHeader4));
+
+		currPtr = reinterpret_cast<PxU8*>(hdr + 1);
+
+		const PxU32 numNormalConstr = hdr->numNormalConstr;
+
+		SolverContact4Base* PX_RESTRICT contacts = reinterpret_cast<SolverContact4Base*>(currPtr);
+	
+		currPtr = reinterpret_cast<PxU8*>(contacts + numNormalConstr);
+		// invMass0D0 scales the linear response and angD0 the angular response to an impulse (see their uses below).
+		const Vec4V invMass0D0 = hdr->invMassADom;
+		const Vec4V angD0 = hdr->angD0;
+
+		const Vec4V normalT0 = hdr->normalX;
+		const Vec4V normalT1 = hdr->normalY;
+		const Vec4V normalT2 = hdr->normalZ;
+		// normalVel1 accumulates linVel . normal per lane; inside the loop it is updated incrementally instead of recomputed.
+		const Vec4V __normalVel1 = V4Mul(linVel0T0, normalT0);
+		const Vec4V _normalVel1 = V4MulAdd(linVel0T1, normalT1, __normalVel1);
+
+		Vec4V normalVel1 = V4MulAdd(linVel0T2, normalT2, _normalVel1);
+
+		Vec4V accumDeltaF = vZero;
+
+		for(PxU32 i=0;i<numNormalConstr;i++)
+		{
+			SolverContact4Base& c = contacts[i];
+			Ps::prefetchLine((&contacts[i+1]));
+			Ps::prefetchLine((&contacts[i+1]), 128);
+			Ps::prefetchLine((&contacts[i+1]), 256);
+
+			const Vec4V appliedForce = c.appliedForce;
+			const Vec4V velMultiplier = c.velMultiplier;
+			
+			const Vec4V targetVel = c.targetVelocity;
+			const Vec4V scaledBias = c.scaledBias;
+			const Vec4V maxImpulse = c.maxImpulse;
+
+			const Vec4V raXnT0 = c.raXnX;
+			const Vec4V raXnT1 = c.raXnY;
+			const Vec4V raXnT2 = c.raXnZ;
+			// normalVel2 accumulates raXn . angState per lane.
+			
+			const Vec4V __normalVel2 = V4Mul(raXnT0, angState0T0);
+			
+			const Vec4V _normalVel2 = V4MulAdd(raXnT1, angState0T1, __normalVel2);
+			
+			const Vec4V normalVel2 = V4MulAdd(raXnT2, angState0T2, _normalVel2);
+			// NOTE(review): assuming the usual PhysX vector-math meaning V4MulAdd(a,b,c) = a*b + c and V4NegMulSub(a,b,c) = c - a*b - confirm.
+			const Vec4V biasedErr = V4MulAdd(targetVel, velMultiplier, V4Neg(scaledBias));
+
+			//Linear component - normal * invMass_dom
+
+			const Vec4V normalVel(V4Add(normalVel1, normalVel2));
+
+			const Vec4V _deltaF = V4NegMulSub(normalVel, velMultiplier, biasedErr);
+			const Vec4V nAppliedForce = V4Neg(appliedForce);
+			// Two-sided clamp: first keep appliedForce + deltaF >= 0, then cap the new applied force at maxImpulse.
+			const Vec4V _deltaF2 = V4Max(_deltaF, nAppliedForce);
+
+			const Vec4V _newAppliedForce(V4Add(appliedForce, _deltaF2));
+			const Vec4V newAppliedForce = V4Min(_newAppliedForce, maxImpulse);
+			const Vec4V deltaF = V4Sub(newAppliedForce, appliedForce);
+			const Vec4V deltaAngF = V4Mul(deltaF, angD0);
+			// Apply the impulse change right away to the cached linear dot product and to the angular state.
+			normalVel1 = V4MulAdd(invMass0D0, deltaF, normalVel1);
+
+			accumDeltaF = V4Add(deltaF, accumDeltaF);
+
+			angState0T0 = V4MulAdd(raXnT0, deltaAngF, angState0T0);
+			angState0T1 = V4MulAdd(raXnT1, deltaAngF, angState0T1);
+			angState0T2 = V4MulAdd(raXnT2, deltaAngF, angState0T2);
+
+			c.appliedForce = newAppliedForce;
+			appliedForceBuffer[i] = newAppliedForce;
+		}
+		const Vec4V scaledAccumDeltaF = V4Mul(accumDeltaF, invMass0D0);	// the linear velocity is updated once per header from the summed impulse change
+		linVel0T0 = V4MulAdd(normalT0, scaledAccumDeltaF, linVel0T0);
+		linVel0T1 = V4MulAdd(normalT1, scaledAccumDeltaF, linVel0T1);
+		linVel0T2 = V4MulAdd(normalT2, scaledAccumDeltaF, linVel0T2);
+	}
+
+	PX_ASSERT(currPtr == last);
+	
+	//KS - we need to use PX_TRANSPOSE_44 here instead of the 34_43 variants because the W components are being used to 
+	//store the bodies' progress counters.
+
+	PX_TRANSPOSE_44(linVel0T0, linVel0T1, linVel0T2, linVel0T3, linVel00, linVel10, linVel20, linVel30);
+	PX_TRANSPOSE_44(angState0T0, angState0T1, angState0T2, angState0T3, angState00, angState10, angState20, angState30);
+
+	// Write back
+	// (only bodyA of each constraint is stored; bodyB is never touched by this function)
+	V4StoreA(linVel00, &b00.linearVelocity.x);
+	V4StoreA(linVel10, &b10.linearVelocity.x);
+	V4StoreA(linVel20, &b20.linearVelocity.x);
+	V4StoreA(linVel30, &b30.linearVelocity.x);
+
+	V4StoreA(angState00, &b00.angularState.x);
+	V4StoreA(angState10, &b10.angularState.x);
+	V4StoreA(angState20, &b20.angularState.x);
+	V4StoreA(angState30, &b30.angularState.x);
+}
+// Velocity-solves the friction rows of a batch of four two-body constraints (desc[0..3]); one constraint per Vec4V lane.
+static void solveFriction4_Block(const PxSolverConstraintDesc* PX_RESTRICT desc, SolverContext& /*cache*/)
+{
+	PxSolverBody& b00 = *desc[0].bodyA;
+	PxSolverBody& b01 = *desc[0].bodyB;
+	PxSolverBody& b10 = *desc[1].bodyA;
+	PxSolverBody& b11 = *desc[1].bodyB;
+	PxSolverBody& b20 = *desc[2].bodyA;
+	PxSolverBody& b21 = *desc[2].bodyB;
+	PxSolverBody& b30 = *desc[3].bodyA;
+	PxSolverBody& b31 = *desc[3].bodyB;
+
+	// Naming: first digit = constraint index (0..3), second digit = 0 for bodyA / 1 for bodyB.
+	Vec4V linVel00 = V4LoadA(&b00.linearVelocity.x);
+	Vec4V linVel01 = V4LoadA(&b01.linearVelocity.x);
+	Vec4V angState00 = V4LoadA(&b00.angularState.x);
+	Vec4V angState01 = V4LoadA(&b01.angularState.x);
+
+	Vec4V linVel10 = V4LoadA(&b10.linearVelocity.x);
+	Vec4V linVel11 = V4LoadA(&b11.linearVelocity.x);
+	Vec4V angState10 = V4LoadA(&b10.angularState.x);
+	Vec4V angState11 = V4LoadA(&b11.angularState.x);
+
+	Vec4V linVel20 = V4LoadA(&b20.linearVelocity.x);
+	Vec4V linVel21 = V4LoadA(&b21.linearVelocity.x);
+	Vec4V angState20 = V4LoadA(&b20.angularState.x);
+	Vec4V angState21 = V4LoadA(&b21.angularState.x);
+
+	Vec4V linVel30 = V4LoadA(&b30.linearVelocity.x);
+	Vec4V linVel31 = V4LoadA(&b31.linearVelocity.x);
+	Vec4V angState30 = V4LoadA(&b30.angularState.x);
+	Vec4V angState31 = V4LoadA(&b31.angularState.x);
+
+	// Transposed state: the <name>0T* vectors hold the bodyA set, the <name>1T* vectors the bodyB set.
+	Vec4V linVel0T0, linVel0T1, linVel0T2, linVel0T3;
+	Vec4V linVel1T0, linVel1T1, linVel1T2, linVel1T3;
+	Vec4V angState0T0, angState0T1, angState0T2, angState0T3;
+	Vec4V angState1T0, angState1T1, angState1T2, angState1T3;
+
+
+	PX_TRANSPOSE_44(linVel00, linVel10, linVel20, linVel30, linVel0T0, linVel0T1, linVel0T2, linVel0T3);
+	PX_TRANSPOSE_44(linVel01, linVel11, linVel21, linVel31, linVel1T0, linVel1T1, linVel1T2, linVel1T3);
+	PX_TRANSPOSE_44(angState00, angState10, angState20, angState30, angState0T0, angState0T1, angState0T2, angState0T3);
+	PX_TRANSPOSE_44(angState01, angState11, angState21, angState31, angState1T0, angState1T1, angState1T2, angState1T3);
+
+	PxU8* PX_RESTRICT currPtr = desc[0].constraint;
+	PxU8* PX_RESTRICT endPtr = desc[0].constraint + getConstraintLength(desc[0]);
+	
+
+	while(currPtr < endPtr)
+	{
+		SolverFrictionHeader4* PX_RESTRICT hdr = reinterpret_cast<SolverFrictionHeader4*>(currPtr);
+
+		currPtr = reinterpret_cast<PxU8*>(hdr + 1);
+		// Directly after the header: numNormalConstr Vec4V normal impulses, then numFrictionConstr SolverFriction4Dynamic rows.
+		Vec4V* appliedImpulses = reinterpret_cast<Vec4V*>(currPtr);
+
+		currPtr += hdr->numNormalConstr * sizeof(Vec4V);
+
+		Ps::prefetchLine(currPtr, 128);
+		Ps::prefetchLine(currPtr,256);
+		Ps::prefetchLine(currPtr,384);
+		
+		const PxU32	numFrictionConstr = hdr->numFrictionConstr;
+
+		SolverFriction4Dynamic* PX_RESTRICT frictions = reinterpret_cast<SolverFriction4Dynamic*>(currPtr);
+
+		currPtr = reinterpret_cast<PxU8*>(frictions + hdr->numFrictionConstr);
+
+		const PxU32 maxFrictionConstr = numFrictionConstr;
+	
+		const Vec4V staticFric = hdr->staticFriction;
+
+		const Vec4V invMass0D0 = hdr->invMassADom;
+		const Vec4V invMass1D1 = hdr->invMassBDom;
+
+		const Vec4V angD0 = hdr->angD0;
+		const Vec4V angD1 = hdr->angD1;
+
+		for(PxU32 i=0;i<maxFrictionConstr;i++)
+		{
+			SolverFriction4Dynamic& f = frictions[i];
+			Ps::prefetchLine((&f)+1);
+			Ps::prefetchLine((&f)+1,128);
+			Ps::prefetchLine((&f)+1,256);
+			Ps::prefetchLine((&f)+1,384);
+			// The friction limit of row i is staticFriction times the normal impulse at index (i >> frictionPerContact).
+			const Vec4V appliedImpulse = appliedImpulses[i>>hdr->frictionPerContact];
+
+			const Vec4V maxFriction =  V4Mul(staticFric, appliedImpulse);
+
+			const Vec4V nMaxFriction = V4Neg(maxFriction); 
+
+			const Vec4V normalX = f.normalX;
+			const Vec4V normalY = f.normalY;
+			const Vec4V normalZ = f.normalZ;
+
+			const Vec4V raXnX = f.raXnX;
+			const Vec4V raXnY = f.raXnY;
+			const Vec4V raXnZ = f.raXnZ;
+
+			const Vec4V rbXnX = f.rbXnX;
+			const Vec4V rbXnY = f.rbXnY;
+			const Vec4V rbXnZ = f.rbXnZ;
+
+			const Vec4V appliedForce(f.appliedForce);
+			const Vec4V velMultiplier(f.velMultiplier);
+			const Vec4V targetVel(f.targetVelocity);
+	
+			//4 x 4 Dot3 products encoded as 8 M44 transposes, 4 MulV and 8 MulAdd ops
+
+			const Vec4V __normalVel1 = V4Mul(linVel0T0, normalX);
+			const Vec4V __normalVel2 = V4Mul(raXnX, angState0T0);
+			const Vec4V __normalVel3 = V4Mul(linVel1T0, normalX);
+			const Vec4V __normalVel4 = V4Mul(rbXnX, angState1T0);
+
+			const Vec4V _normalVel1 = V4MulAdd(linVel0T1, normalY, __normalVel1);
+			const Vec4V _normalVel2 = V4MulAdd(raXnY, angState0T1, __normalVel2);
+			const Vec4V _normalVel3 = V4MulAdd(linVel1T1, normalY, __normalVel3);
+			const Vec4V _normalVel4 = V4MulAdd(rbXnY, angState1T1, __normalVel4);
+
+			const Vec4V normalVel1 = V4MulAdd(linVel0T2, normalZ, _normalVel1);
+			const Vec4V normalVel2 = V4MulAdd(raXnZ, angState0T2, _normalVel2);
+			const Vec4V normalVel3 = V4MulAdd(linVel1T2, normalZ, _normalVel3);
+			const Vec4V normalVel4 = V4MulAdd(rbXnZ, angState1T2, _normalVel4);
+
+			// Relative velocity along the friction direction: (bodyA linear + angular) minus (bodyB linear + angular).
+			const Vec4V _normalVel = V4Add(normalVel1, normalVel2);
+			const Vec4V __normalVel = V4Add(normalVel3, normalVel4);
+
+			const Vec4V normalVel = V4Sub(_normalVel, __normalVel );
+			// NOTE(review): assuming V4NegMulSub(a,b,c) = c - a*b and V4MulAdd(a,b,c) = a*b + c (usual PhysX vector-math meaning) - confirm.
+			const Vec4V tmp = V4NegMulSub(targetVel, velMultiplier, appliedForce);
+			Vec4V newAppliedForce = V4MulAdd(normalVel, velMultiplier, tmp);
+			newAppliedForce = V4Clamp(newAppliedForce,nMaxFriction,  maxFriction);
+			const Vec4V deltaF = V4Sub(newAppliedForce, appliedForce);
+
+			const Vec4V deltaLinF0 = V4Mul(invMass0D0, deltaF);
+			const Vec4V deltaLinF1 = V4Mul(invMass1D1, deltaF);
+
+			const Vec4V deltaAngF0 = V4Mul(angD0, deltaF);
+			const Vec4V deltaAngF1 = V4Mul(angD1, deltaF);
+
+			// The same impulse change is applied with opposite signs: V4MulAdd for the bodyA set, V4NegMulSub for the bodyB set.
+			linVel0T0 = V4MulAdd(normalX, deltaLinF0, linVel0T0);
+			linVel1T0 = V4NegMulSub(normalX, deltaLinF1, linVel1T0);
+			angState0T0 = V4MulAdd(raXnX, deltaAngF0, angState0T0);
+			angState1T0 = V4NegMulSub(rbXnX, deltaAngF1, angState1T0);
+
+			linVel0T1 = V4MulAdd(normalY, deltaLinF0, linVel0T1);
+			linVel1T1 = V4NegMulSub(normalY, deltaLinF1, linVel1T1);
+			angState0T1 = V4MulAdd(raXnY, deltaAngF0, angState0T1);
+			angState1T1 = V4NegMulSub(rbXnY, deltaAngF1, angState1T1);
+
+			linVel0T2 = V4MulAdd(normalZ, deltaLinF0, linVel0T2);
+			linVel1T2 = V4NegMulSub(normalZ, deltaLinF1, linVel1T2);
+			angState0T2 = V4MulAdd(raXnZ, deltaAngF0, angState0T2);
+			angState1T2 = V4NegMulSub(rbXnZ, deltaAngF1, angState1T2);
+
+			f.appliedForce = newAppliedForce;
+		}
+	}
+
+	PX_ASSERT(currPtr == endPtr);
+
+	//KS - we need to use PX_TRANSPOSE_44 here instead of the 34_43 variants because the W components are being used to 
+	//store the bodies' progress counters.
+
+	PX_TRANSPOSE_44(linVel0T0, linVel0T1, linVel0T2, linVel0T3, linVel00, linVel10, linVel20, linVel30);
+	PX_TRANSPOSE_44(linVel1T0, linVel1T1, linVel1T2, linVel1T3, linVel01, linVel11, linVel21, linVel31);
+	PX_TRANSPOSE_44(angState0T0, angState0T1, angState0T2, angState0T3, angState00, angState10, angState20, angState30);
+	PX_TRANSPOSE_44(angState1T0, angState1T1, angState1T2, angState1T3, angState01, angState11, angState21, angState31);
+
+
+	// Write back
+	// (stores through all eight body references: bodyA and bodyB of each of the four constraints)
+	V4StoreA(linVel00, &b00.linearVelocity.x);
+	V4StoreA(linVel10, &b10.linearVelocity.x);
+	V4StoreA(linVel20, &b20.linearVelocity.x);
+	V4StoreA(linVel30, &b30.linearVelocity.x);
+
+	V4StoreA(linVel01, &b01.linearVelocity.x);
+	V4StoreA(linVel11, &b11.linearVelocity.x);
+	V4StoreA(linVel21, &b21.linearVelocity.x);
+	V4StoreA(linVel31, &b31.linearVelocity.x);
+
+	V4StoreA(angState00, &b00.angularState.x);
+	V4StoreA(angState10, &b10.angularState.x);
+	V4StoreA(angState20, &b20.angularState.x);
+	V4StoreA(angState30, &b30.angularState.x);
+
+	V4StoreA(angState01, &b01.angularState.x);
+	V4StoreA(angState11, &b11.angularState.x);
+	V4StoreA(angState21, &b21.angularState.x);
+	V4StoreA(angState31, &b31.angularState.x);
+
+}
+// Velocity-solves the friction rows of a batch of four constraints (desc[0..3]) where only bodyA of each constraint
+// is read and updated; one constraint per Vec4V lane.
+static void solveFriction4_StaticBlock(const PxSolverConstraintDesc* PX_RESTRICT desc, SolverContext& /*cache*/)
+{
+	// Only bodyA of each constraint participates; no bodyB state is loaded or stored.
+	PxSolverBody& b00 = *desc[0].bodyA;
+	PxSolverBody& b10 = *desc[1].bodyA;
+	PxSolverBody& b20 = *desc[2].bodyA;
+	PxSolverBody& b30 = *desc[3].bodyA;
+
+
+	Vec4V linVel00 = V4LoadA(&b00.linearVelocity.x);
+	Vec4V angState00 = V4LoadA(&b00.angularState.x);
+
+	Vec4V linVel10 = V4LoadA(&b10.linearVelocity.x);
+	Vec4V angState10 = V4LoadA(&b10.angularState.x);
+
+	Vec4V linVel20 = V4LoadA(&b20.linearVelocity.x);
+	Vec4V angState20 = V4LoadA(&b20.angularState.x);
+
+	Vec4V linVel30 = V4LoadA(&b30.linearVelocity.x);
+	Vec4V angState30 = V4LoadA(&b30.angularState.x);
+
+	// Destinations of the transposes below; the T3 vectors are only carried through to the final transpose back.
+	Vec4V linVel0T0, linVel0T1, linVel0T2, linVel0T3;
+	Vec4V angState0T0, angState0T1, angState0T2, angState0T3;
+
+
+	PX_TRANSPOSE_44(linVel00, linVel10, linVel20, linVel30, linVel0T0, linVel0T1, linVel0T2, linVel0T3);
+	PX_TRANSPOSE_44(angState00, angState10, angState20, angState30, angState0T0, angState0T1, angState0T2, angState0T3);
+
+	PxU8* PX_RESTRICT currPtr = desc[0].constraint;
+	PxU8* PX_RESTRICT endPtr = desc[0].constraint + getConstraintLength(desc[0]);
+	
+
+	while(currPtr < endPtr)
+	{
+		SolverFrictionHeader4* PX_RESTRICT hdr = reinterpret_cast<SolverFrictionHeader4*>(currPtr);
+
+		currPtr = reinterpret_cast<PxU8*>(hdr + 1);
+		// Directly after the header: numNormalConstr Vec4V normal impulses, then numFrictionConstr SolverFriction4Base rows.
+		Vec4V* appliedImpulses = reinterpret_cast<Vec4V*>(currPtr);
+
+		currPtr += hdr->numNormalConstr * sizeof(Vec4V);
+
+		Ps::prefetchLine(currPtr, 128);
+		Ps::prefetchLine(currPtr,256);
+		Ps::prefetchLine(currPtr,384);
+		
+		const PxU32	numFrictionConstr = hdr->numFrictionConstr;
+
+		SolverFriction4Base* PX_RESTRICT frictions = reinterpret_cast<SolverFriction4Base*>(currPtr);
+
+		currPtr = reinterpret_cast<PxU8*>(frictions + hdr->numFrictionConstr);
+
+		const PxU32 maxFrictionConstr = numFrictionConstr;
+	
+		const Vec4V staticFric = hdr->staticFriction;
+
+		const Vec4V invMass0D0 = hdr->invMassADom;
+		const Vec4V angD0 = hdr->angD0;
+
+		for(PxU32 i=0;i<maxFrictionConstr;i++)
+		{
+			SolverFriction4Base& f = frictions[i];
+			Ps::prefetchLine((&f)+1);
+			Ps::prefetchLine((&f)+1,128);
+			Ps::prefetchLine((&f)+1,256);
+			// The friction limit of row i is staticFriction times the normal impulse at index (i >> frictionPerContact).
+			const Vec4V appliedImpulse = appliedImpulses[i>>hdr->frictionPerContact];
+
+			const Vec4V maxFriction =  V4Mul(staticFric, appliedImpulse);
+
+			const Vec4V nMaxFriction = V4Neg(maxFriction); 
+
+			const Vec4V normalX = f.normalX;
+			const Vec4V normalY = f.normalY;
+			const Vec4V normalZ = f.normalZ;
+
+			const Vec4V raXnX = f.raXnX;
+			const Vec4V raXnY = f.raXnY;
+			const Vec4V raXnZ = f.raXnZ;
+
+			const Vec4V appliedForce(f.appliedForce);
+			const Vec4V velMultiplier(f.velMultiplier);
+			const Vec4V targetVel(f.targetVelocity);
+	
+			//Two Dot3 products per lane (linear and angular), written as 2 V4Mul and 4 V4MulAdd ops on the transposed state
+
+			const Vec4V __normalVel1 = V4Mul(linVel0T0, normalX);
+			const Vec4V __normalVel2 = V4Mul(raXnX, angState0T0);
+
+			const Vec4V _normalVel1 = V4MulAdd(linVel0T1, normalY, __normalVel1);
+			const Vec4V _normalVel2 = V4MulAdd(raXnY, angState0T1, __normalVel2);
+
+			const Vec4V normalVel1 = V4MulAdd(linVel0T2, normalZ, _normalVel1);
+			const Vec4V normalVel2 = V4MulAdd(raXnZ, angState0T2, _normalVel2);
+			// delLinVel00/10/20: friction direction (x/y/z) scaled by invMass0D0; multiplied by deltaF below to update the linear velocity.
+			const Vec4V delLinVel00 = V4Mul(normalX, invMass0D0);
+
+			const Vec4V delLinVel10 = V4Mul(normalY, invMass0D0);
+
+			const Vec4V normalVel = V4Add(normalVel1, normalVel2);
+
+			const Vec4V delLinVel20 = V4Mul(normalZ, invMass0D0);
+			// NOTE(review): assuming V4NegMulSub(a,b,c) = c - a*b and V4MulAdd(a,b,c) = a*b + c (usual PhysX vector-math meaning) - confirm.
+			const Vec4V tmp = V4NegMulSub(targetVel, velMultiplier, appliedForce);
+
+			Vec4V newAppliedForce = V4MulAdd(normalVel, velMultiplier, tmp);
+			newAppliedForce = V4Clamp(newAppliedForce,nMaxFriction,  maxFriction);
+			const Vec4V deltaF = V4Sub(newAppliedForce, appliedForce);
+
+			const Vec4V deltaAngF0 = V4Mul(angD0, deltaF);
+
+			linVel0T0 = V4MulAdd(delLinVel00, deltaF, linVel0T0);
+			angState0T0 = V4MulAdd(raXnX, deltaAngF0, angState0T0);
+
+			linVel0T1 = V4MulAdd(delLinVel10, deltaF, linVel0T1);
+			angState0T1 = V4MulAdd(raXnY, deltaAngF0, angState0T1);
+
+			linVel0T2 = V4MulAdd(delLinVel20, deltaF, linVel0T2);
+			angState0T2 = V4MulAdd(raXnZ, deltaAngF0, angState0T2);
+
+			f.appliedForce = newAppliedForce;
+		}
+	}
+
+	PX_ASSERT(currPtr == endPtr);
+
+	//KS - we need to use PX_TRANSPOSE_44 here instead of the 34_43 variants because the W components are being used to 
+	//store the bodies' progress counters.
+
+	PX_TRANSPOSE_44(linVel0T0, linVel0T1, linVel0T2, linVel0T3, linVel00, linVel10, linVel20, linVel30);
+	PX_TRANSPOSE_44(angState0T0, angState0T1, angState0T2, angState0T3, angState00, angState10, angState20, angState30);
+
+	// Write back
+	// (only bodyA of each constraint is stored)
+	V4StoreA(linVel00, &b00.linearVelocity.x);
+	V4StoreA(linVel10, &b10.linearVelocity.x);
+	V4StoreA(linVel20, &b20.linearVelocity.x);
+	V4StoreA(linVel30, &b30.linearVelocity.x);
+
+	V4StoreA(angState00, &b00.angularState.x);
+	V4StoreA(angState10, &b10.angularState.x);
+	V4StoreA(angState20, &b20.angularState.x);
+	V4StoreA(angState30, &b30.angularState.x);
+}
+
+// Walks every contact of a block-of-four Coulomb contact constraint and clamps each contact's scaledBias
+// to be non-negative (per lane). Only desc[0].constraint is read; the cache argument is unused.
+static void concludeContactCoulomb4(const PxSolverConstraintDesc* desc, SolverContext& /*cache*/)
+{
+	PxU8* PX_RESTRICT cursor = desc[0].constraint;
+
+	// The leading header supplies both the end of the contact data and the per-contact stride.
+	const SolverContactCoulombHeader4* PX_RESTRICT leadHeader = reinterpret_cast<const SolverContactCoulombHeader4*>(cursor);
+	PxU8* PX_RESTRICT contactEnd = desc[0].constraint + leadHeader->frictionOffset;
+
+	PxU32 contactStride;
+	if(leadHeader->type == DY_SC_TYPE_BLOCK_RB_CONTACT)
+		contactStride = sizeof(SolverContact4Dynamic);
+	else
+		contactStride = sizeof(SolverContact4Base);
+
+	const Vec4V lowerBound = V4Zero();
+
+	while(cursor < contactEnd)
+	{
+		// Each patch is one header followed by numNormalConstr contacts of contactStride bytes.
+		const SolverContactCoulombHeader4* PX_RESTRICT header = reinterpret_cast<const SolverContactCoulombHeader4*>(cursor);
+		cursor += sizeof(SolverContactCoulombHeader4);
+
+		const PxU32 contactCount = header->numNormalConstr;
+
+		Ps::prefetchLine(cursor,128);
+		Ps::prefetchLine(cursor,256);
+		Ps::prefetchLine(cursor,384);
+
+		for(PxU32 remaining = contactCount; remaining != 0; --remaining)
+		{
+			SolverContact4Base* contact = reinterpret_cast<SolverContact4Base*>(cursor);
+			cursor += contactStride;
+			contact->scaledBias = V4Max(contact->scaledBias, lowerBound);
+		}
+	}
+
+	// The walk must land exactly on the end of the contact data.
+	PX_ASSERT(cursor == contactEnd);
+}
+
+// Writes the applied normal forces of a block-of-four Coulomb contact constraint to the per-constraint
+// writeBack buffers (when present), and appends a ThresholdStreamElement to the cache's threshold stream
+// for every lane that has force thresholds enabled, is not linked (NO_LINK on both sides), accumulated a
+// non-zero normal force and has at least one body with reportThreshold below PX_MAX_REAL.
+//   desc  - the four constraint descriptors; all share desc[0].constraint.
+//   cache - receives the threshold stream elements.
+//   bd0   - body data of bodyA for each of the four lanes.
+//   bd1   - body data of bodyB for each of the four lanes.
+void  writeBackContactCoulomb4(const PxSolverConstraintDesc* desc, SolverContext& cache,
+					  const PxSolverBodyData** PX_RESTRICT bd0, const PxSolverBodyData** PX_RESTRICT bd1)
+{
+	// One output cursor per lane; a null cursor disables per-contact force write-back for that lane.
+	PxReal* PX_RESTRICT forceOut0 = reinterpret_cast<PxReal*>(desc[0].writeBack);
+	PxReal* PX_RESTRICT forceOut1 = reinterpret_cast<PxReal*>(desc[1].writeBack);
+	PxReal* PX_RESTRICT forceOut2 = reinterpret_cast<PxReal*>(desc[2].writeBack);
+	PxReal* PX_RESTRICT forceOut3 = reinterpret_cast<PxReal*>(desc[3].writeBack);
+
+	PxU8* PX_RESTRICT cursor = desc[0].constraint;
+	const SolverContactCoulombHeader4* PX_RESTRICT leadHeader = reinterpret_cast<const SolverContactCoulombHeader4*>(cursor);
+	PxU8* PX_RESTRICT contactEnd = desc[0].constraint + leadHeader->frictionOffset;
+
+	const PxU32 contactStride = leadHeader->type == DY_SC_TYPE_BLOCK_RB_CONTACT
+									? sizeof(SolverContact4Dynamic)
+									: sizeof(SolverContact4Base);
+
+	bool reportLane[4] = {false, false, false, false};
+	Vec4V forceSum = V4Zero();
+
+	while(cursor < contactEnd)
+	{
+		const SolverContactCoulombHeader4* PX_RESTRICT header = reinterpret_cast<const SolverContactCoulombHeader4*>(cursor);
+		cursor += sizeof(SolverContactCoulombHeader4);
+
+		// The flags of the last header visited are the ones used after the loop.
+		for(PxU32 lane = 0; lane < 4; ++lane)
+			reportLane[lane] = (header->flags[lane] & SolverContactHeader::eHAS_FORCE_THRESHOLDS) != 0;
+
+		const PxU32 contactCount = header->numNormalConstr;
+
+		Ps::prefetchLine(cursor, 256);
+		Ps::prefetchLine(cursor, 384);
+
+		for(PxU32 i = 0; i < contactCount; i++)
+		{
+			SolverContact4Base* contact = reinterpret_cast<SolverContact4Base*>(cursor);
+			cursor += contactStride;
+
+			const Vec4V laneForces = contact->appliedForce;
+
+			// A lane only writes contact i if that lane really owns at least i+1 contacts in this patch.
+			if(forceOut0 && i < header->numNormalConstr0)
+				FStore(V4GetX(laneForces), forceOut0++);
+			if(forceOut1 && i < header->numNormalConstr1)
+				FStore(V4GetY(laneForces), forceOut1++);
+			if(forceOut2 && i < header->numNormalConstr2)
+				FStore(V4GetZ(laneForces), forceOut2++);
+			if(forceOut3 && i < header->numNormalConstr3)
+				FStore(V4GetW(laneForces), forceOut3++);
+
+			forceSum = V4Add(forceSum, laneForces);
+		}
+	}
+	PX_ASSERT(cursor == contactEnd);
+
+	PX_ALIGN(16, PxReal totalForce[4]);
+	V4StoreA(forceSum, totalForce);
+
+	//all constraint pointer in descs are the same constraint
+	Sc::ShapeInteraction** shapeInteractions = reinterpret_cast<SolverContactCoulombHeader4*>(desc[0].constraint)->shapeInteraction;
+
+	for(PxU32 lane = 0; lane < 4; ++lane)
+	{
+		if(!reportLane[lane])
+			continue;
+		if(desc[lane].linkIndexA != PxSolverConstraintDesc::NO_LINK || desc[lane].linkIndexB != PxSolverConstraintDesc::NO_LINK)
+			continue;
+		if(totalForce[lane] == 0.f)
+			continue;
+		if(!(bd0[lane]->reportThreshold < PX_MAX_REAL || bd1[lane]->reportThreshold < PX_MAX_REAL))
+			continue;
+
+		ThresholdStreamElement elt;
+		elt.normalForce = totalForce[lane];
+		elt.threshold = PxMin<float>(bd0[lane]->reportThreshold, bd1[lane]->reportThreshold);
+		elt.nodeIndexA = bd0[lane]->nodeIndex;
+		elt.nodeIndexB = bd1[lane]->nodeIndex;
+		elt.shapeInteraction = shapeInteractions[lane];
+		Ps::order(elt.nodeIndexA, elt.nodeIndexB);
+		PX_ASSERT(elt.nodeIndexA < elt.nodeIndexB);
+		PX_ASSERT(cache.mThresholdStreamIndex<cache.mThresholdStreamLength);
+		cache.mThresholdStream[cache.mThresholdStreamIndex++] = elt;
+	}
+}
+// Block-of-four contact solve entry point: forwards to solveContactCoulomb4_Block; the constraint count is unused.
+void solveContactCoulombPreBlock(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 /*constraintCount*/, SolverContext& cache)
+{
+	solveContactCoulomb4_Block(desc, cache);
+}
+// Block-of-four contact solve entry point for the bodyA-only variant: forwards to solveContactCoulomb4_StaticBlock; the constraint count is unused.
+void solveContactCoulombPreBlock_Static(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32  /*constraintCount*/, SolverContext& cache)
+{
+	solveContactCoulomb4_StaticBlock(desc, cache);
+}
+// Runs solveContactCoulomb4_Block and then concludeContactCoulomb4 (which clamps every contact's scaledBias to >= 0); the constraint count is unused.
+void solveContactCoulombPreBlock_Conclude(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32  /*constraintCount*/, SolverContext& cache)
+{
+	solveContactCoulomb4_Block(desc, cache);
+	concludeContactCoulomb4(desc, cache);
+}
+// Runs solveContactCoulomb4_StaticBlock and then concludeContactCoulomb4 (which clamps every contact's scaledBias to >= 0); the constraint count is unused.
+void solveContactCoulombPreBlock_ConcludeStatic(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32  /*constraintCount*/, SolverContext& cache)
+{
+	solveContactCoulomb4_StaticBlock(desc, cache);
+	concludeContactCoulomb4(desc, cache);
+}
+
+// Solves a block of four contact constraints with solveContactCoulomb4_Block, writes the resulting forces
+// back via writeBackContactCoulomb4 and, once fewer than four free slots remain in the cache's local
+// threshold stream, copies that stream into the shared threshold stream and resets the local index.
+// The constraint count is unused.
+void solveContactCoulombPreBlock_WriteBack(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32  /*constraintCount*/, SolverContext& cache)
+{
+	solveContactCoulomb4_Block(desc, cache);
+
+	// Gather the body data of bodyA / bodyB for each of the four constraints.
+	const PxSolverBodyData* bodyDataA[4];
+	const PxSolverBodyData* bodyDataB[4];
+	for(PxU32 lane = 0; lane < 4; ++lane)
+	{
+		bodyDataA[lane] = &cache.solverBodyArray[desc[lane].bodyADataIndex];
+		bodyDataB[lane] = &cache.solverBodyArray[desc[lane].bodyBDataIndex];
+	}
+
+	writeBackContactCoulomb4(desc, cache, bodyDataA, bodyDataB);
+
+	// Nothing to flush while the local stream can still take another full block of four elements.
+	if(cache.mThresholdStreamIndex <= (cache.mThresholdStreamLength - 4))
+		return;
+
+	//Write back to global buffer
+	// atomicAdd's result minus the count just added is used as the first shared slot to write.
+	const PxI32 pendingCount = PxI32(cache.mThresholdStreamIndex);
+	const PxI32 sharedBase = physx::shdfnd::atomicAdd(cache.mSharedOutThresholdPairs, pendingCount) - pendingCount;
+	for(PxU32 a = 0; a < cache.mThresholdStreamIndex; ++a)
+		cache.mSharedThresholdStream[a + sharedBase] = cache.mThresholdStream[a];
+
+	cache.mThresholdStreamIndex = 0;
+}
+
+// Solves a block of four contact constraints with solveContactCoulomb4_StaticBlock, writes the resulting
+// forces back via writeBackContactCoulomb4 and, once fewer than four free slots remain in the cache's local
+// threshold stream, copies that stream into the shared threshold stream and resets the local index.
+// The constraint count is unused.
+void solveContactCoulombPreBlock_WriteBackStatic(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 /*constraintCount*/, SolverContext& cache)
+{
+	solveContactCoulomb4_StaticBlock(desc, cache);
+
+	// Gather the body data of bodyA / bodyB for each of the four constraints.
+	const PxSolverBodyData* bodyDataA[4];
+	const PxSolverBodyData* bodyDataB[4];
+	for(PxU32 lane = 0; lane < 4; ++lane)
+	{
+		bodyDataA[lane] = &cache.solverBodyArray[desc[lane].bodyADataIndex];
+		bodyDataB[lane] = &cache.solverBodyArray[desc[lane].bodyBDataIndex];
+	}
+
+	writeBackContactCoulomb4(desc, cache, bodyDataA, bodyDataB);
+
+	// Nothing to flush while the local stream can still take another full block of four elements.
+	if(cache.mThresholdStreamIndex <= (cache.mThresholdStreamLength - 4))
+		return;
+
+	//Write back to global buffer
+	// atomicAdd's result minus the count just added is used as the first shared slot to write.
+	const PxI32 pendingCount = PxI32(cache.mThresholdStreamIndex);
+	const PxI32 sharedBase = physx::shdfnd::atomicAdd(cache.mSharedOutThresholdPairs, pendingCount) - pendingCount;
+	for(PxU32 a = 0; a < cache.mThresholdStreamIndex; ++a)
+		cache.mSharedThresholdStream[a + sharedBase] = cache.mThresholdStream[a];
+
+	cache.mThresholdStreamIndex = 0;
+}
+// Block-of-four friction solve entry point: forwards to solveFriction4_Block; the constraint count is unused.
+void solveFrictionCoulombPreBlock(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32  /*constraintCount*/, SolverContext& cache)
+{
+	solveFriction4_Block(desc, cache);
+}
+// Block-of-four friction solve entry point for the bodyA-only variant: forwards to solveFriction4_StaticBlock; the constraint count is unused.
+void solveFrictionCoulombPreBlock_Static(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32  /*constraintCount*/, SolverContext& cache)
+{
+	solveFriction4_StaticBlock(desc, cache);
+}
+// "Conclude" variant of the friction solve: it only calls solveFriction4_Block, i.e. no extra conclude step is performed for friction rows.
+void solveFrictionCoulombPreBlock_Conclude(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32  /*constraintCount*/, SolverContext& cache)
+{
+	solveFriction4_Block(desc, cache);
+}
+// "Conclude" variant of the bodyA-only friction solve: it only calls solveFriction4_StaticBlock, i.e. no extra conclude step is performed.
+void solveFrictionCoulombPreBlock_ConcludeStatic(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32  /*constraintCount*/, SolverContext& cache)
+{
+	solveFriction4_StaticBlock(desc, cache);
+}
+// "WriteBack" variant of the friction solve: it only calls solveFriction4_Block, i.e. nothing is written back for friction rows here.
+void solveFrictionCoulombPreBlock_WriteBack(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32  /*constraintCount*/, SolverContext& cache)
+{
+	solveFriction4_Block(desc, cache);
+}
+// "WriteBack" variant of the bodyA-only friction solve: it only calls solveFriction4_StaticBlock, i.e. nothing is written back here.
+void solveFrictionCoulombPreBlock_WriteBackStatic(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32  /*constraintCount*/, SolverContext& cache)
+{
+	solveFriction4_StaticBlock(desc, cache);
+}
+
+
+}
+
+}
+
diff --git a/PhysX_3.4/Source/LowLevelDynamics/src/DySpatial.h b/PhysX_3.4/Source/LowLevelDynamics/src/DySpatial.h
new file mode 100644
index 00000000..e27406b3
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelDynamics/src/DySpatial.h
@@ -0,0 +1,142 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+#ifndef DY_SPATIAL_H
+#define DY_SPATIAL_H
+
+#include "foundation/PxVec3.h"
+#include "foundation/PxTransform.h"
+#include "PsMathUtils.h"
+#include "CmSpatialVector.h"
+
+namespace physx
+{
+namespace Dy
+{
+// translate a motion resolved at position p to the origin
+
+
+// should have a 'from' frame and a 'to' frame
+// Spatial inertia held as three 3x3 blocks: mLL, mLA and mAA. The remaining (angular-from-linear) block
+// is never stored; where it is needed, the transpose of mLA is used in its place.
+class SpInertia
+{
+public:
+	// Default constructor: performs no explicit initialisation of the blocks.
+	SpInertia() {}
+
+	// Builds the inertia from its three blocks (linear-linear, linear-angular, angular-angular).
+	SpInertia(const PxMat33& ll, const PxMat33& la, const PxMat33& aa): mLL(ll), mLA(la), mAA(aa)
+	{
+	}
+
+	// Returns an inertia whose three blocks are all zero.
+	static SpInertia getZero()
+	{
+		return SpInertia(PxMat33(PxZero), PxMat33(PxZero), 
+							     PxMat33(PxZero));
+	}
+
+	// Builds the blocks from outer products of the two spatial vectors:
+	// LL = column.linear * row.linear^T, LA = column.linear * row.angular^T, AA = column.angular * row.angular^T.
+	static SpInertia dyad(const Cm::SpatialVector& column, const Cm::SpatialVector& row) 
+	{
+		return SpInertia(dyad(column.linear, row.linear),  
+						 dyad(column.linear, row.angular),  
+					     dyad(column.angular, row.angular));
+	}
+
+
+	// Builds a spatial inertia with LL = mass * identity, LA = 0 and AA = diag(inertia).
+	static SpInertia inertia(PxReal mass, const PxVec3& inertia)
+	{
+		return SpInertia(PxMat33::createDiagonal(PxVec3(mass,mass,mass)), PxMat33(PxZero),			 
+							     PxMat33::createDiagonal(inertia));
+	}
+
+
+	// Block-wise sum.
+	SpInertia operator+(const SpInertia& m) const
+	{
+		return SpInertia(mLL+m.mLL, mLA+m.mLA, mAA+m.mAA);
+	}
+
+	// Block-wise difference.
+	SpInertia operator-(const SpInertia& m) const
+	{
+		return SpInertia(mLL-m.mLL, mLA-m.mLA, mAA-m.mAA);
+	}
+
+	// Scales every block by r.
+	SpInertia operator*(PxReal r) const
+	{
+		return SpInertia(mLL*r, mLA*r, mAA*r);
+	}
+
+	// In-place block-wise sum.
+	void operator+=(const SpInertia& m)
+	{
+		mLL+=m.mLL; 
+		mLA+=m.mLA;		
+		mAA+=m.mAA;
+	}
+
+	// In-place block-wise difference.
+	void operator-=(const SpInertia& m)
+	{
+		mLL-=m.mLL; 
+		mLA-=m.mLA;		
+		mAA-=m.mAA;
+	}
+
+
+	// Applies the inertia to a spatial vector:
+	// linear part = mLL*v.linear + mLA*v.angular, angular part = mLA^T*v.linear + mAA*v.angular.
+	PX_FORCE_INLINE Cm::SpatialVector operator *(const Cm::SpatialVector& v) const
+	{
+		return Cm::SpatialVector(mLL*v.linear            +mLA*v.angular,
+		 					    mLA.transformTranspose(v.linear)+mAA*v.angular);
+	}
+
+	// Product with another SpInertia; only the LL, LA and AA blocks of the product are computed and kept.
+	// NOTE(review): the AL block of the product is not stored, so the result relies on the caller only
+	// needing these three blocks - confirm against callers.
+	SpInertia operator *(const SpInertia& v) const
+	{
+		return SpInertia(mLL*v.mLL             + mLA * v.mLA.getTranspose(), 
+						 mLL*v.mLA             + mLA * v.mAA,
+						 mLA.getTranspose()*v.mLA + mAA * v.mAA);
+	}
+
+
+	// Returns true only when every element of all three blocks is finite.
+	bool isFinite() const
+	{
+		return isFiniteMat(mLL) && isFiniteMat(mLA) && isFiniteMat(mAA);
+	}
+
+	PxMat33 mLL, mLA;		// mLL: linear force from linear motion, mLA: linear force from angular motion
+	PxMat33 mAA;		    // angular force from angular motion, mAL = mLA.transpose()
+
+private:
+	// Outer product col * row^T, built column by column.
+	static PxMat33 dyad(const PxVec3& col, const PxVec3& row)
+	{ 
+		return PxMat33(col*row.x, col*row.y, col*row.z); 
+	}
+
+	// True when all three columns of m contain only finite values.
+	static bool isFiniteMat(const PxMat33& m)
+	{
+		return m.column0.isFinite() && m.column1.isFinite() && m.column2.isFinite();
+	}
+
+};
+
+}
+}
+
+#endif //DY_SPATIAL_H
diff --git a/PhysX_3.4/Source/LowLevelDynamics/src/DyThreadContext.cpp b/PhysX_3.4/Source/LowLevelDynamics/src/DyThreadContext.cpp
new file mode 100644
index 00000000..5526b83a
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelDynamics/src/DyThreadContext.cpp
@@ -0,0 +1,110 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+#include "DyThreadContext.h"
+#include "PsBitUtils.h"
+
+namespace physx
+{
+namespace Dy
+{
+
+ThreadContext::ThreadContext(PxcNpMemBlockPool* memBlockPool):
+	mFrictionPatchStreamPair(*memBlockPool),
+	mConstraintBlockManager	(*memBlockPool),
+	mConstraintBlockStream	(*memBlockPool),
+	mNumDifferentBodyConstraints(0), mNumDifferentBodyFrictionConstraints(0),
+	mNumSelfConstraints(0), mNumSelfFrictionConstraints(0),
+	mNumSelfConstraintBlocks(0), mNumSelfConstraintFrictionBlocks(0),
+	mConstraintsPerPartition(PX_DEBUG_EXP("ThreadContext::mConstraintsPerPartition")),
+	mFrictionConstraintsPerPartition(PX_DEBUG_EXP("ThreadContext::frictionsConstraintsPerPartition")),
+	mPartitionNormalizationBitmap(PX_DEBUG_EXP("ThreadContext::mPartitionNormalizationBitmap")),
+	// Raw pointers and counters start out NULL/0 so reset() and resizeArrays() never copy an indeterminate contactConstraintDescArray.
+	mBodyCoreArray(NULL), mRigidBodyArray(NULL), mArticulationArray(NULL),
+	motionVelocityArray(NULL), bodyRemapTable(NULL), mNodeIndexArray(NULL),
+	contactConstraintDescArray(NULL), contactDescArraySize(0), orderedContactConstraints(NULL),
+	contactConstraintBatchHeaders(NULL), numContactConstraintBatches(0), tempConstraintDescArray(NULL),
+	frictionConstraintDescArray(PX_DEBUG_EXP("ThreadContext::solverFrictionConstraintArray")),
+	frictionConstraintBatchHeaders(PX_DEBUG_EXP("ThreadContext::frictionConstraintBatchHeaders")),
+	compoundConstraints(PX_DEBUG_EXP("ThreadContext::compoundConstraints")),
+	orderedContactList(PX_DEBUG_EXP("ThreadContext::orderedContactList")),
+	tempContactList(PX_DEBUG_EXP("ThreadContext::tempContactList")),
+	sortIndexArray(PX_DEBUG_EXP("ThreadContext::sortIndexArray")),
+	numDifferentBodyBatchHeaders(0), numSelfConstraintBatchHeaders(0), mOrderedContactDescCount(0), mOrderedFrictionDescCount(0),
+	mConstraintSize(0), mAxisConstraintCount(0), mSelfConstraintBlocks(NULL), mSelfConstraintFrictionBlocks(NULL),
+	mMaxPartitions(0), mMaxFrictionPartitions(0), mMaxSolverPositionIterations(0), mMaxSolverVelocityIterations(0),
+	mMaxArticulationLength(0), mMaxArticulationSolverLength(0),
+	mContactDescPtr(NULL), mStartContactDescPtr(NULL), mFrictionDescPtr(NULL),
+	mArticulations(PX_DEBUG_EXP("ThreadContext::articulations"))
+{
+#if PX_ENABLE_SIM_STATS
+	mThreadSimStats.clear();
+#endif
+	//Defaulted to have space for 16384 bodies (512 PxU32 words of 32 bits each)
+	mPartitionNormalizationBitmap.reserve(512); 
+	//Defaulted to have space for 128 partitions (should be more-than-enough)
+	mConstraintsPerPartition.reserve(128);
+}
+
+void ThreadContext::resizeArrays(PxU32 frictionConstraintDescCount, PxU32 articulationCount)
+{
+	// Sizes are forced rather than resize()d: resize shrinks smaller arrays to the exact target size, which can generate a lot of churn.
+	const PxU32 frictionCapacity = (frictionConstraintDescCount+63)&~63;	// rounded up to a multiple of 64
+	frictionConstraintDescArray.forceSize_Unsafe(0);
+	frictionConstraintDescArray.reserve(frictionCapacity);
+	const PxU32 articulationCapacity = PxMax<PxU32>(Ps::nextPowerOfTwo(articulationCount), 16);	// never fewer than 16 entries
+	mArticulations.forceSize_Unsafe(0);
+	mArticulations.reserve(articulationCapacity);
+	mArticulations.forceSize_Unsafe(articulationCount);	// size set directly; presumably callers fill the entries - TODO confirm
+	mFrictionDescPtr = frictionConstraintDescArray.begin();	// taken after reserve() has run
+	mContactDescPtr = contactConstraintDescArray;
+}
+
+void ThreadContext::reset()
+{
+	// TODO: move these to the PxcNpThreadContext
+	mFrictionPatchStreamPair.reset();
+	mConstraintBlockStream.reset();
+
+	// Re-seat the descriptor pointers on contactConstraintDescArray and the beginning of frictionConstraintDescArray.
+	mFrictionDescPtr = frictionConstraintDescArray.begin();
+	mContactDescPtr = contactConstraintDescArray;
+
+	// Zero the constraint counters and the solver iteration maxima.
+	mNumDifferentBodyConstraints = mNumSelfConstraints = mNumSelfConstraintBlocks = 0;
+	mMaxSolverPositionIterations = mMaxSolverVelocityIterations = 0;
+	mAxisConstraintCount = mConstraintSize = 0;
+
+	// Forget the previous self-constraint block pointer.
+	mSelfConstraintBlocks = NULL;
+}
+
+}
+} 
diff --git a/PhysX_3.4/Source/LowLevelDynamics/src/DyThreadContext.h b/PhysX_3.4/Source/LowLevelDynamics/src/DyThreadContext.h
new file mode 100644
index 00000000..a958ac23
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelDynamics/src/DyThreadContext.h
@@ -0,0 +1,203 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+#ifndef DY_THREADCONTEXT_H
+#define DY_THREADCONTEXT_H
+
+#include "foundation/PxTransform.h"
+#include "PxvConfig.h"
+#include "CmBitMap.h"
+#include "CmMatrix34.h"
+#include "PxcThreadCoherentCache.h"
+#include "DyThresholdTable.h"
+#include "PsAllocator.h"
+#include "PsAllocator.h"
+#include "GuContactBuffer.h"
+#include "DySolverConstraintDesc.h"
+#include "PxvDynamics.h"
+#include "DyArticulation.h"
+#include "DyFrictionPatchStreamPair.h"
+#include "PxcConstraintBlockStream.h"
+#include "DyCorrelationBuffer.h"
+
+namespace physx
+{
+struct PxsIndexedContactManager;
+
+namespace Dy
+{
+
+/*!
+Cache of information specific to the software implementation (not common code).
+
+See PxcgetThreadContext.
+
+Not thread-safe, so remember to have one object per thread!
+
+TODO: refactor and rename this (it is a general per-thread cache). Move the transform cache into its own class.
+*/
+class ThreadContext : 
+	public PxcThreadCoherentCache<ThreadContext, PxcNpMemBlockPool>::EntryBase
+{
+	PX_NOCOPY(ThreadContext)
+public:
+	// Per-thread working data: constraint descriptor arrays, partition bookkeeping and articulation descriptors.
+#if PX_ENABLE_SIM_STATS
+	struct ThreadSimStats
+	{
+		void clear()
+		{
+			// Zero every counter.
+			numActiveConstraints = 0;
+			numActiveDynamicBodies = 0;
+			numActiveKinematicBodies = 0;
+			numAxisSolverConstraints = 0;
+
+		}
+
+		PxU32 numActiveConstraints;
+		PxU32 numActiveDynamicBodies;
+		PxU32 numActiveKinematicBodies;
+		PxU32 numAxisSolverConstraints;
+
+	};
+#endif
+
+	//TODO: tune cache size based on number of active objects.
+	ThreadContext(PxcNpMemBlockPool* memBlockPool);
+	void reset();	// resets the patch/block streams, re-seats the descriptor pointers and zeroes the counters
+	void resizeArrays(PxU32 frictionConstraintDescCount, PxU32 articulationCount);	// reserves the friction descriptor and articulation arrays
+
+	PX_FORCE_INLINE	Ps::Array<ArticulationSolverDesc>&		getArticulations()								{ return mArticulations;					}
+
+
+#if PX_ENABLE_SIM_STATS
+	PX_FORCE_INLINE ThreadSimStats& getSimStats()
+	{
+		return mThreadSimStats;
+	}
+#endif
+
+	Gu::ContactBuffer mContactBuffer;
+
+		// temporary buffer for correlation
+	PX_ALIGN(16, CorrelationBuffer			mCorrelationBuffer); 
+
+	FrictionPatchStreamPair		mFrictionPatchStreamPair;	// patch streams
+
+	PxsConstraintBlockManager		mConstraintBlockManager;	// for when this thread context is "lead" on an island
+	PxcConstraintBlockStream 		mConstraintBlockStream;		// constraint block pool
+
+
+	// this stuff is just used for reformatting the solver data. Hopefully we should have a more
+	// sane format for this when the dust settles - so it's just temporary. If we keep this around
+	// here we should move these from public to private
+
+	PxU32 mNumDifferentBodyConstraints;
+	PxU32 mNumDifferentBodyFrictionConstraints;
+	PxU32 mNumSelfConstraints;
+	PxU32 mNumSelfFrictionConstraints;
+	PxU32 mNumSelfConstraintBlocks;
+	PxU32 mNumSelfConstraintFrictionBlocks;
+
+	Ps::Array<PxU32>					mConstraintsPerPartition;
+	Ps::Array<PxU32>					mFrictionConstraintsPerPartition;
+	Ps::Array<PxU32>					mPartitionNormalizationBitmap;
+	PxsBodyCore**						mBodyCoreArray;
+	PxsRigidBody**						mRigidBodyArray;
+	Articulation**						mArticulationArray;
+	Cm::SpatialVector*					motionVelocityArray;
+	PxU32*								bodyRemapTable;
+	PxU32*								mNodeIndexArray;
+
+	//Constraint info for normal constraint solver
+	PxSolverConstraintDesc*			contactConstraintDescArray;	// copied into mContactDescPtr by reset() and resizeArrays()
+	PxU32								contactDescArraySize;
+	PxSolverConstraintDesc*			orderedContactConstraints;
+	PxConstraintBatchHeader*			contactConstraintBatchHeaders;
+	PxU32								numContactConstraintBatches;
+
+	//Constraint info for partitioning
+	PxSolverConstraintDesc*			tempConstraintDescArray;
+
+	//Additional constraint info for 1d/2d friction model
+	Ps::Array<PxSolverConstraintDesc>	frictionConstraintDescArray;
+	Ps::Array<PxConstraintBatchHeader> frictionConstraintBatchHeaders;
+
+	//Info for tracking compound contact managers (temporary data - could use scratch memory!)
+	Ps::Array<CompoundContactManager> compoundConstraints;
+
+	//Used for sorting constraints. Temporary, could use scratch memory
+	Ps::Array<const PxsIndexedContactManager*> orderedContactList;
+	Ps::Array<const PxsIndexedContactManager*> tempContactList;
+	Ps::Array<PxU32>					sortIndexArray;
+
+	PxU32								numDifferentBodyBatchHeaders;
+	PxU32								numSelfConstraintBatchHeaders;
+
+	
+	PxU32								mOrderedContactDescCount;
+	PxU32								mOrderedFrictionDescCount;
+
+	PxU32								mConstraintSize;
+
+	PxU32 mAxisConstraintCount;
+	SelfConstraintBlock* mSelfConstraintBlocks;
+	
+	SelfConstraintBlock* mSelfConstraintFrictionBlocks;
+
+	PxU32 mMaxPartitions;
+	PxU32 mMaxFrictionPartitions;
+	PxU32 mMaxSolverPositionIterations;
+	PxU32 mMaxSolverVelocityIterations;
+	PxU32 mMaxArticulationLength;
+	PxU32 mMaxArticulationSolverLength;
+	
+	PxSolverConstraintDesc* mContactDescPtr;	// set to contactConstraintDescArray by reset() and resizeArrays()
+	PxSolverConstraintDesc* mStartContactDescPtr;
+	PxSolverConstraintDesc* mFrictionDescPtr;	// set to frictionConstraintDescArray.begin() by reset() and resizeArrays()
+
+private:
+
+	Ps::Array<ArticulationSolverDesc>	mArticulations;	// sized by resizeArrays(), exposed through getArticulations()
+
+#if PX_ENABLE_SIM_STATS
+	ThreadSimStats				mThreadSimStats;
+#endif
+
+	public:	// NOTE(review): empty access section - nothing is declared after it
+
+};
+
+}
+
+}
+
+#endif //DY_THREADCONTEXT_H
diff --git a/PhysX_3.4/Source/LowLevelDynamics/src/DyThresholdTable.cpp b/PhysX_3.4/Source/LowLevelDynamics/src/DyThresholdTable.cpp
new file mode 100644
index 00000000..b7b613f6
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelDynamics/src/DyThresholdTable.cpp
@@ -0,0 +1,68 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+#include "foundation/PxMemory.h"
+#include "DyThresholdTable.h"
+#include "PsHash.h"
+#include "PsUtilities.h"
+#include "PsAllocator.h"
+
+namespace physx
+{
+	namespace Dy
+	{
+		bool ThresholdTable::check(const ThresholdStream& stream, const PxU32 nodeIndexA, const PxU32 nodeIndexB, PxReal dt)
+		{
+			// Reports whether the pair's accumulated force exceeds the matching stream element's
+			// threshold scaled by dt. The pair may be given in either order; absent pairs yield false.
+			PxU32* PX_RESTRICT hashes = mHash;
+			PxU32* PX_RESTRICT nextIndices = mNexts;
+			Pair* PX_RESTRICT pairs = mPairs;
+			// Entries are matched on (smaller index, larger index), so canonicalise the query first.
+			const PxU32 nA = PxMin(nodeIndexA, nodeIndexB);
+			const PxU32 nB = PxMax(nodeIndexA, nodeIndexB);
+
+			// Hash the canonical pair so the probed bucket agrees with the comparison below
+			// (assumes the table is built from the stream elements' stored ordering - TODO confirm).
+			const PxU32 hashKey = computeHashKey(nA, nB, mHashSize);
+
+			PxU32 pairIndex = hashes[hashKey];
+			while(NO_INDEX != pairIndex)
+			{
+				Pair& pair = pairs[pairIndex];
+				const PxU32 thresholdStreamIndex = pair.thresholdStreamIndex;
+				PX_ASSERT(thresholdStreamIndex < stream.size());
+				const ThresholdStreamElement& otherElement = stream[thresholdStreamIndex];
+				if(otherElement.nodeIndexA==nA && otherElement.nodeIndexB==nB)
+					return (pair.accumulatedForce > (otherElement.threshold * dt));
+				pairIndex = nextIndices[pairIndex];	// follow the chain of entries sharing this hash key
+			}
+			return false;
+		}
+	}
+}
author	git perforce import user <a@b>	2016-10-25 12:29:14 -0600
committer	Sheikh Dawood Abdul Ajees <Sheikh Dawood Abdul Ajees>	2016-10-25 18:56:37 -0500
commit	3dfe2108cfab31ba3ee5527e217d0d8e99a51162 (patch)
tree	fa6485c169e50d7415a651bf838f5bcd0fd3bfbd /PhysX_3.4/Source/LowLevelDynamics/src
download	physx-3.4-3dfe2108cfab31ba3ee5527e217d0d8e99a51162.tar.xz physx-3.4-3dfe2108cfab31ba3ee5527e217d0d8e99a51162.zip