1 files changed, 594 insertions, 0 deletions
diff --git a/PhysX_3.4/Source/LowLevelDynamics/src/DyConstraintSetup.cpp b/PhysX_3.4/Source/LowLevelDynamics/src/DyConstraintSetup.cpp
new file mode 100644
index 00000000..c5777c12
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelDynamics/src/DyConstraintSetup.cpp
@@ -0,0 +1,594 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+#include "foundation/PxMemory.h"
+#include "DyConstraintPrep.h"
+#include "PxsRigidBody.h"
+#include "DySolverConstraint1D.h"
+#include "PsSort.h"
+#include "DySolverConstraintDesc.h"
+#include "PxcConstraintBlockStream.h"
+#include "DyArticulationContactPrep.h"
+#include "PsFoundation.h"
+
+namespace physx
+{
+namespace Dy
+{
+	// dsequeira:
+	//
+	// we can choose any linear combination of equality constraints and get the same solution
+	// Hence we can orthogonalize the constraints using the inner product given by the
+	// inverse mass matrix, so that when we use PGS, solving a constraint row for a joint 
+	// don't disturb the solution of prior rows.
+	//
+	// We also eliminate the equality constraints from the hard inequality constraints - 
+	// (essentially projecting the direction corresponding to the lagrange multiplier 
+	// onto the equality constraint subspace) but 'til I've verified this generates 
+	// exactly the same KKT/complementarity conditions, status is 'experimental'. 	
+	//
+	// since for equality constraints the resulting rows have the property that applying
+	// an impulse along one row doesn't alter the projected velocity along another row, 
+	// all equality constraints (plus one inequality constraint) can be processed in parallel
+	// using SIMD
+	//
+	// Eliminating the inequality constraints from each other would require a solver change
+	// and not give us any more parallelism, although we might get better convergence.
+
+namespace
+{
+	PX_FORCE_INLINE Vec3V V3FromV4(Vec4V x)			{ return Vec3V_From_Vec4V(x); }
+	PX_FORCE_INLINE Vec3V V3FromV4Unsafe(Vec4V x)	{ return Vec3V_From_Vec4V_WUndefined(x); }
+	PX_FORCE_INLINE Vec4V V4FromV3(Vec3V x)			{ return Vec4V_From_Vec3V(x); }
+	//PX_FORCE_INLINE Vec4V V4ClearW(Vec4V x)			{ return V4SetW(x, FZero()); }
+
+struct MassProps
+{
+	FloatV invMass0;
+	FloatV invMass1;
+	FloatV invInertiaScale0;
+	FloatV invInertiaScale1;
+
+	PX_FORCE_INLINE MassProps(const PxSolverBodyData& bd0,
+							  const PxSolverBodyData& bd1,
+							  const PxConstraintInvMassScale& ims)
+	:	
+		invMass0(FLoad(bd0.invMass * ims.linear0))
+	,	invMass1(FLoad(bd1.invMass * ims.linear1))
+	,	invInertiaScale0(FLoad(ims.angular0))
+	,	invInertiaScale1(FLoad(ims.angular1))
+	{}
+};
+
+
+PX_FORCE_INLINE PxReal innerProduct(const Px1DConstraint& row0, Px1DConstraint& row1, 
+								 PxVec4& row0AngSqrtInvInertia0, PxVec4& row0AngSqrtInvInertia1, 
+								 PxVec4& row1AngSqrtInvInertia0, PxVec4& row1AngSqrtInvInertia1, const MassProps& m)
+{
+	const Vec3V l0 = V3Mul(V3Scale(V3LoadA(row0.linear0), m.invMass0), V3LoadA(row1.linear0));
+	const Vec3V l1 = V3Mul(V3Scale(V3LoadA(row0.linear1), m.invMass1), V3LoadA(row1.linear1));
+	Vec4V r0ang0 = V4LoadA(&row0AngSqrtInvInertia0.x);
+	Vec4V r1ang0 = V4LoadA(&row1AngSqrtInvInertia0.x);
+	Vec4V r0ang1 = V4LoadA(&row0AngSqrtInvInertia1.x);
+	Vec4V r1ang1 = V4LoadA(&row1AngSqrtInvInertia1.x);
+
+	const Vec3V i0 = V3ScaleAdd(V3Mul(Vec3V_From_Vec4V(r0ang0), Vec3V_From_Vec4V(r1ang0)), m.invInertiaScale0, l0);
+	const Vec3V i1 = V3ScaleAdd(V3MulAdd(Vec3V_From_Vec4V(r0ang1), Vec3V_From_Vec4V(r1ang1), i0), m.invInertiaScale1, l1);
+	PxF32 f;
+	FStore(V3SumElems(i1), &f);
+	return f;
+}
+
+
+// indexed rotation around axis, with sine and cosine of half-angle
+PX_FORCE_INLINE PxQuat indexedRotation(PxU32 axis, PxReal s, PxReal c)
+{
+	PxQuat q(0,0,0,c);
+	reinterpret_cast<PxReal*>(&q)[axis] = s;
+	return q;
+}
+
+PxQuat diagonalize(const PxMat33& m)	// jacobi rotation using quaternions 
+{
+	const PxU32 MAX_ITERS = 5;
+
+	PxQuat q = PxQuat(PxIdentity);
+
+	PxMat33 d;
+	for(PxU32 i=0; i < MAX_ITERS;i++)
+	{
+		const PxMat33 axes(q);
+		d = axes.getTranspose() * m * axes;
+
+		const PxReal d0 = PxAbs(d[1][2]), d1 = PxAbs(d[0][2]), d2 = PxAbs(d[0][1]);
+		const PxU32 a = PxU32(d0 > d1 && d0 > d2 ? 0 : d1 > d2 ? 1 : 2);						// rotation axis index, from largest off-diagonal element
+
+		const PxU32 a1 = Ps::getNextIndex3(a), a2 = Ps::getNextIndex3(a1);											
+		if(d[a1][a2] == 0.0f || PxAbs(d[a1][a1]-d[a2][a2]) > 2e6f*PxAbs(2.0f*d[a1][a2]))
+			break;
+
+		const PxReal w = (d[a1][a1]-d[a2][a2]) / (2.0f*d[a1][a2]);					// cot(2 * phi), where phi is the rotation angle
+		const PxReal absw = PxAbs(w);
+
+		PxQuat r;
+		if(absw>1000)
+			r = indexedRotation(a, 1.0f/(4.0f*w), 1.f);									// h will be very close to 1, so use small angle approx instead
+		else
+		{
+  			const PxReal t = 1 / (absw + PxSqrt(w*w+1));								// absolute value of tan phi
+			const PxReal h = 1 / PxSqrt(t*t+1);										// absolute value of cos phi
+
+			PX_ASSERT(h!=1);													// |w|<1000 guarantees this with typical IEEE754 machine eps (approx 6e-8)
+			r = indexedRotation(a, PxSqrt((1-h)/2) * PxSign(w), PxSqrt((1+h)/2));
+		}
+	
+		q = (q*r).getNormalized();
+	}
+
+	return q;
+}
+
+
+PX_FORCE_INLINE void rescale(const Mat33V& m, PxVec3& a0, PxVec3& a1, PxVec3& a2)
+{
+	const Vec3V va0 = V3LoadU(a0);
+	const Vec3V va1 = V3LoadU(a1);
+	const Vec3V va2 = V3LoadU(a2);
+
+	const Vec3V b0 = V3ScaleAdd(va0, V3GetX(m.col0), V3ScaleAdd(va1, V3GetY(m.col0), V3Scale(va2, V3GetZ(m.col0))));
+	const Vec3V b1 = V3ScaleAdd(va0, V3GetX(m.col1), V3ScaleAdd(va1, V3GetY(m.col1), V3Scale(va2, V3GetZ(m.col1))));
+	const Vec3V b2 = V3ScaleAdd(va0, V3GetX(m.col2), V3ScaleAdd(va1, V3GetY(m.col2), V3Scale(va2, V3GetZ(m.col2))));
+
+	V3StoreU(b0, a0);
+	V3StoreU(b1, a1);
+	V3StoreU(b2, a2);
+}
+
+PX_FORCE_INLINE void rescale4(const Mat33V& m, PxReal* a0, PxReal* a1, PxReal* a2)
+{
+	const Vec4V va0 = V4LoadA(a0);
+	const Vec4V va1 = V4LoadA(a1);
+	const Vec4V va2 = V4LoadA(a2);
+
+	const Vec4V b0 = V4ScaleAdd(va0, V3GetX(m.col0), V4ScaleAdd(va1, V3GetY(m.col0), V4Scale(va2, V3GetZ(m.col0))));
+	const Vec4V b1 = V4ScaleAdd(va0, V3GetX(m.col1), V4ScaleAdd(va1, V3GetY(m.col1), V4Scale(va2, V3GetZ(m.col1))));
+	const Vec4V b2 = V4ScaleAdd(va0, V3GetX(m.col2), V4ScaleAdd(va1, V3GetY(m.col2), V4Scale(va2, V3GetZ(m.col2))));
+
+	V4StoreA(b0, a0);
+	V4StoreA(b1, a1);
+	V4StoreA(b2, a2);
+}
+
+
+template<typename T>
+PX_FORCE_INLINE void rescale(const PxMat33& m, T& a0, T& a1, T& a2)
+{
+	T b0 = a0*m(0,0) + a1 * m(1,0) + a2 * m(2,0);
+	T b1 = a0*m(0,1) + a1 * m(1,1) + a2 * m(2,1);
+	T b2 = a0*m(0,2) + a1 * m(1,2) + a2 * m(2,2);
+
+	a0 = b0;
+	a1 = b1;
+	a2 = b2;
+}
+
+void diagonalize(Px1DConstraint** row,
+				 PxVec4* angSqrtInvInertia0,
+				 PxVec4* angSqrtInvInertia1,
+				 const MassProps &m)
+{
+	PxReal a00 = innerProduct(*row[0], *row[0], angSqrtInvInertia0[0], angSqrtInvInertia1[0], angSqrtInvInertia0[0], angSqrtInvInertia1[0], m);
+	PxReal a01 = innerProduct(*row[0], *row[1], angSqrtInvInertia0[0], angSqrtInvInertia1[0], angSqrtInvInertia0[1], angSqrtInvInertia1[1], m);
+	PxReal a02 = innerProduct(*row[0], *row[2], angSqrtInvInertia0[0], angSqrtInvInertia1[0], angSqrtInvInertia0[2], angSqrtInvInertia1[2], m);
+	PxReal a11 = innerProduct(*row[1], *row[1], angSqrtInvInertia0[1], angSqrtInvInertia1[1], angSqrtInvInertia0[1], angSqrtInvInertia1[1], m);
+	PxReal a12 = innerProduct(*row[1], *row[2], angSqrtInvInertia0[1], angSqrtInvInertia1[1], angSqrtInvInertia0[2], angSqrtInvInertia1[2], m);
+	PxReal a22 = innerProduct(*row[2], *row[2], angSqrtInvInertia0[2], angSqrtInvInertia1[2], angSqrtInvInertia0[2], angSqrtInvInertia1[2], m);
+
+	PxMat33 a(PxVec3(a00, a01, a02),
+			  PxVec3(a01, a11, a12),
+			  PxVec3(a02, a12, a22));
+
+	PxQuat q = diagonalize(a);
+
+	PxMat33 n(-q);
+
+	Mat33V mn(V3LoadU(n.column0), V3LoadU(n.column1), V3LoadU(n.column2));
+
+	//KS - We treat as a Vec4V so that we get geometricError rescaled for free along with linear0
+	rescale4(mn, &row[0]->linear0.x, &row[1]->linear0.x, &row[2]->linear0.x);
+	rescale(mn, row[0]->linear1, row[1]->linear1, row[2]->linear1);
+	//KS - We treat as a PxVec4 so that we get velocityTarget rescaled for free 
+	rescale4(mn, &row[0]->angular0.x, &row[1]->angular0.x, &row[2]->angular0.x);
+	rescale(mn, row[0]->angular1, row[1]->angular1, row[2]->angular1);
+	rescale4(mn, &angSqrtInvInertia0[0].x, &angSqrtInvInertia0[1].x, &angSqrtInvInertia0[2].x);
+	rescale4(mn, &angSqrtInvInertia1[0].x, &angSqrtInvInertia1[1].x, &angSqrtInvInertia1[2].x);
+	
+}
+
+void orthogonalize(Px1DConstraint** row,
+				   PxVec4* angSqrtInvInertia0,
+				   PxVec4* angSqrtInvInertia1,
+				   PxU32 rowCount,
+				   PxU32 eqRowCount,
+				   const MassProps &m)
+{
+	PX_ASSERT(eqRowCount<=6);
+
+	const FloatV zero = FZero();
+
+	Vec3V lin1m[6], ang1m[6], lin1[6], ang1[6];	
+	Vec4V lin0m[6], ang0m[6];			// must have 0 in the W-field
+	Vec4V lin0AndG[6], ang0AndT[6];
+
+	for(PxU32 i=0;i<rowCount;i++)
+	{
+		Vec4V l0AndG = V4LoadA(&row[i]->linear0.x);		// linear0 and geometric error
+		Vec4V a0AndT = V4LoadA(&row[i]->angular0.x);	// angular0 and velocity target
+
+		Vec3V l1 = V3FromV4(V4LoadA(&row[i]->linear1.x));
+		Vec3V a1 = V3FromV4(V4LoadA(&row[i]->angular1.x));
+
+		Vec4V angSqrtL0 = V4LoadA(&angSqrtInvInertia0[i].x);
+		Vec4V angSqrtL1 = V4LoadA(&angSqrtInvInertia1[i].x);
+
+		PxU32 eliminationRows = PxMin<PxU32>(i, eqRowCount);
+		for(PxU32 j=0;j<eliminationRows;j++)
+		{
+			const Vec3V s0 = V3MulAdd(l1, lin1m[j], V3FromV4Unsafe(V4Mul(l0AndG, lin0m[j])));
+			const Vec3V s1 = V3MulAdd(V3FromV4Unsafe(angSqrtL1), ang1m[j], V3FromV4Unsafe(V4Mul(angSqrtL0, ang0m[j])));
+			FloatV t = V3SumElems(V3Add(s0, s1));
+
+			l0AndG = V4NegScaleSub(lin0AndG[j], t, l0AndG);
+			a0AndT = V4NegScaleSub(ang0AndT[j], t, a0AndT);
+			l1 = V3NegScaleSub(lin1[j], t, l1);
+			a1 = V3NegScaleSub(ang1[j], t, a1);
+			angSqrtL0 = V4NegScaleSub(V4LoadA(&angSqrtInvInertia0[j].x), t, angSqrtL0);
+			angSqrtL1 = V4NegScaleSub(V4LoadA(&angSqrtInvInertia1[j].x), t, angSqrtL1);
+		}
+
+		V4StoreA(l0AndG, &row[i]->linear0.x);
+		V4StoreA(a0AndT, &row[i]->angular0.x);
+		V3StoreA(l1, row[i]->linear1);
+		V3StoreA(a1, row[i]->angular1);
+		V4StoreA(angSqrtL0, &angSqrtInvInertia0[i].x);
+		V4StoreA(angSqrtL1, &angSqrtInvInertia1[i].x);
+
+		if(i<eqRowCount)
+		{
+			lin0AndG[i] = l0AndG;	
+			ang0AndT[i] = a0AndT;
+			lin1[i] = l1;	
+			ang1[i] = a1;	
+			
+			const Vec3V l0 = V3FromV4(l0AndG);
+
+			const Vec3V l0m = V3Scale(l0, m.invMass0);
+			const Vec3V l1m = V3Scale(l1, m.invMass1);
+			const Vec4V a0m = V4Scale(angSqrtL0, m.invInertiaScale0);
+			const Vec4V a1m = V4Scale(angSqrtL1, m.invInertiaScale1);
+
+			const Vec3V s0 = V3MulAdd(l0, l0m, V3Mul(l1, l1m));
+			const Vec4V s1 = V4MulAdd(a0m, angSqrtL0, V4Mul(a1m, angSqrtL1));
+			const FloatV s = V3SumElems(V3Add(s0, V3FromV4Unsafe(s1)));
+			const FloatV a = FSel(FIsGrtr(s, zero), FRecip(s), zero);	// with mass scaling, it's possible for the inner product of a row to be zero
+
+			lin0m[i] = V4Scale(V4ClearW(V4FromV3(l0m)), a);	
+			ang0m[i] = V4Scale(V4ClearW(a0m), a);
+			lin1m[i] = V3Scale(l1m, a);
+			ang1m[i] = V3Scale(V3FromV4Unsafe(a1m), a);
+		}
+	}
+}
+}
+
+
+void preprocessRows(Px1DConstraint** sorted, 
+					Px1DConstraint* rows,
+					PxVec4* angSqrtInvInertia0,
+					PxVec4* angSqrtInvInertia1,
+					PxU32 rowCount,
+					const PxSolverBodyData& bd0,
+					const PxSolverBodyData& bd1,
+					const PxConstraintInvMassScale& ims,
+					bool disablePreprocessing,
+					bool diagonalizeDrive)
+{
+	// j is maxed at 12, typically around 7, so insertion sort is fine
+	for(PxU32 i=0; i<rowCount; i++)
+	{
+		Px1DConstraint* r = rows+i;
+		
+		PxU32 j = i;
+		for(;j>0 && r->solveHint < sorted[j-1]->solveHint; j--)
+			sorted[j] = sorted[j-1];
+
+		sorted[j] = r;
+	}
+
+	for(PxU32 i=0;i<rowCount-1;i++)
+		PX_ASSERT(sorted[i]->solveHint <= sorted[i+1]->solveHint);
+
+	for (PxU32 i = 0; i<rowCount; i++)
+		rows[i].forInternalUse = rows[i].flags & Px1DConstraintFlag::eKEEPBIAS ? rows[i].geometricError : 0;
+
+
+	const Mat33V sqrtInvInertia0 = Mat33V(V3LoadU(bd0.sqrtInvInertia.column0), V3LoadU(bd0.sqrtInvInertia.column1),
+		V3LoadU(bd0.sqrtInvInertia.column2));
+
+	const Mat33V sqrtInvInertia1 = Mat33V(V3LoadU(bd1.sqrtInvInertia.column0), V3LoadU(bd1.sqrtInvInertia.column1),
+		V3LoadU(bd1.sqrtInvInertia.column2));
+
+	PX_ASSERT(((uintptr_t(angSqrtInvInertia0)) & 0xF) == 0);
+	PX_ASSERT(((uintptr_t(angSqrtInvInertia1)) & 0xF) == 0);
+
+	for(PxU32 i = 0; i < rowCount; ++i)
+	{
+		const Vec3V angDelta0 = M33MulV3(sqrtInvInertia0, V3LoadU(sorted[i]->angular0));
+		const Vec3V angDelta1 = M33MulV3(sqrtInvInertia1, V3LoadU(sorted[i]->angular1));
+		V4StoreA(Vec4V_From_Vec3V(angDelta0), &angSqrtInvInertia0[i].x);
+		V4StoreA(Vec4V_From_Vec3V(angDelta1), &angSqrtInvInertia1[i].x);
+	}
+
+	if(disablePreprocessing)
+		return;
+
+	MassProps m(bd0, bd1, ims);
+	for(PxU32 i=0;i<rowCount;)
+	{
+		const PxU32 groupMajorId = PxU32(sorted[i]->solveHint>>8), start = i++;
+		while(i<rowCount && PxU32(sorted[i]->solveHint>>8) == groupMajorId)
+			i++;
+
+		if(groupMajorId == 4)
+		{
+			PxU32 bCount = start;		// count of bilateral constraints 
+			for(; bCount<i && (sorted[bCount]->solveHint&255)==0; bCount++)
+				;
+			orthogonalize(sorted+start, angSqrtInvInertia0+start, angSqrtInvInertia1+start, i-start, bCount-start, m);
+		}
+
+		if(groupMajorId == 1 && diagonalizeDrive)
+		{			
+			PxU32 slerp = start;		// count of bilateral constraints 
+			for(; slerp<i && (sorted[slerp]->solveHint&255)!=2; slerp++)
+				;
+			if(slerp+3 == i)
+				diagonalize(sorted+slerp, angSqrtInvInertia0+slerp, angSqrtInvInertia1+slerp, m);
+
+			PX_ASSERT(i-start==3);
+			diagonalize(sorted+start, angSqrtInvInertia0+start, angSqrtInvInertia1+start, m);
+		}
+	}
+}
+
+
+
+
+
+PxU32 ConstraintHelper::setupSolverConstraint(
+PxSolverConstraintPrepDesc& prepDesc,
+PxConstraintAllocator& allocator,
+PxReal dt, PxReal invdt)
+{
+	if (prepDesc .numRows== 0)
+		return 0;
+
+	PxSolverConstraintDesc& desc = *prepDesc.desc;
+
+	bool isExtended = desc.linkIndexA != PxSolverConstraintDesc::NO_LINK
+		|| desc.linkIndexB != PxSolverConstraintDesc::NO_LINK;
+
+	PxU32 stride = isExtended ? sizeof(SolverConstraint1DExt) : sizeof(SolverConstraint1D);
+	const PxU32 constraintLength = sizeof(SolverConstraint1DHeader) + stride * prepDesc.numRows;
+	
+	//KS - +16 is for the constraint progress counter, which needs to be the last element in the constraint (so that we
+	//know SPU DMAs have completed)
+	PxU8* ptr = allocator.reserveConstraintData(constraintLength + 16u);
+	if(NULL == ptr || (reinterpret_cast<PxU8*>(-1))==ptr)
+	{
+		if(NULL==ptr)
+		{
+			PX_WARN_ONCE(
+				"Reached limit set by PxSceneDesc::maxNbContactDataBlocks - ran out of buffer space for constraint prep. "
+				"Either accept joints detaching/exploding or increase buffer size allocated for constraint prep by increasing PxSceneDesc::maxNbContactDataBlocks.");
+			return 0;
+		}
+		else
+		{
+			PX_WARN_ONCE(
+				"Attempting to allocate more than 16K of constraint data. "
+				"Either accept joints detaching/exploding or simplify constraints.");
+			ptr=NULL;
+			return 0;
+		}
+	}
+	desc.constraint = ptr;
+
+	setConstraintLength(desc,constraintLength);
+
+	desc.writeBack = prepDesc.writeback;
+	setWritebackLength(desc, sizeof(ConstraintWriteback));
+
+	memset(desc.constraint, 0, constraintLength);
+
+	SolverConstraint1DHeader* header = reinterpret_cast<SolverConstraint1DHeader*>(desc.constraint);
+	PxU8* constraints = desc.constraint + sizeof(SolverConstraint1DHeader);
+	init(*header, Ps::to8(prepDesc.numRows), isExtended, prepDesc.mInvMassScales);
+	header->body0WorldOffset = prepDesc.body0WorldOffset;
+	header->linBreakImpulse = prepDesc.linBreakForce * dt;
+	header->angBreakImpulse = prepDesc.angBreakForce * dt;
+	header->breakable = PxU8((prepDesc.linBreakForce != PX_MAX_F32) || (prepDesc.angBreakForce != PX_MAX_F32));
+	header->invMass0D0 = prepDesc.data0->invMass * prepDesc.mInvMassScales.linear0;
+	header->invMass1D1 = prepDesc.data1->invMass * prepDesc.mInvMassScales.linear1;
+
+
+	PX_ALIGN(16, PxVec4) angSqrtInvInertia0[MAX_CONSTRAINT_ROWS];
+	PX_ALIGN(16, PxVec4) angSqrtInvInertia1[MAX_CONSTRAINT_ROWS];
+	
+	Px1DConstraint* sorted[MAX_CONSTRAINT_ROWS];
+
+	preprocessRows(sorted, prepDesc.rows, angSqrtInvInertia0, angSqrtInvInertia1, prepDesc.numRows, *prepDesc.data0, *prepDesc.data1, prepDesc.mInvMassScales,
+		isExtended || prepDesc.disablePreprocessing, prepDesc.improvedSlerp);
+
+	const PxReal erp = 1.0f;
+	for (PxU32 i = 0; i<prepDesc.numRows; i++)
+	{
+		Ps::prefetchLine(constraints, 128);
+		SolverConstraint1D &s = *reinterpret_cast<SolverConstraint1D *>(constraints);
+		Px1DConstraint& c = *sorted[i];
+
+		PxReal driveScale = c.flags&Px1DConstraintFlag::eHAS_DRIVE_LIMIT && prepDesc.driveLimitsAreForces ? PxMin(dt, 1.0f) : 1.0f;
+
+		PxReal unitResponse;
+		PxReal normalVel = 0.0f;
+		PxReal initVel = 0.f;
+
+		if(!isExtended)
+		{
+			init(s, c.linear0, c.linear1, PxVec3(angSqrtInvInertia0[i].x, angSqrtInvInertia0[i].y, angSqrtInvInertia0[i].z),
+				PxVec3(angSqrtInvInertia1[i].x, angSqrtInvInertia1[i].y, angSqrtInvInertia1[i].z), c.minImpulse * driveScale, c.maxImpulse * driveScale);
+			s.ang0Writeback = c.angular0;
+			PxReal resp0 = s.lin0.magnitudeSquared() * prepDesc.data0->invMass * prepDesc.mInvMassScales.linear0 + s.ang0.magnitudeSquared() * prepDesc.mInvMassScales.angular0;
+			PxReal resp1 = s.lin1.magnitudeSquared() * prepDesc.data1->invMass * prepDesc.mInvMassScales.linear1 + s.ang1.magnitudeSquared() * prepDesc.mInvMassScales.angular1;
+			unitResponse = resp0 + resp1;
+			initVel = normalVel = prepDesc.data0->projectVelocity(c.linear0, c.angular0) - prepDesc.data1->projectVelocity(c.linear1, c.angular1);
+		}
+		else
+		{
+			init(s, c.linear0, c.linear1, c.angular0, c.angular1, c.minImpulse * driveScale, c.maxImpulse * driveScale);
+			SolverConstraint1DExt& e = static_cast<SolverConstraint1DExt&>(s);
+
+			const SolverExtBody eb0(reinterpret_cast<const void*>(prepDesc.body0), prepDesc.data0, desc.linkIndexA);
+			const SolverExtBody eb1(reinterpret_cast<const void*>(prepDesc.body1), prepDesc.data1, desc.linkIndexB);
+
+			const Cm::SpatialVector resp0 = createImpulseResponseVector(e.lin0, e.ang0, eb0);
+			const Cm::SpatialVector resp1 = createImpulseResponseVector(-e.lin1, -e.ang1, eb1);
+			unitResponse = getImpulseResponse(eb0, resp0, unsimdRef(e.deltaVA), prepDesc.mInvMassScales.linear0, prepDesc.mInvMassScales.angular0,
+				eb1, resp1, unsimdRef(e.deltaVB), prepDesc.mInvMassScales.linear1, prepDesc.mInvMassScales.angular1, true);
+
+			s.ang0Writeback = c.angular0;
+			s.lin0 = resp0.linear;
+			s.ang0 = resp0.angular;
+			s.lin1 = -resp1.linear;
+			s.ang1 = -resp1.angular;
+			PxReal vel0, vel1;
+			if(needsNormalVel(c) || eb0.mLinkIndex == PxSolverConstraintDesc::NO_LINK || eb1.mLinkIndex == PxSolverConstraintDesc::NO_LINK)
+			{
+				vel0 = eb0.projectVelocity(c.linear0, c.angular0);
+				vel1 = eb1.projectVelocity(c.linear1, c.angular1);
+
+				normalVel = vel0 - vel1;
+
+				//normalVel = eb0.projectVelocity(s.lin0, s.ang0) - eb1.projectVelocity(s.lin1, s.ang1);
+				if(eb0.mLinkIndex == PxSolverConstraintDesc::NO_LINK)
+					initVel = vel0;
+				else if(eb1.mLinkIndex == PxSolverConstraintDesc::NO_LINK)
+					initVel = -vel1;
+
+			}
+		}
+
+		setSolverConstants(s.constant, s.unbiasedConstant, s.velMultiplier, s.impulseMultiplier, 
+			c, normalVel, unitResponse, prepDesc.minResponseThreshold, erp, dt, invdt);
+
+		//s.targetVelocity = initVel;
+		const PxReal velBias = initVel * s.velMultiplier;
+		s.constant += velBias;
+		s.unbiasedConstant += velBias;
+
+		if(c.flags & Px1DConstraintFlag::eOUTPUT_FORCE)
+			s.flags |= DY_SC_FLAG_OUTPUT_FORCE;
+
+		constraints += stride;
+	}
+
+	//KS - Set the solve count at the end to 0 
+	*(reinterpret_cast<PxU32*>(constraints)) = 0;
+	*(reinterpret_cast<PxU32*>(constraints + 4)) = 0;
+	PX_ASSERT(desc.constraint + getConstraintLength(desc) == constraints);
+	return prepDesc.numRows;
+}
+
+PxU32 SetupSolverConstraint(SolverConstraintShaderPrepDesc& shaderDesc,
+	PxSolverConstraintPrepDesc& prepDesc,
+	PxConstraintAllocator& allocator,
+	PxReal dt, PxReal invdt)
+{
+	// LL shouldn't see broken constraints
+	
+	PX_ASSERT(!(reinterpret_cast<ConstraintWriteback*>(prepDesc.writeback)->broken));
+
+	setConstraintLength(*prepDesc.desc, 0);
+
+	if (!shaderDesc.solverPrep)
+		return 0;
+
+	//PxU32 numAxisConstraints = 0;
+
+	Px1DConstraint rows[MAX_CONSTRAINT_ROWS];
+
+	// This is necessary so that there will be sensible defaults and shaders will
+	// continue to work (albeit with a recompile) if the row format changes.
+	// It's a bit inefficient because it fills in all constraint rows even if there
+	// is only going to be one generated. A way around this would be for the shader to
+	// specify the maximum number of rows it needs, or it could call a subroutine to
+	// prep the row before it starts filling it it.
+
+	PxMemZero(rows, sizeof(Px1DConstraint)*MAX_CONSTRAINT_ROWS);
+
+	for (PxU32 i = 0; i<MAX_CONSTRAINT_ROWS; i++)
+	{
+		Px1DConstraint& c = rows[i];
+		//Px1DConstraintInit(c);
+		c.minImpulse = -PX_MAX_REAL;
+		c.maxImpulse = PX_MAX_REAL;
+	}
+
+	prepDesc.mInvMassScales.linear0 = prepDesc.mInvMassScales.linear1 = prepDesc.mInvMassScales.angular0 = prepDesc.mInvMassScales.angular1 = 1.f;
+
+	PxVec3 body0WorldOffset(0.f);
+	PxU32 constraintCount = (*shaderDesc.solverPrep)(rows,
+		body0WorldOffset,
+		MAX_CONSTRAINT_ROWS,
+		prepDesc.mInvMassScales,
+		shaderDesc.constantBlock,
+		prepDesc.bodyFrame0, prepDesc.bodyFrame1);
+
+	prepDesc.rows = rows;
+	prepDesc.numRows = constraintCount;
+
+	prepDesc.body0WorldOffset = body0WorldOffset;
+
+	return ConstraintHelper::setupSolverConstraint(prepDesc, allocator, dt, invdt);
+}
+
+}
+
+}