PhysX_3.4/Source/LowLevelDynamics/src/DyConstraintSetup.cpp


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587

//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//  * Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
//  * Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
//  * Neither the name of NVIDIA CORPORATION nor the names of its
//    contributors may be used to endorse or promote products derived
//    from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2018 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  


#include "foundation/PxMemory.h"
#include "DyConstraintPrep.h"
#include "PxsRigidBody.h"
#include "DySolverConstraint1D.h"
#include "PsSort.h"
#include "DySolverConstraintDesc.h"
#include "PxcConstraintBlockStream.h"
#include "DyArticulationContactPrep.h"
#include "PsFoundation.h"

namespace physx
{
namespace Dy
{
	// dsequeira:
	//
	// we can choose any linear combination of equality constraints and get the same solution
	// Hence we can orthogonalize the constraints using the inner product given by the
	// inverse mass matrix, so that when we use PGS, solving a constraint row for a joint 
	// don't disturb the solution of prior rows.
	//
	// We also eliminate the equality constraints from the hard inequality constraints - 
	// (essentially projecting the direction corresponding to the lagrange multiplier 
	// onto the equality constraint subspace) but 'til I've verified this generates 
	// exactly the same KKT/complementarity conditions, status is 'experimental'. 	
	//
	// since for equality constraints the resulting rows have the property that applying
	// an impulse along one row doesn't alter the projected velocity along another row, 
	// all equality constraints (plus one inequality constraint) can be processed in parallel
	// using SIMD
	//
	// Eliminating the inequality constraints from each other would require a solver change
	// and not give us any more parallelism, although we might get better convergence.

namespace
{
	PX_FORCE_INLINE Vec3V V3FromV4(Vec4V x)			{ return Vec3V_From_Vec4V(x); }
	PX_FORCE_INLINE Vec3V V3FromV4Unsafe(Vec4V x)	{ return Vec3V_From_Vec4V_WUndefined(x); }
	PX_FORCE_INLINE Vec4V V4FromV3(Vec3V x)			{ return Vec4V_From_Vec3V(x); }
	//PX_FORCE_INLINE Vec4V V4ClearW(Vec4V x)			{ return V4SetW(x, FZero()); }

struct MassProps
{
	FloatV invMass0;
	FloatV invMass1;
	FloatV invInertiaScale0;
	FloatV invInertiaScale1;

	PX_FORCE_INLINE MassProps(const PxSolverBodyData& bd0,
							  const PxSolverBodyData& bd1,
							  const PxConstraintInvMassScale& ims)
	:	
		invMass0(FLoad(bd0.invMass * ims.linear0))
	,	invMass1(FLoad(bd1.invMass * ims.linear1))
	,	invInertiaScale0(FLoad(ims.angular0))
	,	invInertiaScale1(FLoad(ims.angular1))
	{}
};


PX_FORCE_INLINE PxReal innerProduct(const Px1DConstraint& row0, Px1DConstraint& row1, 
								 PxVec4& row0AngSqrtInvInertia0, PxVec4& row0AngSqrtInvInertia1, 
								 PxVec4& row1AngSqrtInvInertia0, PxVec4& row1AngSqrtInvInertia1, const MassProps& m)
{
	const Vec3V l0 = V3Mul(V3Scale(V3LoadA(row0.linear0), m.invMass0), V3LoadA(row1.linear0));
	const Vec3V l1 = V3Mul(V3Scale(V3LoadA(row0.linear1), m.invMass1), V3LoadA(row1.linear1));
	Vec4V r0ang0 = V4LoadA(&row0AngSqrtInvInertia0.x);
	Vec4V r1ang0 = V4LoadA(&row1AngSqrtInvInertia0.x);
	Vec4V r0ang1 = V4LoadA(&row0AngSqrtInvInertia1.x);
	Vec4V r1ang1 = V4LoadA(&row1AngSqrtInvInertia1.x);

	const Vec3V i0 = V3ScaleAdd(V3Mul(Vec3V_From_Vec4V(r0ang0), Vec3V_From_Vec4V(r1ang0)), m.invInertiaScale0, l0);
	const Vec3V i1 = V3ScaleAdd(V3MulAdd(Vec3V_From_Vec4V(r0ang1), Vec3V_From_Vec4V(r1ang1), i0), m.invInertiaScale1, l1);
	PxF32 f;
	FStore(V3SumElems(i1), &f);
	return f;
}


// indexed rotation around axis, with sine and cosine of half-angle
PX_FORCE_INLINE PxQuat indexedRotation(PxU32 axis, PxReal s, PxReal c)
{
	PxQuat q(0,0,0,c);
	reinterpret_cast<PxReal*>(&q)[axis] = s;
	return q;
}

PxQuat diagonalize(const PxMat33& m)	// jacobi rotation using quaternions 
{
	const PxU32 MAX_ITERS = 5;

	PxQuat q = PxQuat(PxIdentity);

	PxMat33 d;
	for(PxU32 i=0; i < MAX_ITERS;i++)
	{
		const PxMat33 axes(q);
		d = axes.getTranspose() * m * axes;

		const PxReal d0 = PxAbs(d[1][2]), d1 = PxAbs(d[0][2]), d2 = PxAbs(d[0][1]);
		const PxU32 a = PxU32(d0 > d1 && d0 > d2 ? 0 : d1 > d2 ? 1 : 2);						// rotation axis index, from largest off-diagonal element

		const PxU32 a1 = Ps::getNextIndex3(a), a2 = Ps::getNextIndex3(a1);											
		if(d[a1][a2] == 0.0f || PxAbs(d[a1][a1]-d[a2][a2]) > 2e6f*PxAbs(2.0f*d[a1][a2]))
			break;

		const PxReal w = (d[a1][a1]-d[a2][a2]) / (2.0f*d[a1][a2]);					// cot(2 * phi), where phi is the rotation angle
		const PxReal absw = PxAbs(w);

		PxQuat r;
		if(absw>1000)
			r = indexedRotation(a, 1.0f/(4.0f*w), 1.f);									// h will be very close to 1, so use small angle approx instead
		else
		{
  			const PxReal t = 1 / (absw + PxSqrt(w*w+1));								// absolute value of tan phi
			const PxReal h = 1 / PxSqrt(t*t+1);										// absolute value of cos phi

			PX_ASSERT(h!=1);													// |w|<1000 guarantees this with typical IEEE754 machine eps (approx 6e-8)
			r = indexedRotation(a, PxSqrt((1-h)/2) * PxSign(w), PxSqrt((1+h)/2));
		}
	
		q = (q*r).getNormalized();
	}

	return q;
}


PX_FORCE_INLINE void rescale(const Mat33V& m, PxVec3& a0, PxVec3& a1, PxVec3& a2)
{
	const Vec3V va0 = V3LoadU(a0);
	const Vec3V va1 = V3LoadU(a1);
	const Vec3V va2 = V3LoadU(a2);

	const Vec3V b0 = V3ScaleAdd(va0, V3GetX(m.col0), V3ScaleAdd(va1, V3GetY(m.col0), V3Scale(va2, V3GetZ(m.col0))));
	const Vec3V b1 = V3ScaleAdd(va0, V3GetX(m.col1), V3ScaleAdd(va1, V3GetY(m.col1), V3Scale(va2, V3GetZ(m.col1))));
	const Vec3V b2 = V3ScaleAdd(va0, V3GetX(m.col2), V3ScaleAdd(va1, V3GetY(m.col2), V3Scale(va2, V3GetZ(m.col2))));

	V3StoreU(b0, a0);
	V3StoreU(b1, a1);
	V3StoreU(b2, a2);
}

PX_FORCE_INLINE void rescale4(const Mat33V& m, PxReal* a0, PxReal* a1, PxReal* a2)
{
	const Vec4V va0 = V4LoadA(a0);
	const Vec4V va1 = V4LoadA(a1);
	const Vec4V va2 = V4LoadA(a2);

	const Vec4V b0 = V4ScaleAdd(va0, V3GetX(m.col0), V4ScaleAdd(va1, V3GetY(m.col0), V4Scale(va2, V3GetZ(m.col0))));
	const Vec4V b1 = V4ScaleAdd(va0, V3GetX(m.col1), V4ScaleAdd(va1, V3GetY(m.col1), V4Scale(va2, V3GetZ(m.col1))));
	const Vec4V b2 = V4ScaleAdd(va0, V3GetX(m.col2), V4ScaleAdd(va1, V3GetY(m.col2), V4Scale(va2, V3GetZ(m.col2))));

	V4StoreA(b0, a0);
	V4StoreA(b1, a1);
	V4StoreA(b2, a2);
}

void diagonalize(Px1DConstraint** row,
				 PxVec4* angSqrtInvInertia0,
				 PxVec4* angSqrtInvInertia1,
				 const MassProps &m)
{
	PxReal a00 = innerProduct(*row[0], *row[0], angSqrtInvInertia0[0], angSqrtInvInertia1[0], angSqrtInvInertia0[0], angSqrtInvInertia1[0], m);
	PxReal a01 = innerProduct(*row[0], *row[1], angSqrtInvInertia0[0], angSqrtInvInertia1[0], angSqrtInvInertia0[1], angSqrtInvInertia1[1], m);
	PxReal a02 = innerProduct(*row[0], *row[2], angSqrtInvInertia0[0], angSqrtInvInertia1[0], angSqrtInvInertia0[2], angSqrtInvInertia1[2], m);
	PxReal a11 = innerProduct(*row[1], *row[1], angSqrtInvInertia0[1], angSqrtInvInertia1[1], angSqrtInvInertia0[1], angSqrtInvInertia1[1], m);
	PxReal a12 = innerProduct(*row[1], *row[2], angSqrtInvInertia0[1], angSqrtInvInertia1[1], angSqrtInvInertia0[2], angSqrtInvInertia1[2], m);
	PxReal a22 = innerProduct(*row[2], *row[2], angSqrtInvInertia0[2], angSqrtInvInertia1[2], angSqrtInvInertia0[2], angSqrtInvInertia1[2], m);

	PxMat33 a(PxVec3(a00, a01, a02),
			  PxVec3(a01, a11, a12),
			  PxVec3(a02, a12, a22));

	PxQuat q = diagonalize(a);

	PxMat33 n(-q);

	Mat33V mn(V3LoadU(n.column0), V3LoadU(n.column1), V3LoadU(n.column2));

	//KS - We treat as a Vec4V so that we get geometricError rescaled for free along with linear0
	rescale4(mn, &row[0]->linear0.x, &row[1]->linear0.x, &row[2]->linear0.x);
	rescale(mn, row[0]->linear1, row[1]->linear1, row[2]->linear1);
	//KS - We treat as a PxVec4 so that we get velocityTarget rescaled for free 
	rescale4(mn, &row[0]->angular0.x, &row[1]->angular0.x, &row[2]->angular0.x);
	rescale(mn, row[0]->angular1, row[1]->angular1, row[2]->angular1);
	rescale4(mn, &angSqrtInvInertia0[0].x, &angSqrtInvInertia0[1].x, &angSqrtInvInertia0[2].x);
	rescale4(mn, &angSqrtInvInertia1[0].x, &angSqrtInvInertia1[1].x, &angSqrtInvInertia1[2].x);
	
}

void orthogonalize(Px1DConstraint** row,
				   PxVec4* angSqrtInvInertia0,
				   PxVec4* angSqrtInvInertia1,
				   PxU32 rowCount,
				   PxU32 eqRowCount,
				   const MassProps &m)
{
	PX_ASSERT(eqRowCount<=6);

	const FloatV zero = FZero();

	Vec3V lin1m[6], ang1m[6], lin1[6], ang1[6];	
	Vec4V lin0m[6], ang0m[6];			// must have 0 in the W-field
	Vec4V lin0AndG[6], ang0AndT[6];

	for(PxU32 i=0;i<rowCount;i++)
	{
		Vec4V l0AndG = V4LoadA(&row[i]->linear0.x);		// linear0 and geometric error
		Vec4V a0AndT = V4LoadA(&row[i]->angular0.x);	// angular0 and velocity target

		Vec3V l1 = V3FromV4(V4LoadA(&row[i]->linear1.x));
		Vec3V a1 = V3FromV4(V4LoadA(&row[i]->angular1.x));

		Vec4V angSqrtL0 = V4LoadA(&angSqrtInvInertia0[i].x);
		Vec4V angSqrtL1 = V4LoadA(&angSqrtInvInertia1[i].x);

		PxU32 eliminationRows = PxMin<PxU32>(i, eqRowCount);
		for(PxU32 j=0;j<eliminationRows;j++)
		{
			const Vec3V s0 = V3MulAdd(l1, lin1m[j], V3FromV4Unsafe(V4Mul(l0AndG, lin0m[j])));
			const Vec3V s1 = V3MulAdd(V3FromV4Unsafe(angSqrtL1), ang1m[j], V3FromV4Unsafe(V4Mul(angSqrtL0, ang0m[j])));
			FloatV t = V3SumElems(V3Add(s0, s1));

			l0AndG = V4NegScaleSub(lin0AndG[j], t, l0AndG);
			a0AndT = V4NegScaleSub(ang0AndT[j], t, a0AndT);
			l1 = V3NegScaleSub(lin1[j], t, l1);
			a1 = V3NegScaleSub(ang1[j], t, a1);
			angSqrtL0 = V4NegScaleSub(V4LoadA(&angSqrtInvInertia0[j].x), t, angSqrtL0);
			angSqrtL1 = V4NegScaleSub(V4LoadA(&angSqrtInvInertia1[j].x), t, angSqrtL1);
		}

		V4StoreA(l0AndG, &row[i]->linear0.x);
		V4StoreA(a0AndT, &row[i]->angular0.x);
		V3StoreA(l1, row[i]->linear1);
		V3StoreA(a1, row[i]->angular1);
		V4StoreA(angSqrtL0, &angSqrtInvInertia0[i].x);
		V4StoreA(angSqrtL1, &angSqrtInvInertia1[i].x);

		if(i<eqRowCount)
		{
			lin0AndG[i] = l0AndG;	
			ang0AndT[i] = a0AndT;
			lin1[i] = l1;	
			ang1[i] = a1;	
			
			const Vec3V l0 = V3FromV4(l0AndG);

			const Vec3V l0m = V3Scale(l0, m.invMass0);
			const Vec3V l1m = V3Scale(l1, m.invMass1);
			const Vec4V a0m = V4Scale(angSqrtL0, m.invInertiaScale0);
			const Vec4V a1m = V4Scale(angSqrtL1, m.invInertiaScale1);

			const Vec3V s0 = V3MulAdd(l0, l0m, V3Mul(l1, l1m));
			const Vec4V s1 = V4MulAdd(a0m, angSqrtL0, V4Mul(a1m, angSqrtL1));
			const FloatV s = V3SumElems(V3Add(s0, V3FromV4Unsafe(s1)));
			const FloatV a = FSel(FIsGrtr(s, zero), FRecip(s), zero);	// with mass scaling, it's possible for the inner product of a row to be zero

			lin0m[i] = V4Scale(V4ClearW(V4FromV3(l0m)), a);	
			ang0m[i] = V4Scale(V4ClearW(a0m), a);
			lin1m[i] = V3Scale(l1m, a);
			ang1m[i] = V3Scale(V3FromV4Unsafe(a1m), a);
		}
	}
}
}


void preprocessRows(Px1DConstraint** sorted, 
					Px1DConstraint* rows,
					PxVec4* angSqrtInvInertia0,
					PxVec4* angSqrtInvInertia1,
					PxU32 rowCount,
					const PxSolverBodyData& bd0,
					const PxSolverBodyData& bd1,
					const PxConstraintInvMassScale& ims,
					bool disablePreprocessing,
					bool diagonalizeDrive)
{
	// j is maxed at 12, typically around 7, so insertion sort is fine
	for(PxU32 i=0; i<rowCount; i++)
	{
		Px1DConstraint* r = rows+i;
		
		PxU32 j = i;
		for(;j>0 && r->solveHint < sorted[j-1]->solveHint; j--)
			sorted[j] = sorted[j-1];

		sorted[j] = r;
	}

	for(PxU32 i=0;i<rowCount-1;i++)
		PX_ASSERT(sorted[i]->solveHint <= sorted[i+1]->solveHint);

	for (PxU32 i = 0; i<rowCount; i++)
		rows[i].forInternalUse = rows[i].flags & Px1DConstraintFlag::eKEEPBIAS ? rows[i].geometricError : 0;


	const Mat33V sqrtInvInertia0 = Mat33V(V3LoadU(bd0.sqrtInvInertia.column0), V3LoadU(bd0.sqrtInvInertia.column1),
		V3LoadU(bd0.sqrtInvInertia.column2));

	const Mat33V sqrtInvInertia1 = Mat33V(V3LoadU(bd1.sqrtInvInertia.column0), V3LoadU(bd1.sqrtInvInertia.column1),
		V3LoadU(bd1.sqrtInvInertia.column2));

	PX_ASSERT(((uintptr_t(angSqrtInvInertia0)) & 0xF) == 0);
	PX_ASSERT(((uintptr_t(angSqrtInvInertia1)) & 0xF) == 0);

	for(PxU32 i = 0; i < rowCount; ++i)
	{
		const Vec3V angDelta0 = M33MulV3(sqrtInvInertia0, V3LoadU(sorted[i]->angular0));
		const Vec3V angDelta1 = M33MulV3(sqrtInvInertia1, V3LoadU(sorted[i]->angular1));
		V4StoreA(Vec4V_From_Vec3V(angDelta0), &angSqrtInvInertia0[i].x);
		V4StoreA(Vec4V_From_Vec3V(angDelta1), &angSqrtInvInertia1[i].x);
	}

	if(disablePreprocessing)
		return;

	MassProps m(bd0, bd1, ims);
	for(PxU32 i=0;i<rowCount;)
	{
		const PxU32 groupMajorId = PxU32(sorted[i]->solveHint>>8), start = i++;
		while(i<rowCount && PxU32(sorted[i]->solveHint>>8) == groupMajorId)
			i++;

		if(groupMajorId == 4)
		{
			PxU32 bCount = start;		// count of bilateral constraints 
			for(; bCount<i && (sorted[bCount]->solveHint&255)==0; bCount++)
				;
			orthogonalize(sorted+start, angSqrtInvInertia0+start, angSqrtInvInertia1+start, i-start, bCount-start, m);
		}

		if(groupMajorId == 1 && diagonalizeDrive)
		{			
			PxU32 slerp = start;		// count of bilateral constraints 
			for(; slerp<i && (sorted[slerp]->solveHint&255)!=2; slerp++)
				;
			if(slerp+3 == i)
				diagonalize(sorted+slerp, angSqrtInvInertia0+slerp, angSqrtInvInertia1+slerp, m);

			PX_ASSERT(i-start==3);
			diagonalize(sorted+start, angSqrtInvInertia0+start, angSqrtInvInertia1+start, m);
		}
	}
}


PxU32 ConstraintHelper::setupSolverConstraint(
PxSolverConstraintPrepDesc& prepDesc,
PxConstraintAllocator& allocator,
PxReal dt, PxReal invdt)
{
	if (prepDesc.numRows == 0)
	{
		prepDesc.desc->constraint = NULL;
		prepDesc.desc->writeBack = NULL;
		prepDesc.desc->constraintLengthOver16 = 0;
		prepDesc.desc->writeBackLengthOver4 = 0;
		return 0;
	}

	PxSolverConstraintDesc& desc = *prepDesc.desc;

	bool isExtended = desc.linkIndexA != PxSolverConstraintDesc::NO_LINK
		|| desc.linkIndexB != PxSolverConstraintDesc::NO_LINK;

	PxU32 stride = isExtended ? sizeof(SolverConstraint1DExt) : sizeof(SolverConstraint1D);
	const PxU32 constraintLength = sizeof(SolverConstraint1DHeader) + stride * prepDesc.numRows;
	
	//KS - +16 is for the constraint progress counter, which needs to be the last element in the constraint (so that we
	//know SPU DMAs have completed)
	PxU8* ptr = allocator.reserveConstraintData(constraintLength + 16u);
	if(NULL == ptr || (reinterpret_cast<PxU8*>(-1))==ptr)
	{
		if(NULL==ptr)
		{
			PX_WARN_ONCE(
				"Reached limit set by PxSceneDesc::maxNbContactDataBlocks - ran out of buffer space for constraint prep. "
				"Either accept joints detaching/exploding or increase buffer size allocated for constraint prep by increasing PxSceneDesc::maxNbContactDataBlocks.");
			return 0;
		}
		else
		{
			PX_WARN_ONCE(
				"Attempting to allocate more than 16K of constraint data. "
				"Either accept joints detaching/exploding or simplify constraints.");
			ptr=NULL;
			return 0;
		}
	}
	desc.constraint = ptr;

	setConstraintLength(desc,constraintLength);

	desc.writeBack = prepDesc.writeback;
	setWritebackLength(desc, sizeof(ConstraintWriteback));

	memset(desc.constraint, 0, constraintLength);

	SolverConstraint1DHeader* header = reinterpret_cast<SolverConstraint1DHeader*>(desc.constraint);
	PxU8* constraints = desc.constraint + sizeof(SolverConstraint1DHeader);
	init(*header, Ps::to8(prepDesc.numRows), isExtended, prepDesc.mInvMassScales);
	header->body0WorldOffset = prepDesc.body0WorldOffset;
	header->linBreakImpulse = prepDesc.linBreakForce * dt;
	header->angBreakImpulse = prepDesc.angBreakForce * dt;
	header->breakable = PxU8((prepDesc.linBreakForce != PX_MAX_F32) || (prepDesc.angBreakForce != PX_MAX_F32));
	header->invMass0D0 = prepDesc.data0->invMass * prepDesc.mInvMassScales.linear0;
	header->invMass1D1 = prepDesc.data1->invMass * prepDesc.mInvMassScales.linear1;


	PX_ALIGN(16, PxVec4) angSqrtInvInertia0[MAX_CONSTRAINT_ROWS];
	PX_ALIGN(16, PxVec4) angSqrtInvInertia1[MAX_CONSTRAINT_ROWS];
	
	Px1DConstraint* sorted[MAX_CONSTRAINT_ROWS];

	preprocessRows(sorted, prepDesc.rows, angSqrtInvInertia0, angSqrtInvInertia1, prepDesc.numRows, *prepDesc.data0, *prepDesc.data1, prepDesc.mInvMassScales,
		isExtended || prepDesc.disablePreprocessing, prepDesc.improvedSlerp);

	const PxReal erp = 1.0f;
	for (PxU32 i = 0; i<prepDesc.numRows; i++)
	{
		Ps::prefetchLine(constraints, 128);
		SolverConstraint1D &s = *reinterpret_cast<SolverConstraint1D *>(constraints);
		Px1DConstraint& c = *sorted[i];

		PxReal driveScale = c.flags&Px1DConstraintFlag::eHAS_DRIVE_LIMIT && prepDesc.driveLimitsAreForces ? PxMin(dt, 1.0f) : 1.0f;

		PxReal unitResponse;
		PxReal normalVel = 0.0f;
		PxReal initVel = 0.f;

		if(!isExtended)
		{
			init(s, c.linear0, c.linear1, PxVec3(angSqrtInvInertia0[i].x, angSqrtInvInertia0[i].y, angSqrtInvInertia0[i].z),
				PxVec3(angSqrtInvInertia1[i].x, angSqrtInvInertia1[i].y, angSqrtInvInertia1[i].z), c.minImpulse * driveScale, c.maxImpulse * driveScale);
			s.ang0Writeback = c.angular0;
			PxReal resp0 = s.lin0.magnitudeSquared() * prepDesc.data0->invMass * prepDesc.mInvMassScales.linear0 + s.ang0.magnitudeSquared() * prepDesc.mInvMassScales.angular0;
			PxReal resp1 = s.lin1.magnitudeSquared() * prepDesc.data1->invMass * prepDesc.mInvMassScales.linear1 + s.ang1.magnitudeSquared() * prepDesc.mInvMassScales.angular1;
			unitResponse = resp0 + resp1;
			initVel = normalVel = prepDesc.data0->projectVelocity(c.linear0, c.angular0) - prepDesc.data1->projectVelocity(c.linear1, c.angular1);
		}
		else
		{
			init(s, c.linear0, c.linear1, c.angular0, c.angular1, c.minImpulse * driveScale, c.maxImpulse * driveScale);
			SolverConstraint1DExt& e = static_cast<SolverConstraint1DExt&>(s);

			const SolverExtBody eb0(reinterpret_cast<const void*>(prepDesc.body0), prepDesc.data0, desc.linkIndexA);
			const SolverExtBody eb1(reinterpret_cast<const void*>(prepDesc.body1), prepDesc.data1, desc.linkIndexB);

			const Cm::SpatialVector resp0 = createImpulseResponseVector(e.lin0, e.ang0, eb0);
			const Cm::SpatialVector resp1 = createImpulseResponseVector(-e.lin1, -e.ang1, eb1);
			unitResponse = getImpulseResponse(eb0, resp0, unsimdRef(e.deltaVA), prepDesc.mInvMassScales.linear0, prepDesc.mInvMassScales.angular0,
				eb1, resp1, unsimdRef(e.deltaVB), prepDesc.mInvMassScales.linear1, prepDesc.mInvMassScales.angular1, true);

			s.ang0Writeback = c.angular0;
			s.lin0 = resp0.linear;
			s.ang0 = resp0.angular;
			s.lin1 = -resp1.linear;
			s.ang1 = -resp1.angular;
			PxReal vel0, vel1;
			if(needsNormalVel(c) || eb0.mLinkIndex == PxSolverConstraintDesc::NO_LINK || eb1.mLinkIndex == PxSolverConstraintDesc::NO_LINK)
			{
				vel0 = eb0.projectVelocity(c.linear0, c.angular0);
				vel1 = eb1.projectVelocity(c.linear1, c.angular1);

				normalVel = vel0 - vel1;

				//normalVel = eb0.projectVelocity(s.lin0, s.ang0) - eb1.projectVelocity(s.lin1, s.ang1);
				if(eb0.mLinkIndex == PxSolverConstraintDesc::NO_LINK)
					initVel = vel0;
				else if(eb1.mLinkIndex == PxSolverConstraintDesc::NO_LINK)
					initVel = -vel1;

			}
		}

		setSolverConstants(s.constant, s.unbiasedConstant, s.velMultiplier, s.impulseMultiplier, 
			c, normalVel, unitResponse, prepDesc.minResponseThreshold, erp, dt, invdt);

		//s.targetVelocity = initVel;
		const PxReal velBias = initVel * s.velMultiplier;
		s.constant += velBias;
		s.unbiasedConstant += velBias;

		if(c.flags & Px1DConstraintFlag::eOUTPUT_FORCE)
			s.flags |= DY_SC_FLAG_OUTPUT_FORCE;

		constraints += stride;
	}

	//KS - Set the solve count at the end to 0 
	*(reinterpret_cast<PxU32*>(constraints)) = 0;
	*(reinterpret_cast<PxU32*>(constraints + 4)) = 0;
	PX_ASSERT(desc.constraint + getConstraintLength(desc) == constraints);
	return prepDesc.numRows;
}

PxU32 SetupSolverConstraint(SolverConstraintShaderPrepDesc& shaderDesc,
	PxSolverConstraintPrepDesc& prepDesc,
	PxConstraintAllocator& allocator,
	PxReal dt, PxReal invdt)
{
	// LL shouldn't see broken constraints
	
	PX_ASSERT(!(reinterpret_cast<ConstraintWriteback*>(prepDesc.writeback)->broken));

	setConstraintLength(*prepDesc.desc, 0);

	if (!shaderDesc.solverPrep)
		return 0;

	//PxU32 numAxisConstraints = 0;

	Px1DConstraint rows[MAX_CONSTRAINT_ROWS];

	// This is necessary so that there will be sensible defaults and shaders will
	// continue to work (albeit with a recompile) if the row format changes.
	// It's a bit inefficient because it fills in all constraint rows even if there
	// is only going to be one generated. A way around this would be for the shader to
	// specify the maximum number of rows it needs, or it could call a subroutine to
	// prep the row before it starts filling it it.

	PxMemZero(rows, sizeof(Px1DConstraint)*MAX_CONSTRAINT_ROWS);

	for (PxU32 i = 0; i<MAX_CONSTRAINT_ROWS; i++)
	{
		Px1DConstraint& c = rows[i];
		//Px1DConstraintInit(c);
		c.minImpulse = -PX_MAX_REAL;
		c.maxImpulse = PX_MAX_REAL;
	}

	prepDesc.mInvMassScales.linear0 = prepDesc.mInvMassScales.linear1 = prepDesc.mInvMassScales.angular0 = prepDesc.mInvMassScales.angular1 = 1.f;

	PxVec3 body0WorldOffset(0.f);
	PxU32 constraintCount = (*shaderDesc.solverPrep)(rows,
		body0WorldOffset,
		MAX_CONSTRAINT_ROWS,
		prepDesc.mInvMassScales,
		shaderDesc.constantBlock,
		prepDesc.bodyFrame0, prepDesc.bodyFrame1);

	prepDesc.rows = rows;
	prepDesc.numRows = constraintCount;

	prepDesc.body0WorldOffset = body0WorldOffset;

	return ConstraintHelper::setupSolverConstraint(prepDesc, allocator, dt, invdt);
}

}

}