// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions // are met: // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // * Neither the name of NVIDIA CORPORATION nor the names of its // contributors may be used to endorse or promote products derived // from this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // // Copyright (c) 2008-2018 NVIDIA Corporation. All rights reserved. // Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. // Copyright (c) 2001-2004 NovodeX AG. All rights reserved. #include "foundation/PxMemory.h" #include "DyConstraintPrep.h" #include "PxsRigidBody.h" #include "DySolverConstraint1D.h" #include "PsSort.h" #include "DySolverConstraintDesc.h" #include "PxcConstraintBlockStream.h" #include "DyArticulationContactPrep.h" #include "PsFoundation.h" namespace physx { namespace Dy { // dsequeira: // // we can choose any linear combination of equality constraints and get the same solution // Hence we can orthogonalize the constraints using the inner product given by the // inverse mass matrix, so that when we use PGS, solving a constraint row for a joint // don't disturb the solution of prior rows. // // We also eliminate the equality constraints from the hard inequality constraints - // (essentially projecting the direction corresponding to the lagrange multiplier // onto the equality constraint subspace) but 'til I've verified this generates // exactly the same KKT/complementarity conditions, status is 'experimental'. // // since for equality constraints the resulting rows have the property that applying // an impulse along one row doesn't alter the projected velocity along another row, // all equality constraints (plus one inequality constraint) can be processed in parallel // using SIMD // // Eliminating the inequality constraints from each other would require a solver change // and not give us any more parallelism, although we might get better convergence. namespace { PX_FORCE_INLINE Vec3V V3FromV4(Vec4V x) { return Vec3V_From_Vec4V(x); } PX_FORCE_INLINE Vec3V V3FromV4Unsafe(Vec4V x) { return Vec3V_From_Vec4V_WUndefined(x); } PX_FORCE_INLINE Vec4V V4FromV3(Vec3V x) { return Vec4V_From_Vec3V(x); } //PX_FORCE_INLINE Vec4V V4ClearW(Vec4V x) { return V4SetW(x, FZero()); } struct MassProps { FloatV invMass0; FloatV invMass1; FloatV invInertiaScale0; FloatV invInertiaScale1; PX_FORCE_INLINE MassProps(const PxSolverBodyData& bd0, const PxSolverBodyData& bd1, const PxConstraintInvMassScale& ims) : invMass0(FLoad(bd0.invMass * ims.linear0)) , invMass1(FLoad(bd1.invMass * ims.linear1)) , invInertiaScale0(FLoad(ims.angular0)) , invInertiaScale1(FLoad(ims.angular1)) {} }; PX_FORCE_INLINE PxReal innerProduct(const Px1DConstraint& row0, Px1DConstraint& row1, PxVec4& row0AngSqrtInvInertia0, PxVec4& row0AngSqrtInvInertia1, PxVec4& row1AngSqrtInvInertia0, PxVec4& row1AngSqrtInvInertia1, const MassProps& m) { const Vec3V l0 = V3Mul(V3Scale(V3LoadA(row0.linear0), m.invMass0), V3LoadA(row1.linear0)); const Vec3V l1 = V3Mul(V3Scale(V3LoadA(row0.linear1), m.invMass1), V3LoadA(row1.linear1)); Vec4V r0ang0 = V4LoadA(&row0AngSqrtInvInertia0.x); Vec4V r1ang0 = V4LoadA(&row1AngSqrtInvInertia0.x); Vec4V r0ang1 = V4LoadA(&row0AngSqrtInvInertia1.x); Vec4V r1ang1 = V4LoadA(&row1AngSqrtInvInertia1.x); const Vec3V i0 = V3ScaleAdd(V3Mul(Vec3V_From_Vec4V(r0ang0), Vec3V_From_Vec4V(r1ang0)), m.invInertiaScale0, l0); const Vec3V i1 = V3ScaleAdd(V3MulAdd(Vec3V_From_Vec4V(r0ang1), Vec3V_From_Vec4V(r1ang1), i0), m.invInertiaScale1, l1); PxF32 f; FStore(V3SumElems(i1), &f); return f; } // indexed rotation around axis, with sine and cosine of half-angle PX_FORCE_INLINE PxQuat indexedRotation(PxU32 axis, PxReal s, PxReal c) { PxQuat q(0,0,0,c); reinterpret_cast(&q)[axis] = s; return q; } PxQuat diagonalize(const PxMat33& m) // jacobi rotation using quaternions { const PxU32 MAX_ITERS = 5; PxQuat q = PxQuat(PxIdentity); PxMat33 d; for(PxU32 i=0; i < MAX_ITERS;i++) { const PxMat33 axes(q); d = axes.getTranspose() * m * axes; const PxReal d0 = PxAbs(d[1][2]), d1 = PxAbs(d[0][2]), d2 = PxAbs(d[0][1]); const PxU32 a = PxU32(d0 > d1 && d0 > d2 ? 0 : d1 > d2 ? 1 : 2); // rotation axis index, from largest off-diagonal element const PxU32 a1 = Ps::getNextIndex3(a), a2 = Ps::getNextIndex3(a1); if(d[a1][a2] == 0.0f || PxAbs(d[a1][a1]-d[a2][a2]) > 2e6f*PxAbs(2.0f*d[a1][a2])) break; const PxReal w = (d[a1][a1]-d[a2][a2]) / (2.0f*d[a1][a2]); // cot(2 * phi), where phi is the rotation angle const PxReal absw = PxAbs(w); PxQuat r; if(absw>1000) r = indexedRotation(a, 1.0f/(4.0f*w), 1.f); // h will be very close to 1, so use small angle approx instead else { const PxReal t = 1 / (absw + PxSqrt(w*w+1)); // absolute value of tan phi const PxReal h = 1 / PxSqrt(t*t+1); // absolute value of cos phi PX_ASSERT(h!=1); // |w|<1000 guarantees this with typical IEEE754 machine eps (approx 6e-8) r = indexedRotation(a, PxSqrt((1-h)/2) * PxSign(w), PxSqrt((1+h)/2)); } q = (q*r).getNormalized(); } return q; } PX_FORCE_INLINE void rescale(const Mat33V& m, PxVec3& a0, PxVec3& a1, PxVec3& a2) { const Vec3V va0 = V3LoadU(a0); const Vec3V va1 = V3LoadU(a1); const Vec3V va2 = V3LoadU(a2); const Vec3V b0 = V3ScaleAdd(va0, V3GetX(m.col0), V3ScaleAdd(va1, V3GetY(m.col0), V3Scale(va2, V3GetZ(m.col0)))); const Vec3V b1 = V3ScaleAdd(va0, V3GetX(m.col1), V3ScaleAdd(va1, V3GetY(m.col1), V3Scale(va2, V3GetZ(m.col1)))); const Vec3V b2 = V3ScaleAdd(va0, V3GetX(m.col2), V3ScaleAdd(va1, V3GetY(m.col2), V3Scale(va2, V3GetZ(m.col2)))); V3StoreU(b0, a0); V3StoreU(b1, a1); V3StoreU(b2, a2); } PX_FORCE_INLINE void rescale4(const Mat33V& m, PxReal* a0, PxReal* a1, PxReal* a2) { const Vec4V va0 = V4LoadA(a0); const Vec4V va1 = V4LoadA(a1); const Vec4V va2 = V4LoadA(a2); const Vec4V b0 = V4ScaleAdd(va0, V3GetX(m.col0), V4ScaleAdd(va1, V3GetY(m.col0), V4Scale(va2, V3GetZ(m.col0)))); const Vec4V b1 = V4ScaleAdd(va0, V3GetX(m.col1), V4ScaleAdd(va1, V3GetY(m.col1), V4Scale(va2, V3GetZ(m.col1)))); const Vec4V b2 = V4ScaleAdd(va0, V3GetX(m.col2), V4ScaleAdd(va1, V3GetY(m.col2), V4Scale(va2, V3GetZ(m.col2)))); V4StoreA(b0, a0); V4StoreA(b1, a1); V4StoreA(b2, a2); } void diagonalize(Px1DConstraint** row, PxVec4* angSqrtInvInertia0, PxVec4* angSqrtInvInertia1, const MassProps &m) { PxReal a00 = innerProduct(*row[0], *row[0], angSqrtInvInertia0[0], angSqrtInvInertia1[0], angSqrtInvInertia0[0], angSqrtInvInertia1[0], m); PxReal a01 = innerProduct(*row[0], *row[1], angSqrtInvInertia0[0], angSqrtInvInertia1[0], angSqrtInvInertia0[1], angSqrtInvInertia1[1], m); PxReal a02 = innerProduct(*row[0], *row[2], angSqrtInvInertia0[0], angSqrtInvInertia1[0], angSqrtInvInertia0[2], angSqrtInvInertia1[2], m); PxReal a11 = innerProduct(*row[1], *row[1], angSqrtInvInertia0[1], angSqrtInvInertia1[1], angSqrtInvInertia0[1], angSqrtInvInertia1[1], m); PxReal a12 = innerProduct(*row[1], *row[2], angSqrtInvInertia0[1], angSqrtInvInertia1[1], angSqrtInvInertia0[2], angSqrtInvInertia1[2], m); PxReal a22 = innerProduct(*row[2], *row[2], angSqrtInvInertia0[2], angSqrtInvInertia1[2], angSqrtInvInertia0[2], angSqrtInvInertia1[2], m); PxMat33 a(PxVec3(a00, a01, a02), PxVec3(a01, a11, a12), PxVec3(a02, a12, a22)); PxQuat q = diagonalize(a); PxMat33 n(-q); Mat33V mn(V3LoadU(n.column0), V3LoadU(n.column1), V3LoadU(n.column2)); //KS - We treat as a Vec4V so that we get geometricError rescaled for free along with linear0 rescale4(mn, &row[0]->linear0.x, &row[1]->linear0.x, &row[2]->linear0.x); rescale(mn, row[0]->linear1, row[1]->linear1, row[2]->linear1); //KS - We treat as a PxVec4 so that we get velocityTarget rescaled for free rescale4(mn, &row[0]->angular0.x, &row[1]->angular0.x, &row[2]->angular0.x); rescale(mn, row[0]->angular1, row[1]->angular1, row[2]->angular1); rescale4(mn, &angSqrtInvInertia0[0].x, &angSqrtInvInertia0[1].x, &angSqrtInvInertia0[2].x); rescale4(mn, &angSqrtInvInertia1[0].x, &angSqrtInvInertia1[1].x, &angSqrtInvInertia1[2].x); } void orthogonalize(Px1DConstraint** row, PxVec4* angSqrtInvInertia0, PxVec4* angSqrtInvInertia1, PxU32 rowCount, PxU32 eqRowCount, const MassProps &m) { PX_ASSERT(eqRowCount<=6); const FloatV zero = FZero(); Vec3V lin1m[6], ang1m[6], lin1[6], ang1[6]; Vec4V lin0m[6], ang0m[6]; // must have 0 in the W-field Vec4V lin0AndG[6], ang0AndT[6]; for(PxU32 i=0;ilinear0.x); // linear0 and geometric error Vec4V a0AndT = V4LoadA(&row[i]->angular0.x); // angular0 and velocity target Vec3V l1 = V3FromV4(V4LoadA(&row[i]->linear1.x)); Vec3V a1 = V3FromV4(V4LoadA(&row[i]->angular1.x)); Vec4V angSqrtL0 = V4LoadA(&angSqrtInvInertia0[i].x); Vec4V angSqrtL1 = V4LoadA(&angSqrtInvInertia1[i].x); PxU32 eliminationRows = PxMin(i, eqRowCount); for(PxU32 j=0;jlinear0.x); V4StoreA(a0AndT, &row[i]->angular0.x); V3StoreA(l1, row[i]->linear1); V3StoreA(a1, row[i]->angular1); V4StoreA(angSqrtL0, &angSqrtInvInertia0[i].x); V4StoreA(angSqrtL1, &angSqrtInvInertia1[i].x); if(i0 && r->solveHint < sorted[j-1]->solveHint; j--) sorted[j] = sorted[j-1]; sorted[j] = r; } for(PxU32 i=0;isolveHint <= sorted[i+1]->solveHint); for (PxU32 i = 0; iangular0)); const Vec3V angDelta1 = M33MulV3(sqrtInvInertia1, V3LoadU(sorted[i]->angular1)); V4StoreA(Vec4V_From_Vec3V(angDelta0), &angSqrtInvInertia0[i].x); V4StoreA(Vec4V_From_Vec3V(angDelta1), &angSqrtInvInertia1[i].x); } if(disablePreprocessing) return; MassProps m(bd0, bd1, ims); for(PxU32 i=0;isolveHint>>8), start = i++; while(isolveHint>>8) == groupMajorId) i++; if(groupMajorId == 4) { PxU32 bCount = start; // count of bilateral constraints for(; bCountsolveHint&255)==0; bCount++) ; orthogonalize(sorted+start, angSqrtInvInertia0+start, angSqrtInvInertia1+start, i-start, bCount-start, m); } if(groupMajorId == 1 && diagonalizeDrive) { PxU32 slerp = start; // count of bilateral constraints for(; slerpsolveHint&255)!=2; slerp++) ; if(slerp+3 == i) diagonalize(sorted+slerp, angSqrtInvInertia0+slerp, angSqrtInvInertia1+slerp, m); PX_ASSERT(i-start==3); diagonalize(sorted+start, angSqrtInvInertia0+start, angSqrtInvInertia1+start, m); } } } PxU32 ConstraintHelper::setupSolverConstraint( PxSolverConstraintPrepDesc& prepDesc, PxConstraintAllocator& allocator, PxReal dt, PxReal invdt) { if (prepDesc.numRows == 0) { prepDesc.desc->constraint = NULL; prepDesc.desc->writeBack = NULL; prepDesc.desc->constraintLengthOver16 = 0; prepDesc.desc->writeBackLengthOver4 = 0; return 0; } PxSolverConstraintDesc& desc = *prepDesc.desc; bool isExtended = desc.linkIndexA != PxSolverConstraintDesc::NO_LINK || desc.linkIndexB != PxSolverConstraintDesc::NO_LINK; PxU32 stride = isExtended ? sizeof(SolverConstraint1DExt) : sizeof(SolverConstraint1D); const PxU32 constraintLength = sizeof(SolverConstraint1DHeader) + stride * prepDesc.numRows; //KS - +16 is for the constraint progress counter, which needs to be the last element in the constraint (so that we //know SPU DMAs have completed) PxU8* ptr = allocator.reserveConstraintData(constraintLength + 16u); if(NULL == ptr || (reinterpret_cast(-1))==ptr) { if(NULL==ptr) { PX_WARN_ONCE( "Reached limit set by PxSceneDesc::maxNbContactDataBlocks - ran out of buffer space for constraint prep. " "Either accept joints detaching/exploding or increase buffer size allocated for constraint prep by increasing PxSceneDesc::maxNbContactDataBlocks."); return 0; } else { PX_WARN_ONCE( "Attempting to allocate more than 16K of constraint data. " "Either accept joints detaching/exploding or simplify constraints."); ptr=NULL; return 0; } } desc.constraint = ptr; setConstraintLength(desc,constraintLength); desc.writeBack = prepDesc.writeback; setWritebackLength(desc, sizeof(ConstraintWriteback)); memset(desc.constraint, 0, constraintLength); SolverConstraint1DHeader* header = reinterpret_cast(desc.constraint); PxU8* constraints = desc.constraint + sizeof(SolverConstraint1DHeader); init(*header, Ps::to8(prepDesc.numRows), isExtended, prepDesc.mInvMassScales); header->body0WorldOffset = prepDesc.body0WorldOffset; header->linBreakImpulse = prepDesc.linBreakForce * dt; header->angBreakImpulse = prepDesc.angBreakForce * dt; header->breakable = PxU8((prepDesc.linBreakForce != PX_MAX_F32) || (prepDesc.angBreakForce != PX_MAX_F32)); header->invMass0D0 = prepDesc.data0->invMass * prepDesc.mInvMassScales.linear0; header->invMass1D1 = prepDesc.data1->invMass * prepDesc.mInvMassScales.linear1; PX_ALIGN(16, PxVec4) angSqrtInvInertia0[MAX_CONSTRAINT_ROWS]; PX_ALIGN(16, PxVec4) angSqrtInvInertia1[MAX_CONSTRAINT_ROWS]; Px1DConstraint* sorted[MAX_CONSTRAINT_ROWS]; preprocessRows(sorted, prepDesc.rows, angSqrtInvInertia0, angSqrtInvInertia1, prepDesc.numRows, *prepDesc.data0, *prepDesc.data1, prepDesc.mInvMassScales, isExtended || prepDesc.disablePreprocessing, prepDesc.improvedSlerp); const PxReal erp = 1.0f; for (PxU32 i = 0; i(constraints); Px1DConstraint& c = *sorted[i]; PxReal driveScale = c.flags&Px1DConstraintFlag::eHAS_DRIVE_LIMIT && prepDesc.driveLimitsAreForces ? PxMin(dt, 1.0f) : 1.0f; PxReal unitResponse; PxReal normalVel = 0.0f; PxReal initVel = 0.f; if(!isExtended) { init(s, c.linear0, c.linear1, PxVec3(angSqrtInvInertia0[i].x, angSqrtInvInertia0[i].y, angSqrtInvInertia0[i].z), PxVec3(angSqrtInvInertia1[i].x, angSqrtInvInertia1[i].y, angSqrtInvInertia1[i].z), c.minImpulse * driveScale, c.maxImpulse * driveScale); s.ang0Writeback = c.angular0; PxReal resp0 = s.lin0.magnitudeSquared() * prepDesc.data0->invMass * prepDesc.mInvMassScales.linear0 + s.ang0.magnitudeSquared() * prepDesc.mInvMassScales.angular0; PxReal resp1 = s.lin1.magnitudeSquared() * prepDesc.data1->invMass * prepDesc.mInvMassScales.linear1 + s.ang1.magnitudeSquared() * prepDesc.mInvMassScales.angular1; unitResponse = resp0 + resp1; initVel = normalVel = prepDesc.data0->projectVelocity(c.linear0, c.angular0) - prepDesc.data1->projectVelocity(c.linear1, c.angular1); } else { init(s, c.linear0, c.linear1, c.angular0, c.angular1, c.minImpulse * driveScale, c.maxImpulse * driveScale); SolverConstraint1DExt& e = static_cast(s); const SolverExtBody eb0(reinterpret_cast(prepDesc.body0), prepDesc.data0, desc.linkIndexA); const SolverExtBody eb1(reinterpret_cast(prepDesc.body1), prepDesc.data1, desc.linkIndexB); const Cm::SpatialVector resp0 = createImpulseResponseVector(e.lin0, e.ang0, eb0); const Cm::SpatialVector resp1 = createImpulseResponseVector(-e.lin1, -e.ang1, eb1); unitResponse = getImpulseResponse(eb0, resp0, unsimdRef(e.deltaVA), prepDesc.mInvMassScales.linear0, prepDesc.mInvMassScales.angular0, eb1, resp1, unsimdRef(e.deltaVB), prepDesc.mInvMassScales.linear1, prepDesc.mInvMassScales.angular1, true); s.ang0Writeback = c.angular0; s.lin0 = resp0.linear; s.ang0 = resp0.angular; s.lin1 = -resp1.linear; s.ang1 = -resp1.angular; PxReal vel0, vel1; if(needsNormalVel(c) || eb0.mLinkIndex == PxSolverConstraintDesc::NO_LINK || eb1.mLinkIndex == PxSolverConstraintDesc::NO_LINK) { vel0 = eb0.projectVelocity(c.linear0, c.angular0); vel1 = eb1.projectVelocity(c.linear1, c.angular1); normalVel = vel0 - vel1; //normalVel = eb0.projectVelocity(s.lin0, s.ang0) - eb1.projectVelocity(s.lin1, s.ang1); if(eb0.mLinkIndex == PxSolverConstraintDesc::NO_LINK) initVel = vel0; else if(eb1.mLinkIndex == PxSolverConstraintDesc::NO_LINK) initVel = -vel1; } } setSolverConstants(s.constant, s.unbiasedConstant, s.velMultiplier, s.impulseMultiplier, c, normalVel, unitResponse, prepDesc.minResponseThreshold, erp, dt, invdt); //s.targetVelocity = initVel; const PxReal velBias = initVel * s.velMultiplier; s.constant += velBias; s.unbiasedConstant += velBias; if(c.flags & Px1DConstraintFlag::eOUTPUT_FORCE) s.flags |= DY_SC_FLAG_OUTPUT_FORCE; constraints += stride; } //KS - Set the solve count at the end to 0 *(reinterpret_cast(constraints)) = 0; *(reinterpret_cast(constraints + 4)) = 0; PX_ASSERT(desc.constraint + getConstraintLength(desc) == constraints); return prepDesc.numRows; } PxU32 SetupSolverConstraint(SolverConstraintShaderPrepDesc& shaderDesc, PxSolverConstraintPrepDesc& prepDesc, PxConstraintAllocator& allocator, PxReal dt, PxReal invdt) { // LL shouldn't see broken constraints PX_ASSERT(!(reinterpret_cast(prepDesc.writeback)->broken)); setConstraintLength(*prepDesc.desc, 0); if (!shaderDesc.solverPrep) return 0; //PxU32 numAxisConstraints = 0; Px1DConstraint rows[MAX_CONSTRAINT_ROWS]; // This is necessary so that there will be sensible defaults and shaders will // continue to work (albeit with a recompile) if the row format changes. // It's a bit inefficient because it fills in all constraint rows even if there // is only going to be one generated. A way around this would be for the shader to // specify the maximum number of rows it needs, or it could call a subroutine to // prep the row before it starts filling it it. PxMemZero(rows, sizeof(Px1DConstraint)*MAX_CONSTRAINT_ROWS); for (PxU32 i = 0; i