aboutsummaryrefslogtreecommitdiff
path: root/PhysX_3.4/Source/LowLevelParticles/src/PtDynamicsKernels.h
diff options
context:
space:
mode:
authorgit perforce import user <a@b>2016-10-25 12:29:14 -0600
committerSheikh Dawood Abdul Ajees <Sheikh Dawood Abdul Ajees>2016-10-25 18:56:37 -0500
commit3dfe2108cfab31ba3ee5527e217d0d8e99a51162 (patch)
treefa6485c169e50d7415a651bf838f5bcd0fd3bfbd /PhysX_3.4/Source/LowLevelParticles/src/PtDynamicsKernels.h
downloadphysx-3.4-3dfe2108cfab31ba3ee5527e217d0d8e99a51162.tar.xz
physx-3.4-3dfe2108cfab31ba3ee5527e217d0d8e99a51162.zip
Initial commit:
PhysX 3.4.0 Update @ 21294896 APEX 1.4.0 Update @ 21275617 [CL 21300167]
Diffstat (limited to 'PhysX_3.4/Source/LowLevelParticles/src/PtDynamicsKernels.h')
-rw-r--r--PhysX_3.4/Source/LowLevelParticles/src/PtDynamicsKernels.h1105
1 files changed, 1105 insertions, 0 deletions
diff --git a/PhysX_3.4/Source/LowLevelParticles/src/PtDynamicsKernels.h b/PhysX_3.4/Source/LowLevelParticles/src/PtDynamicsKernels.h
new file mode 100644
index 00000000..94494072
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelParticles/src/PtDynamicsKernels.h
@@ -0,0 +1,1105 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodesX AG. All rights reserved.
+
+#ifndef PT_DYNAMICS_KERNELS_H
+#define PT_DYNAMICS_KERNELS_H
+
+#include "PxPhysXConfig.h"
+#if PX_USE_PARTICLE_SYSTEM_API
+
+#include "PsFPU.h"
+#include "foundation/PxUnionCast.h"
+
+#include "PtDynamicsParameters.h"
+
+#define REFERENCE_KERNELS 0
+
+#if !REFERENCE_KERNELS
+#include "PsVecMath.h"
+#endif
+
+namespace physx
+{
+namespace Pt
+{
+
+using namespace Ps;
+using namespace aos;
+
+#define COMPILE_IN_SIMD_DENSITY 1
+#define PX_FORCE_INLINE_KERNELS PX_FORCE_INLINE
+
+#define MAX_INDEX_STREAM_SIZE 128
+#define PRESSURE_ORIGIN 1
+
+PX_FORCE_INLINE PxF32 calcDensity(const PxF32 distSqr, const DynamicsParameters& params)
+{
+ PxF32 dist2Std = distSqr * params.scaleSqToStd;
+ PxF32 radius2MinusDist2Std = params.radiusSqStd - dist2Std;
+ PxF32 densityStd = params.densityMultiplierStd * radius2MinusDist2Std * radius2MinusDist2Std * radius2MinusDist2Std;
+ return densityStd;
+}
+
+PX_FORCE_INLINE void addDensity(Particle& particleDst, const PxF32 distSqr, const DynamicsParameters& params)
+{
+ PX_ASSERT(distSqr <= params.cellSizeSq);
+ PxF32 densityStd = calcDensity(distSqr, params);
+ particleDst.density += densityStd;
+}
+
+PX_FORCE_INLINE void addDensity_twoWay(Particle& particleA, Particle& particleB, const PxF32 distSqr,
+ const DynamicsParameters& params)
+{
+ PX_ASSERT(distSqr <= params.cellSizeSq);
+ PxF32 densityStd = calcDensity(distSqr, params);
+ particleA.density += densityStd;
+ particleB.density += densityStd;
+}
+
+PX_FORCE_INLINE PxVec3 calcForce(const Particle& particleA, const Particle& particleB, const PxF32 distSqr,
+ const PxVec3& distVec, const DynamicsParameters& params)
+{
+ PxReal dist2Std = distSqr * params.scaleSqToStd;
+
+ PxReal recipDistStd = physx::intrinsics::recipSqrtFast(dist2Std);
+ PxReal distStd = dist2Std * recipDistStd;
+
+ PxReal radiusMinusDistStd = params.radiusStd - distStd;
+
+// pressure force
+#if PRESSURE_ORIGIN
+ PxF32 pressureA = PxMax(particleA.density - params.initialDensity, 0.0f);
+ PxF32 pressureB = PxMax(particleB.density - params.initialDensity, 0.0f);
+ PxF32 pressureSum = pressureA + pressureB;
+#else
+ PxF32 pressureSum = PxMax(particleA.density + particleB.density - 2 * params.initialDensity, 0.0f);
+#endif
+
+ PxReal multiplierPressStd = (params.radiusSqStd * recipDistStd - 2 * params.radiusStd + distStd) *
+ params.stiffMulPressureMultiplierStd * pressureSum;
+
+ PxVec3 force = distVec * multiplierPressStd * params.scaleToStd;
+
+ // viscosity force
+ PxReal multiplierViscStd = radiusMinusDistStd * params.viscosityMultiplierStd;
+
+ PxVec3 vDiff = (particleB.velocity - particleA.velocity) * params.scaleToStd;
+ force += (vDiff * multiplierViscStd);
+
+ return force;
+}
+
+PX_FORCE_INLINE void addForce(PxVec3& particleForceDst, const Particle& particleDst, const Particle& particleSrc,
+ const PxF32 distSqr, const PxVec3& distVec, const DynamicsParameters& params)
+{
+ PX_ASSERT(distSqr <= params.cellSizeSq);
+ PxVec3 force = calcForce(particleDst, particleSrc, distSqr, distVec, params);
+ particleForceDst += (force * physx::intrinsics::recipFast(particleSrc.density));
+}
+
+PX_FORCE_INLINE void addForce_twoWay(PxVec3& particleAForce, PxVec3& particleBForce, const Particle& particleA,
+ const Particle& particleB, const PxF32 distSqr, const PxVec3& distVec,
+ const DynamicsParameters& params)
+{
+ PX_ASSERT(distSqr <= params.cellSizeSq);
+ PxVec3 force = calcForce(particleA, particleB, distSqr, distVec, params);
+ particleAForce += (force * physx::intrinsics::recipFast(particleB.density));
+ particleBForce -= (force * physx::intrinsics::recipFast(particleA.density));
+}
+
+#if REFERENCE_KERNELS
+
+PX_FORCE_INLINE void updateParticleGroupPair(PxVec3* __restrict forceBufA, PxVec3* __restrict forceBufB,
+ Particle* __restrict particlesSpA, Particle* __restrict particlesSpB,
+ const PxU32* __restrict particleIndicesSpA, const PxU32 numParticlesA,
+ const PxU32* __restrict particleIndicesSpB, const PxU32 numParticlesB,
+ const bool twoWayUpdate, const bool isDensityMode,
+ const DynamicsParameters& params, PxU8* tempSimdPositionBuffer,
+ PxU32* tempIndexStream)
+{
+ // Check given particle against particles of another cell.
+
+ for(PxU32 pA = 0; pA < numParticlesA; pA++)
+ {
+ PxU32 idxA = particleIndicesSpA[pA];
+ Particle& particleA = particlesSpA[idxA];
+ PxVec3& forceA = forceBufA[idxA];
+
+ for(PxU32 pB = 0; pB < numParticlesB; pB++)
+ {
+ PxU32 idxB = particleIndicesSpB[pB];
+ Particle& particleB = particlesSpB[idxB];
+ PxVec3& forceB = forceBufB[idxB];
+
+ PxVec3 distVec = particleA.position - particleB.position;
+ PxReal distSqr = distVec.magnitudeSquared();
+
+ if(distSqr < params.cellSizeSq && distSqr > 0.0f)
+ {
+ if(isDensityMode)
+ {
+ if(!twoWayUpdate)
+ addDensity(particleA, distSqr, params);
+ else
+ addDensity_twoWay(particleA, particleB, distSqr, params);
+ }
+ else
+ {
+ if(!twoWayUpdate)
+ addForce(forceA, particleA, particleB, distSqr, distVec, params);
+ else
+ addForce_twoWay(forceA, forceB, particleA, particleB, distSqr, distVec, params);
+ }
+ }
+ }
+ }
+}
+
+#else // REFERENCE_KERNELS
+
+class DensityPassType
+{
+};
+class ForcePassType
+{
+};
+class TwoWayUpdateType
+{
+};
+class OneWayUpdateType
+{
+};
+
+template <typename PassType, typename UpdateType>
+struct Contribution
+{
+};
+
+template <>
+struct Contribution<DensityPassType, TwoWayUpdateType>
+{
+ static void add(PxVec3&, PxVec3&, PxReal distSqr, const PxVec3&, Particle& particleA, Particle& particleB,
+ const DynamicsParameters& params)
+ {
+ addDensity_twoWay(particleA, particleB, distSqr, params);
+ }
+};
+
+template <>
+struct Contribution<ForcePassType, TwoWayUpdateType>
+{
+ static void add(PxVec3& forceA, PxVec3& forceB, PxReal distSqr, const PxVec3& distVec, Particle& particleA,
+ Particle& particleB, const DynamicsParameters& params)
+ {
+ addForce_twoWay(forceA, forceB, particleA, particleB, distSqr, distVec, params);
+ }
+};
+
+template <>
+struct Contribution<DensityPassType, OneWayUpdateType>
+{
+ static void add(PxVec3&, PxVec3&, PxReal distSqr, const PxVec3&, Particle& particleA, Particle&,
+ const DynamicsParameters& params)
+ {
+ addDensity(particleA, distSqr, params);
+ }
+};
+
+template <>
+struct Contribution<ForcePassType, OneWayUpdateType>
+{
+ static void add(PxVec3& forceA, PxVec3&, PxReal distSqr, const PxVec3& distVec, Particle& particleA,
+ Particle& particleB, const DynamicsParameters& params)
+ {
+ addForce(forceA, particleA, particleB, distSqr, distVec, params);
+ }
+};
+
+// Parameters for simd kernel execution
+struct DynamicsParametersSIMD
+{
+ Ps::aos::Vec4V scaleToStd;
+ Ps::aos::Vec4V scaleSqToStd;
+ Ps::aos::Vec4V radiusStd;
+ Ps::aos::Vec4V radiusSqStd;
+ Ps::aos::Vec4V densityMultiplierStd;
+ Ps::aos::Vec4V stiffMulPressureMultiplierStd;
+ Ps::aos::Vec4V viscosityMultiplierStd;
+ Ps::aos::Vec4V initialDensity;
+ Ps::aos::Vec4V stiffnessStd;
+};
+
+#if COMPILE_IN_SIMD_DENSITY
+
+PX_FORCE_INLINE void calcDensity4_onlyPtrs(Mat44V& posDensDstT, const Particle* __restrict pSrc0,
+ const Particle* __restrict pSrc1, const Particle* __restrict pSrc2,
+ const Particle* __restrict pSrc3, const DynamicsParametersSIMD& params)
+{
+ Ps::aos::Mat44V posDensSrc(V4LoadA(&pSrc0->position.x), V4LoadA(&pSrc1->position.x), V4LoadA(&pSrc2->position.x),
+ V4LoadA(&pSrc3->position.x));
+
+ Mat44V posDensSrcT = M44Trnsps(posDensSrc);
+
+ Vec4V distVec_x = V4Sub(posDensDstT.col0, posDensSrcT.col0);
+ Vec4V distVec_y = V4Sub(posDensDstT.col1, posDensSrcT.col1);
+ Vec4V distVec_z = V4Sub(posDensDstT.col2, posDensSrcT.col2);
+
+ Vec4V distSqr_x = V4Mul(distVec_x, distVec_x);
+ Vec4V distSqr_xy = V4MulAdd(distVec_y, distVec_y, distSqr_x);
+ Vec4V distSqr = V4MulAdd(distVec_z, distVec_z, distSqr_xy);
+
+ Vec4V distSqrStd = V4Mul(distSqr, params.scaleSqToStd);
+
+ Vec4V radius2MinusDist2Std = V4Sub(params.radiusSqStd, distSqrStd);
+ Vec4V densityStd = V4Mul(params.densityMultiplierStd, radius2MinusDist2Std);
+ densityStd = V4Mul(densityStd, radius2MinusDist2Std);
+ densityStd = V4Mul(densityStd, radius2MinusDist2Std);
+
+ posDensDstT.col3 = V4Add(posDensDstT.col3, densityStd);
+}
+
+PX_FORCE_INLINE void calcDensity4_twoWay_onlyPtrs(Mat44V& posDensDstT, Particle* __restrict pSrc0,
+ Particle* __restrict pSrc1, Particle* __restrict pSrc2,
+ Particle* __restrict pSrc3, const DynamicsParametersSIMD& params)
+{
+ Mat44V posDensSrc(V4LoadA(&pSrc0->position.x), V4LoadA(&pSrc1->position.x), V4LoadA(&pSrc2->position.x),
+ V4LoadA(&pSrc3->position.x));
+
+ Mat44V posDensSrcT = M44Trnsps(posDensSrc);
+
+ Vec4V distVec_x = V4Sub(posDensDstT.col0, posDensSrcT.col0);
+ Vec4V distVec_y = V4Sub(posDensDstT.col1, posDensSrcT.col1);
+ Vec4V distVec_z = V4Sub(posDensDstT.col2, posDensSrcT.col2);
+
+ Vec4V distSqr_x = V4Mul(distVec_x, distVec_x);
+ Vec4V distSqr_xy = V4MulAdd(distVec_y, distVec_y, distSqr_x);
+ Vec4V distSqr = V4MulAdd(distVec_z, distVec_z, distSqr_xy);
+
+ Vec4V distSqrStd = V4Mul(distSqr, params.scaleSqToStd);
+
+ Vec4V radius2MinusDist2Std = V4Sub(params.radiusSqStd, distSqrStd);
+ Vec4V densityStd = V4Mul(params.densityMultiplierStd, radius2MinusDist2Std);
+ densityStd = V4Mul(densityStd, radius2MinusDist2Std);
+ densityStd = V4Mul(densityStd, radius2MinusDist2Std);
+
+ // apply to srcParticles (sschirm TOTO rename)
+ PX_ALIGN(16, PxVec4 density);
+ V4StoreA(densityStd, &density[0]);
+ pSrc0->density += density[0];
+ pSrc1->density += density[1];
+ pSrc2->density += density[2];
+ pSrc3->density += density[3];
+
+ // apply to dstParticle (sschirm TOTO rename)
+ posDensDstT.col3 = V4Add(posDensDstT.col3, densityStd);
+}
+
+#endif // COMPILE_IN_SIMD_DENSITY
+
+PX_FORCE_INLINE void calcForce4_onlyPtrs(Mat44V& forceDstT, const Particle* __restrict pSrc0,
+ const Particle* __restrict pSrc1, const Particle* __restrict pSrc2,
+ const Particle* __restrict pSrc3, const Mat44V& posDensDstT,
+ const Mat44V& velPressDstT, const DynamicsParametersSIMD& params)
+{
+ Mat44V posDensSrc(V4LoadA(&pSrc0->position.x), V4LoadA(&pSrc1->position.x), V4LoadA(&pSrc2->position.x),
+ V4LoadA(&pSrc3->position.x));
+
+ Mat44V posDensSrcT = M44Trnsps(posDensSrc);
+
+ Vec4V distVec_x = V4Sub(posDensDstT.col0, posDensSrcT.col0);
+ Vec4V distVec_y = V4Sub(posDensDstT.col1, posDensSrcT.col1);
+ Vec4V distVec_z = V4Sub(posDensDstT.col2, posDensSrcT.col2);
+
+ Vec4V distSqr_x = V4Mul(distVec_x, distVec_x);
+ Vec4V distSqr_xy = V4MulAdd(distVec_y, distVec_y, distSqr_x);
+ Vec4V distSqr = V4MulAdd(distVec_z, distVec_z, distSqr_xy);
+
+ Vec4V distSqrStd = V4Mul(distSqr, params.scaleSqToStd);
+
+ Vec4V recipDistStd = V4RsqrtFast(distSqrStd);
+ Vec4V distStd = V4Mul(distSqrStd, recipDistStd);
+ Vec4V radiusMinusDistStd = V4Sub(params.radiusStd, distStd);
+
+ // pressure force
+ Mat44V velPressSrc(V4LoadA(&pSrc0->velocity.x), V4LoadA(&pSrc1->velocity.x), V4LoadA(&pSrc2->velocity.x),
+ V4LoadA(&pSrc3->velocity.x));
+
+ Mat44V velPressSrcT = M44Trnsps(velPressSrc);
+
+ Vec4V pressureDst = V4Sub(posDensDstT.col3, params.initialDensity);
+ Vec4V pressureSrc = V4Sub(posDensSrcT.col3, params.initialDensity);
+#if PRESSURE_ORIGIN
+ pressureDst = V4Max(pressureDst, V4Zero());
+ pressureSrc = V4Max(pressureSrc, V4Zero());
+ Vec4V pressureSum = V4Add(pressureDst, pressureSrc);
+#else
+ Vec4V pressureSum = V4Add(pressureDst, pressureSrc);
+ pressureSum = V4Max(pressureSum, V4Zero());
+#endif
+
+ Vec4V radiusStd_x2 = V4Add(params.radiusStd, params.radiusStd);
+ Vec4V multiplierPressStd = V4MulAdd(params.radiusSqStd, recipDistStd, distStd);
+ multiplierPressStd = V4Sub(multiplierPressStd, radiusStd_x2);
+ multiplierPressStd = V4Mul(multiplierPressStd, params.stiffMulPressureMultiplierStd);
+ multiplierPressStd = V4Mul(multiplierPressStd, pressureSum);
+
+ Vec4V pressureForceMult = V4Mul(multiplierPressStd, params.scaleToStd);
+ Vec4V force_x = V4Mul(distVec_x, pressureForceMult);
+ Vec4V force_y = V4Mul(distVec_y, pressureForceMult);
+ Vec4V force_z = V4Mul(distVec_z, pressureForceMult);
+
+ // viscosity force
+ Vec4V multiplierViscStd = V4Mul(radiusMinusDistStd, params.viscosityMultiplierStd);
+
+ Vec4V viscossityForceMult = V4Mul(params.scaleToStd, multiplierViscStd);
+
+ Vec4V vDiff_x = V4Sub(velPressSrcT.col0, velPressDstT.col0);
+ Vec4V vDiff_y = V4Sub(velPressSrcT.col1, velPressDstT.col1);
+ Vec4V vDiff_z = V4Sub(velPressSrcT.col2, velPressDstT.col2);
+
+ force_x = V4MulAdd(vDiff_x, viscossityForceMult, force_x);
+ force_y = V4MulAdd(vDiff_y, viscossityForceMult, force_y);
+ force_z = V4MulAdd(vDiff_z, viscossityForceMult, force_z);
+
+ // application of force
+ Vec4V invDensities = V4RecipFast(posDensSrcT.col3);
+ force_x = V4Mul(force_x, invDensities);
+ force_y = V4Mul(force_y, invDensities);
+ force_z = V4Mul(force_z, invDensities);
+
+ forceDstT.col0 = V4Add(forceDstT.col0, force_x);
+ forceDstT.col1 = V4Add(forceDstT.col1, force_y);
+ forceDstT.col2 = V4Add(forceDstT.col2, force_z);
+}
+
+PX_FORCE_INLINE void calcForce4_twoWay_onlyPtrs(Mat44V& forceDstT, Mat44V& forceSrcT, Particle* __restrict pSrc0,
+ Particle* __restrict pSrc1, Particle* __restrict pSrc2,
+ Particle* __restrict pSrc3, const Mat44V& posDensDstT,
+ const Mat44V& velPressDstT, const Vec4V& invDensityDst,
+ const DynamicsParametersSIMD& params)
+{
+ Mat44V posDensSrc(V4LoadA(&pSrc0->position.x), V4LoadA(&pSrc1->position.x), V4LoadA(&pSrc2->position.x),
+ V4LoadA(&pSrc3->position.x));
+
+ Mat44V posDensSrcT = M44Trnsps(posDensSrc);
+
+ Vec4V distVec_x = V4Sub(posDensDstT.col0, posDensSrcT.col0);
+ Vec4V distVec_y = V4Sub(posDensDstT.col1, posDensSrcT.col1);
+ Vec4V distVec_z = V4Sub(posDensDstT.col2, posDensSrcT.col2);
+
+ Vec4V distSqr_x = V4Mul(distVec_x, distVec_x);
+ Vec4V distSqr_xy = V4MulAdd(distVec_y, distVec_y, distSqr_x);
+ Vec4V distSqr = V4MulAdd(distVec_z, distVec_z, distSqr_xy);
+
+ Vec4V distSqrStd = V4Mul(distSqr, params.scaleSqToStd);
+
+ Vec4V recipDistStd = V4RsqrtFast(distSqrStd);
+ Vec4V distStd = V4Mul(distSqrStd, recipDistStd);
+ Vec4V radiusMinusDistStd = V4Sub(params.radiusStd, distStd);
+
+ // pressure force
+ Mat44V velPressSrc(V4LoadA(&pSrc0->velocity.x), V4LoadA(&pSrc1->velocity.x), V4LoadA(&pSrc2->velocity.x),
+ V4LoadA(&pSrc3->velocity.x));
+
+ Mat44V velPressSrcT = M44Trnsps(velPressSrc);
+
+ Vec4V pressureDst = V4Sub(posDensDstT.col3, params.initialDensity);
+ Vec4V pressureSrc = V4Sub(posDensSrcT.col3, params.initialDensity);
+#if PRESSURE_ORIGIN
+ pressureDst = V4Max(pressureDst, V4Zero());
+ pressureSrc = V4Max(pressureSrc, V4Zero());
+ Vec4V pressureSum = V4Add(pressureDst, pressureSrc);
+#else
+ Vec4V pressureSum = V4Add(pressureDst, pressureSrc);
+ pressureSum = V4Max(pressureSum, V4Zero());
+#endif
+
+ Vec4V radiusStd_x2 = V4Add(params.radiusStd, params.radiusStd);
+ Vec4V multiplierPressStd = V4MulAdd(params.radiusSqStd, recipDistStd, distStd);
+ multiplierPressStd = V4Sub(multiplierPressStd, radiusStd_x2);
+ multiplierPressStd = V4Mul(multiplierPressStd, params.stiffMulPressureMultiplierStd);
+ multiplierPressStd = V4Mul(multiplierPressStd, pressureSum);
+
+ Vec4V pressureForceMult = V4Mul(multiplierPressStd, params.scaleToStd);
+ Vec4V force_x = V4Mul(distVec_x, pressureForceMult);
+ Vec4V force_y = V4Mul(distVec_y, pressureForceMult);
+ Vec4V force_z = V4Mul(distVec_z, pressureForceMult);
+
+ // viscosity force
+ Vec4V multiplierViscStd = V4Mul(radiusMinusDistStd, params.viscosityMultiplierStd);
+
+ Vec4V viscossityForceMult = V4Mul(params.scaleToStd, multiplierViscStd);
+
+ Vec4V vDiff_x = V4Sub(velPressSrcT.col0, velPressDstT.col0);
+ Vec4V vDiff_y = V4Sub(velPressSrcT.col1, velPressDstT.col1);
+ Vec4V vDiff_z = V4Sub(velPressSrcT.col2, velPressDstT.col2);
+
+ force_x = V4MulAdd(vDiff_x, viscossityForceMult, force_x);
+ force_y = V4MulAdd(vDiff_y, viscossityForceMult, force_y);
+ force_z = V4MulAdd(vDiff_z, viscossityForceMult, force_z);
+
+ // apply to src particles (sschirm TODO:rename)
+ forceSrcT.col0 = V4NegMulSub(force_x, invDensityDst, forceSrcT.col0);
+ forceSrcT.col1 = V4NegMulSub(force_y, invDensityDst, forceSrcT.col1);
+ forceSrcT.col2 = V4NegMulSub(force_z, invDensityDst, forceSrcT.col2);
+
+ // apply to dst particle (sschirm TODO:rename)
+ Vec4V invDensities = V4RecipFast(posDensSrcT.col3);
+ forceDstT.col0 = V4MulAdd(force_x, invDensities, forceDstT.col0);
+ forceDstT.col1 = V4MulAdd(force_y, invDensities, forceDstT.col1);
+ forceDstT.col2 = V4MulAdd(force_z, invDensities, forceDstT.col2);
+}
+
+#if !PX_IOS
+
+static void updateStreamDensity(Particle* __restrict particlesA, const Particle* __restrict particlesB,
+ const PxU32* indexStream, const PxU32 indexStreamSize, const DynamicsParameters& params,
+ const DynamicsParametersSIMD& simdParams)
+{
+ PX_UNUSED(simdParams);
+ PxU32 s = 0;
+ while(s < indexStreamSize)
+ {
+ PxU32 dstIdx = indexStream[s++];
+ PxU32 numInteractions = indexStream[s++];
+
+ // the simd density code is currently disabled, since it's not a real win
+ if(1)
+ {
+ for(PxU32 i = 0; i < numInteractions; ++i)
+ {
+ PxU32 srcIdx = indexStream[s++];
+ PX_ALIGN(16, PxVec3 distVec) = particlesA[dstIdx].position - particlesB[srcIdx].position;
+ PxF32 distSqr = distVec.magnitudeSquared();
+ addDensity(particlesA[dstIdx], distSqr, params);
+ }
+ }
+#if COMPILE_IN_SIMD_DENSITY
+ else
+ {
+ Particle* __restrict dstParticle = particlesA + dstIdx;
+ PxU32 blockCount = numInteractions / 4;
+
+ if(blockCount > 0)
+ {
+ Vec4V tmp = V4LoadA(&dstParticle->position.x);
+ Mat44V posDensDst(tmp, tmp, tmp, tmp);
+ Mat44V posDensDstT = M44Trnsps(posDensDst);
+
+ // set density to zero
+ posDensDstT.col3 = V4Zero();
+
+ for(PxU32 i = 0; i < blockCount; ++i)
+ {
+ PxU32 srcIdx0 = indexStream[s++];
+ PxU32 srcIdx1 = indexStream[s++];
+ PxU32 srcIdx2 = indexStream[s++];
+ PxU32 srcIdx3 = indexStream[s++];
+
+ calcDensity4_onlyPtrs(posDensDstT, particlesB + srcIdx0, particlesB + srcIdx1, particlesB + srcIdx2,
+ particlesB + srcIdx3, simdParams);
+ }
+
+ // simd to scalar
+ PX_ALIGN(16, PxVec4 density);
+ V4StoreA(posDensDstT.col3, &density[0]);
+ dstParticle->density += density[0] + density[1] + density[2] + density[3];
+ }
+
+ PxU32 numLeft = numInteractions - blockCount * 4;
+ for(PxU32 i = 0; i < numLeft; ++i)
+ {
+ PxU32 srcIdx = indexStream[s++];
+
+ PX_ALIGN(16, PxVec3) distVec = particlesA[dstIdx].position - particlesB[srcIdx].position;
+ PxF32 distSqr = distVec.magnitudeSquared();
+ addDensity(particlesA[dstIdx], distSqr, params);
+ }
+ }
+#endif // COMPILE_IN_SIMD_DENSITY
+ }
+}
+
+static void updateStreamDensityTwoWay(Particle* __restrict particlesA, Particle* __restrict particlesB,
+ const PxU32* indexStream, const PxU32 indexStreamSize,
+ const DynamicsParameters& params, const DynamicsParametersSIMD& simdParams)
+{
+ PX_UNUSED(simdParams);
+ PxU32 s = 0;
+ while(s < indexStreamSize)
+ {
+ PxU32 dstIdx = indexStream[s++];
+ PxU32 numInteractions = indexStream[s++];
+
+ // the simd density code is currently disabled, since it's not a real win
+ if(1)
+ {
+ for(PxU32 i = 0; i < numInteractions; ++i)
+ {
+ PxU32 srcIdx = indexStream[s++];
+ PX_ALIGN(16, PxVec3) distVec = particlesA[dstIdx].position - particlesB[srcIdx].position;
+ PxF32 distSqr = distVec.magnitudeSquared();
+ addDensity_twoWay(particlesA[dstIdx], particlesB[srcIdx], distSqr, params);
+ }
+ }
+#if COMPILE_IN_SIMD_DENSITY
+ else
+ {
+ Particle* __restrict dstParticle = particlesA + dstIdx;
+ PxU32 blockCount = numInteractions / 4;
+
+ if(blockCount > 0)
+ {
+ Vec4V tmp = V4LoadA(&dstParticle->position.x);
+ Mat44V posDensDst(tmp, tmp, tmp, tmp);
+ Mat44V posDensDstT = M44Trnsps(posDensDst);
+
+ // set density to zero
+ posDensDstT.col3 = V4Zero();
+
+ for(PxU32 i = 0; i < blockCount; ++i)
+ {
+ PxU32 srcIdx0 = indexStream[s++];
+ PxU32 srcIdx1 = indexStream[s++];
+ PxU32 srcIdx2 = indexStream[s++];
+ PxU32 srcIdx3 = indexStream[s++];
+
+ calcDensity4_twoWay_onlyPtrs(posDensDstT, particlesB + srcIdx0, particlesB + srcIdx1,
+ particlesB + srcIdx2, particlesB + srcIdx3, simdParams);
+ }
+
+ // simd to scalar
+ PX_ALIGN(16, PxVec4 density);
+ V4StoreA(posDensDstT.col3, &density[0]);
+ dstParticle->density += density[0] + density[1] + density[2] + density[3];
+ }
+
+ PxU32 numLeft = numInteractions - blockCount * 4;
+ for(PxU32 i = 0; i < numLeft; ++i)
+ {
+ PxU32 srcIdx = indexStream[s++];
+
+ PX_ALIGN(16, PxVec3 distVec) = particlesA[dstIdx].position - particlesB[srcIdx].position;
+ PxF32 distSqr = distVec.magnitudeSquared();
+ addDensity_twoWay(particlesA[dstIdx], particlesB[srcIdx], distSqr, params);
+ }
+ }
+#endif // COMPILE_IN_SIMD_DENSITY
+ }
+}
+
+static void updateStreamForce(PxVec3* __restrict forceBufA, Particle* __restrict particlesA,
+ const Particle* __restrict particlesB, const PxU32* indexStream,
+ const PxU32 indexStreamSize, const DynamicsParameters& params,
+ const DynamicsParametersSIMD& simdParams)
+{
+ PxU32 s = 0;
+ while(s < indexStreamSize)
+ {
+ PxU32 dstIdx = indexStream[s++];
+ Particle* __restrict dstParticle = particlesA + dstIdx;
+
+ PxU32 numInteractions = indexStream[s++];
+ PxU32 blockCount = numInteractions / 4;
+
+ if(blockCount > 0)
+ {
+ Vec4V tmp = V4LoadA(&dstParticle->position.x);
+ Mat44V posDensDst(tmp, tmp, tmp, tmp);
+ Mat44V posDensDstT = M44Trnsps(posDensDst);
+
+ Mat44V forceDstT(V4Zero(), V4Zero(), V4Zero(), V4Zero());
+
+ tmp = V4LoadA(&dstParticle->velocity.x);
+ Mat44V velPressDst(tmp, tmp, tmp, tmp);
+ Mat44V velPressDstT = M44Trnsps(velPressDst);
+
+ for(PxU32 i = 0; i < blockCount; ++i)
+ {
+ PxU32 srcIdx0 = indexStream[s++];
+ PxU32 srcIdx1 = indexStream[s++];
+ PxU32 srcIdx2 = indexStream[s++];
+ PxU32 srcIdx3 = indexStream[s++];
+
+ calcForce4_onlyPtrs(forceDstT, particlesB + srcIdx0, particlesB + srcIdx1, particlesB + srcIdx2,
+ particlesB + srcIdx3, posDensDstT, velPressDstT, simdParams);
+ }
+
+ // simd to scalar
+ Mat44V forceDst = M44Trnsps(forceDstT);
+ Vec4V forceTmp1 = V4Add(forceDst.col0, forceDst.col1);
+ Vec4V forceTmp2 = V4Add(forceDst.col2, forceDst.col3);
+ forceTmp1 = V4Add(forceTmp1, forceTmp2);
+ forceBufA[dstIdx] += V4ReadXYZ(forceTmp1);
+ }
+
+ PxU32 numLeft = numInteractions - blockCount * 4;
+ for(PxU32 i = 0; i < numLeft; ++i)
+ {
+ PxU32 srcIdx = indexStream[s++];
+
+ PX_ALIGN(16, PxVec3 distVec) = particlesA[dstIdx].position - particlesB[srcIdx].position;
+ PxF32 distSqr = distVec.magnitudeSquared();
+ addForce(forceBufA[dstIdx], particlesA[dstIdx], particlesB[srcIdx], distSqr, distVec, params);
+ }
+ }
+}
+
+static void updateStreamForceTwoWay(PxVec3* __restrict forceBufA, PxVec3* __restrict forceBufB,
+ Particle* __restrict particlesA, Particle* __restrict particlesB,
+ const PxU32* indexStream, const PxU32 indexStreamSize,
+ const DynamicsParameters& params, const DynamicsParametersSIMD& simdParams)
+{
+ PX_ASSERT(forceBufB);
+ PxU32 s = 0;
+ while(s < indexStreamSize)
+ {
+ PxU32 dstIdx = indexStream[s++];
+ Particle* __restrict dstParticle = particlesA + dstIdx;
+
+ PxU32 numInteractions = indexStream[s++];
+ PxU32 blockCount = numInteractions / 4;
+
+ if(blockCount > 0)
+ {
+ Vec4V tmp = V4LoadA(&dstParticle->position.x);
+ Mat44V posDensDst(tmp, tmp, tmp, tmp);
+ Mat44V posDensDstT = M44Trnsps(posDensDst);
+
+ Mat44V forceDstT(V4Zero(), V4Zero(), V4Zero(), V4Zero());
+
+ tmp = V4LoadA(&dstParticle->velocity.x);
+ Mat44V velPressDst(tmp, tmp, tmp, tmp);
+ Mat44V velPressDstT = M44Trnsps(velPressDst);
+
+ tmp = V4Load(dstParticle->density);
+ Vec4V invDensityA = V4RecipFast(tmp);
+
+ for(PxU32 i = 0; i < blockCount; ++i)
+ {
+ PxU32 srcIdx0 = indexStream[s++];
+ PxU32 srcIdx1 = indexStream[s++];
+ PxU32 srcIdx2 = indexStream[s++];
+ PxU32 srcIdx3 = indexStream[s++];
+
+ Vec4V tmp0 = Vec4V_From_Vec3V(V3LoadU(&forceBufB[srcIdx0].x));
+ Vec4V tmp1 = Vec4V_From_Vec3V(V3LoadU(&forceBufB[srcIdx1].x));
+ Vec4V tmp2 = Vec4V_From_Vec3V(V3LoadU(&forceBufB[srcIdx2].x));
+ Vec4V tmp3 = Vec4V_From_Vec3V(V3LoadU(&forceBufB[srcIdx3].x));
+ Mat44V forceSrc(tmp0, tmp1, tmp2, tmp3);
+ Mat44V forceSrcT = M44Trnsps(forceSrc);
+
+ calcForce4_twoWay_onlyPtrs(forceDstT, forceSrcT, particlesB + srcIdx0, particlesB + srcIdx1,
+ particlesB + srcIdx2, particlesB + srcIdx3, posDensDstT, velPressDstT,
+ invDensityA, simdParams);
+
+ forceSrc = M44Trnsps(forceSrcT);
+ forceBufB[srcIdx0] = V4ReadXYZ(forceSrc.col0);
+ forceBufB[srcIdx1] = V4ReadXYZ(forceSrc.col1);
+ forceBufB[srcIdx2] = V4ReadXYZ(forceSrc.col2);
+ forceBufB[srcIdx3] = V4ReadXYZ(forceSrc.col3);
+ }
+
+ // simd to scalar
+ Mat44V forceDst = M44Trnsps(forceDstT);
+ Vec4V forceTmp1 = V4Add(forceDst.col0, forceDst.col1);
+ Vec4V forceTmp2 = V4Add(forceDst.col2, forceDst.col3);
+ forceTmp1 = V4Add(forceTmp1, forceTmp2);
+ forceBufA[dstIdx] += V4ReadXYZ(forceTmp1);
+ }
+
+ PxU32 numLeft = numInteractions - blockCount * 4;
+ for(PxU32 i = 0; i < numLeft; ++i)
+ {
+ PxU32 srcIdx = indexStream[s++];
+
+ PX_ALIGN(16, PxVec3 distVec) = particlesA[dstIdx].position - particlesB[srcIdx].position;
+ PxF32 distSqr = distVec.magnitudeSquared();
+ addForce_twoWay(forceBufA[dstIdx], forceBufB[srcIdx], particlesA[dstIdx], particlesB[srcIdx], distSqr,
+ distVec, params);
+ }
+ }
+}
+
+#endif // !PX_IOS
+
+template <typename PassType, typename UpdateType>
+PX_FORCE_INLINE_KERNELS static void updateParticleGroupPair_small_template(
+ PxVec3* __restrict forceBufA, PxVec3* __restrict forceBufB, Particle* __restrict particlesA,
+ Particle* __restrict particlesB, const PxU32* __restrict particleIndicesA, const PxU32 numParticlesA,
+ const PxU32* __restrict particleIndicesB, const PxU32 numParticlesB, const DynamicsParameters& params)
+{
+ PxU32 num_loopB = 4 * (numParticlesB / 4);
+ PxU32 u_cellSizeSq = PxUnionCast<PxU32, PxF32>(params.cellSizeSq);
+
+ for(PxU32 pA = 0; pA < numParticlesA; pA++)
+ {
+ PxU32 idxA = particleIndicesA[pA];
+ Particle& particleA = particlesA[idxA];
+ PxVec3& forceA = forceBufA[idxA];
+
+ for(PxU32 pB = 0; pB < num_loopB; pB += 4)
+ {
+ PxU32 idxB0 = particleIndicesB[pB];
+ PxU32 idxB1 = particleIndicesB[pB + 1];
+ PxU32 idxB2 = particleIndicesB[pB + 2];
+ PxU32 idxB3 = particleIndicesB[pB + 3];
+
+ Particle& particleB0 = particlesB[idxB0];
+ Particle& particleB1 = particlesB[idxB1];
+ Particle& particleB2 = particlesB[idxB2];
+ Particle& particleB3 = particlesB[idxB3];
+
+ PxVec3& forceB0 = forceBufB[idxB0];
+ PxVec3& forceB1 = forceBufB[idxB1];
+ PxVec3& forceB2 = forceBufB[idxB2];
+ PxVec3& forceB3 = forceBufB[idxB3];
+
+ PX_ALIGN(16, PxVec3 distVec0) = particleA.position - particleB0.position;
+ PX_ALIGN(16, PxVec3 distVec1) = particleA.position - particleB1.position;
+ PX_ALIGN(16, PxVec3 distVec2) = particleA.position - particleB2.position;
+ PX_ALIGN(16, PxVec3 distVec3) = particleA.position - particleB3.position;
+
+ PxReal distSqr0 = distVec0.magnitudeSquared();
+ PxReal distSqr1 = distVec1.magnitudeSquared();
+ PxReal distSqr2 = distVec2.magnitudeSquared();
+ PxReal distSqr3 = distVec3.magnitudeSquared();
+
+ // marginally faster to do that test (not as good as in brute force)
+ PxF32 isec = physx::intrinsics::fsel(params.cellSizeSq - distSqr0, 1.0f, 0.0f);
+ isec = physx::intrinsics::fsel(params.cellSizeSq - distSqr1, 1.0f, isec);
+ isec = physx::intrinsics::fsel(params.cellSizeSq - distSqr2, 1.0f, isec);
+ isec = physx::intrinsics::fsel(params.cellSizeSq - distSqr3, 1.0f, isec);
+
+ if(isec == 0.0f)
+ continue;
+
+ PxU32 u_distSqr0 = PxUnionCast<PxU32, PxReal>(distSqr0);
+ PxU32 u_distSqr1 = PxUnionCast<PxU32, PxReal>(distSqr1);
+ PxU32 u_distSqr2 = PxUnionCast<PxU32, PxReal>(distSqr2);
+ PxU32 u_distSqr3 = PxUnionCast<PxU32, PxReal>(distSqr3);
+
+ if(u_distSqr0 < u_cellSizeSq && u_distSqr0 > 0)
+ {
+ Contribution<PassType, UpdateType>::add(forceA, forceB0, distSqr0, distVec0, particleA, particleB0,
+ params);
+ }
+ if(u_distSqr1 < u_cellSizeSq && u_distSqr1 > 0)
+ {
+ Contribution<PassType, UpdateType>::add(forceA, forceB1, distSqr1, distVec1, particleA, particleB1,
+ params);
+ }
+ if(u_distSqr2 < u_cellSizeSq && u_distSqr2 > 0)
+ {
+ Contribution<PassType, UpdateType>::add(forceA, forceB2, distSqr2, distVec2, particleA, particleB2,
+ params);
+ }
+ if(u_distSqr3 < u_cellSizeSq && u_distSqr3 > 0)
+ {
+ Contribution<PassType, UpdateType>::add(forceA, forceB3, distSqr3, distVec3, particleA, particleB3,
+ params);
+ }
+ }
+
+ for(PxU32 pB = num_loopB; pB < numParticlesB; pB++)
+ {
+ PxU32 idxB = particleIndicesB[pB];
+ Particle& particleB = particlesB[idxB];
+ PxVec3& forceB = forceBufB[idxB];
+
+ PX_ALIGN(16, PxVec3 distVec) = particleA.position - particleB.position;
+
+ PxReal distSqr = distVec.magnitudeSquared();
+ PxU32 u_distSqr = PxUnionCast<PxU32, PxReal>(distSqr);
+
+ if(u_distSqr < u_cellSizeSq && u_distSqr > 0)
+ {
+ Contribution<PassType, UpdateType>::add(forceA, forceB, distSqr, distVec, particleA, particleB, params);
+ }
+ }
+ }
+}
+
+#if !PX_IOS
+/**
+particlesA, particlesB, particleIndicesA, particleIndicesB are guaranteed to be non-overlapping
+*/
+static void updateParticleGroupPair_simd_template(PxVec3* forceBufA, PxVec3* forceBufB, Particle* particlesA,
+ Particle* particlesB, const PxU32* particleIndicesA,
+ const PxU32 numParticlesA, const PxU32* particleIndicesB,
+ const PxU32 numParticlesB, const DynamicsParameters& params,
+ const bool isDensityMode, const bool twoWayUpdate,
+ Vec4V* tempSimdPositionBuffer, PxU32* tempIndexStream)
+{
+ PxU32 numParticles4B = ((numParticlesB + 3) & ~0x3) + 4; // ceil up to multiple of four + 4 for save unrolling
+
+ PX_ALIGN(16, Particle fakeParticle);
+ fakeParticle.position = PxVec3(FLT_MAX, FLT_MAX, FLT_MAX);
+ fakeParticle.density = FLT_MAX; // avoid uninitialized access by V4LoadA
+
+ const PxU32* __restrict idxB = particleIndicesB;
+ const PxU32* __restrict idxBEnd = particleIndicesB + numParticlesB;
+ for(PxU32 q = 0, v = 0; q < numParticles4B; q += 4, idxB += 4, v += 3)
+ {
+ const Particle* prtB0 = (q < numParticlesB) ? particlesB + *(idxB) : &fakeParticle;
+ const Particle* prtB1 = (q + 1 < numParticlesB) ? particlesB + *(idxB + 1) : &fakeParticle;
+ const Particle* prtB2 = (q + 2 < numParticlesB) ? particlesB + *(idxB + 2) : &fakeParticle;
+ const Particle* prtB3 = (q + 3 < numParticlesB) ? particlesB + *(idxB + 3) : &fakeParticle;
+
+ Mat44V posDensB_N(V4LoadA(&prtB0->position.x), V4LoadA(&prtB1->position.x), V4LoadA(&prtB2->position.x),
+ V4LoadA(&prtB3->position.x));
+ Mat44V posDensTB_N = M44Trnsps(posDensB_N);
+
+ tempSimdPositionBuffer[v] = posDensTB_N.col0;
+ tempSimdPositionBuffer[v + 1] = posDensTB_N.col1;
+ tempSimdPositionBuffer[v + 2] = posDensTB_N.col2;
+ }
+
+ DynamicsParametersSIMD simdParams;
+ simdParams.scaleToStd = V4Load(params.scaleToStd);
+ simdParams.scaleSqToStd = V4Load(params.scaleSqToStd);
+ simdParams.radiusStd = V4Load(params.radiusStd);
+ simdParams.radiusSqStd = V4Load(params.radiusSqStd);
+ simdParams.densityMultiplierStd = V4Load(params.densityMultiplierStd);
+ simdParams.stiffMulPressureMultiplierStd = V4Load(params.stiffMulPressureMultiplierStd);
+ simdParams.viscosityMultiplierStd = V4Load(params.viscosityMultiplierStd);
+ simdParams.initialDensity = V4Load(params.initialDensity);
+ Vec4V simdCellSizeSq = V4Load(params.cellSizeSq);
+ VecU32V simdIntOne = U4LoadXYZW(1, 1, 1, 1);
+ VecU32V simdIntZero = U4LoadXYZW(0, 0, 0, 0);
+
+ PxU32 indexStreamSize = 0;
+ const PxU32* __restrict idxA = particleIndicesA;
+ for(PxU32 p = 0; p < numParticlesA; p++, idxA++)
+ {
+ Particle* __restrict prtA = particlesA + *idxA;
+
+ PX_ASSERT(MAX_INDEX_STREAM_SIZE - indexStreamSize >= 2);
+ tempIndexStream[indexStreamSize++] = *idxA;
+
+ PxU32* interactionCountPtr = tempIndexStream + indexStreamSize++;
+ PxU32 indexStreamSizeOld = indexStreamSize;
+
+ PX_ALIGN(16, PxU32 isecs[8]);
+ idxB = particleIndicesB;
+
+ Vec4V tmp = V4LoadA(&prtA->position.x);
+ Mat44V posDensA(tmp, tmp, tmp, tmp);
+ Mat44V posDensTA = M44Trnsps(posDensA);
+
+ const Vec4V* prtB = tempSimdPositionBuffer;
+ Vec4V posT0B = *prtB++;
+ Vec4V posT1B = *prtB++;
+ Vec4V posT2B = *prtB++;
+ Vec4V distVec_x = V4Sub(posDensTA.col0, posT0B);
+ Vec4V distVec_y = V4Sub(posDensTA.col1, posT1B);
+ Vec4V distVec_z = V4Sub(posDensTA.col2, posT2B);
+ Vec4V distSqr_x = V4Mul(distVec_x, distVec_x);
+ Vec4V distSqr_xy = V4MulAdd(distVec_y, distVec_y, distSqr_x);
+ Vec4V distSqr = V4MulAdd(distVec_z, distVec_z, distSqr_xy);
+ BoolV isec_b = V4IsGrtr(simdCellSizeSq, distSqr);
+ isec_b = BAnd(isec_b, V4IsGrtr(distSqr, V4Zero()));
+ VecU32V isec = V4U32Sel(isec_b, simdIntOne, simdIntZero);
+
+ U4StoreA(isec, isecs);
+
+ for(PxU32 q = 0; q < numParticlesB; q += 4, idxB += 4)
+ {
+ Vec4V posT0B_N = *prtB++;
+ Vec4V posT1B_N = *prtB++;
+ Vec4V posT2B_N = *prtB++;
+ Vec4V distVec_x_N = V4Sub(posDensTA.col0, posT0B_N);
+ Vec4V distVec_y_N = V4Sub(posDensTA.col1, posT1B_N);
+ Vec4V distVec_z_N = V4Sub(posDensTA.col2, posT2B_N);
+ Vec4V distSqr_x_N = V4Mul(distVec_x_N, distVec_x_N);
+ Vec4V distSqr_xy_N = V4MulAdd(distVec_y_N, distVec_y_N, distSqr_x_N);
+ Vec4V distSqr_N = V4MulAdd(distVec_z_N, distVec_z_N, distSqr_xy_N);
+ BoolV isec_b_N = V4IsGrtr(simdCellSizeSq, distSqr_N);
+ isec_b_N = BAnd(isec_b_N, V4IsGrtr(distSqr_N, V4Zero()));
+ VecU32V isec_N = V4U32Sel(isec_b_N, simdIntOne, simdIntZero);
+
+ PxU32 base_write_index = (q + 4) & 7;
+ U4StoreA(isec_N, isecs + base_write_index);
+
+ PxU32 base_read_index = q & 7;
+ PxU32 u_isec0 = isecs[base_read_index];
+ PxU32 u_isec1 = isecs[base_read_index + 1];
+ PxU32 u_isec2 = isecs[base_read_index + 2];
+ PxU32 u_isec3 = isecs[base_read_index + 3];
+
+ PX_ASSERT(MAX_INDEX_STREAM_SIZE - indexStreamSize >= 4);
+
+ PX_ASSERT(indexStreamSize < MAX_INDEX_STREAM_SIZE);
+ PX_ASSERT(idxB < idxBEnd);
+ tempIndexStream[indexStreamSize] = *(idxB);
+ indexStreamSize += u_isec0;
+
+ PX_ASSERT(indexStreamSize < MAX_INDEX_STREAM_SIZE);
+ tempIndexStream[indexStreamSize] = ((idxB + 1) < idxBEnd) ? *(idxB + 1) : 0;
+ indexStreamSize += u_isec1;
+
+ PX_ASSERT(indexStreamSize < MAX_INDEX_STREAM_SIZE);
+ tempIndexStream[indexStreamSize] = ((idxB + 2) < idxBEnd) ? *(idxB + 2) : 0;
+ indexStreamSize += u_isec2;
+
+ PX_ASSERT(indexStreamSize < MAX_INDEX_STREAM_SIZE);
+ tempIndexStream[indexStreamSize] = ((idxB + 3) < idxBEnd) ? *(idxB + 3) : 0;
+ indexStreamSize += u_isec3;
+
+ // flush interactions
+ if(MAX_INDEX_STREAM_SIZE - indexStreamSize >= (4 + 2))
+ ;
+ else // 4+2, since we potentially need to add the dst index + the src count as well.
+ {
+ *interactionCountPtr = indexStreamSize - indexStreamSizeOld;
+ if(isDensityMode)
+ {
+ if(twoWayUpdate)
+ updateStreamDensityTwoWay(particlesA, particlesB, tempIndexStream, indexStreamSize, params,
+ simdParams);
+ else
+ updateStreamDensity(particlesA, particlesB, tempIndexStream, indexStreamSize, params, simdParams);
+ }
+ else
+ {
+ if(twoWayUpdate)
+ updateStreamForceTwoWay(forceBufA, forceBufB, particlesA, particlesB, tempIndexStream,
+ indexStreamSize, params, simdParams);
+ else
+ updateStreamForce(forceBufA, particlesA, particlesB, tempIndexStream, indexStreamSize, params,
+ simdParams);
+ }
+
+ indexStreamSize = 0;
+ tempIndexStream[indexStreamSize++] = *idxA;
+ interactionCountPtr = tempIndexStream + indexStreamSize++;
+ indexStreamSizeOld = indexStreamSize;
+ }
+ }
+
+ *interactionCountPtr = indexStreamSize - indexStreamSizeOld;
+ }
+
+ if(indexStreamSize > 0)
+ {
+ if(isDensityMode)
+ {
+ if(twoWayUpdate)
+ updateStreamDensityTwoWay(particlesA, particlesB, tempIndexStream, indexStreamSize, params, simdParams);
+ else
+ updateStreamDensity(particlesA, particlesB, tempIndexStream, indexStreamSize, params, simdParams);
+ }
+ else
+ {
+ if(twoWayUpdate)
+ updateStreamForceTwoWay(forceBufA, forceBufB, particlesA, particlesB, tempIndexStream, indexStreamSize,
+ params, simdParams);
+ else
+ updateStreamForce(forceBufA, particlesA, particlesB, tempIndexStream, indexStreamSize, params,
+ simdParams);
+ }
+ }
+}
+
+#endif // !PX_IOS
+
+#define SIMD_THRESH_SRC 8
+
+/**
+Computes and adds contributions of particle group B to particle group A. If twoWayUpdate is true,
+group B is updated with contributions from group A as well.
+*/
+PX_FORCE_INLINE_KERNELS static void
+updateParticleGroupPair(PxVec3* __restrict forceBufA, PxVec3* __restrict forceBufB, Particle* __restrict particlesA,
+ Particle* __restrict particlesB, const PxU32* __restrict particleIndicesA,
+ const PxU32 numParticlesA, const PxU32* __restrict particleIndicesB, const PxU32 numParticlesB,
+ const bool twoWayUpdate, const bool isDensityMode, const DynamicsParameters& params,
+ PxU8* tempSimdPositionBuffer, PxU32* tempIndexStream)
+{
+ PX_ASSERT(numParticlesA > 0);
+ PX_ASSERT(numParticlesB > 0);
+
+#if !PX_IOS
+ if(numParticlesB < SIMD_THRESH_SRC)
+#endif
+ {
+ if(isDensityMode)
+ {
+ if(twoWayUpdate)
+ {
+ PX_ASSERT(forceBufB);
+ updateParticleGroupPair_small_template<DensityPassType, TwoWayUpdateType>(
+ forceBufA, forceBufB, particlesA, particlesB, particleIndicesA, numParticlesA, particleIndicesB,
+ numParticlesB, params);
+ }
+ else
+ {
+ updateParticleGroupPair_small_template<DensityPassType, OneWayUpdateType>(
+ forceBufA, forceBufB, particlesA, particlesB, particleIndicesA, numParticlesA, particleIndicesB,
+ numParticlesB, params);
+ }
+ }
+ else
+ {
+ if(twoWayUpdate)
+ {
+ PX_ASSERT(forceBufB);
+ updateParticleGroupPair_small_template<ForcePassType, TwoWayUpdateType>(
+ forceBufA, forceBufB, particlesA, particlesB, particleIndicesA, numParticlesA, particleIndicesB,
+ numParticlesB, params);
+ }
+ else
+ {
+ updateParticleGroupPair_small_template<ForcePassType, OneWayUpdateType>(
+ forceBufA, forceBufB, particlesA, particlesB, particleIndicesA, numParticlesA, particleIndicesB,
+ numParticlesB, params);
+ }
+ }
+ }
+#if !PX_IOS
+ else
+ {
+ updateParticleGroupPair_simd_template(forceBufA, forceBufB, particlesA, particlesB, particleIndicesA,
+ numParticlesA, particleIndicesB, numParticlesB, params, isDensityMode,
+ twoWayUpdate, reinterpret_cast<Vec4V*>(tempSimdPositionBuffer),
+ tempIndexStream);
+ }
+#else
+ PX_UNUSED(tempSimdPositionBuffer);
+ PX_UNUSED(tempIndexStream);
+#endif
+}
+
+#endif // REFERENCE_KERNELS
+
+} // namespace Pt
+} // namespace physx
+
+#endif // PX_USE_PARTICLE_SYSTEM_API
+#endif // PT_DYNAMICS_KERNELS_H