diff options
| author | git perforce import user <a@b> | 2016-10-25 12:29:14 -0600 |
|---|---|---|
| committer | Sheikh Dawood Abdul Ajees <Sheikh Dawood Abdul Ajees> | 2016-10-25 18:56:37 -0500 |
| commit | 3dfe2108cfab31ba3ee5527e217d0d8e99a51162 (patch) | |
| tree | fa6485c169e50d7415a651bf838f5bcd0fd3bfbd /PhysX_3.4/Source/LowLevelCloth/src | |
| download | physx-3.4-3dfe2108cfab31ba3ee5527e217d0d8e99a51162.tar.xz physx-3.4-3dfe2108cfab31ba3ee5527e217d0d8e99a51162.zip | |
Initial commit:
PhysX 3.4.0 Update @ 21294896
APEX 1.4.0 Update @ 21275617
[CL 21300167]
Diffstat (limited to 'PhysX_3.4/Source/LowLevelCloth/src')
66 files changed, 17131 insertions, 0 deletions
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/Allocator.cpp b/PhysX_3.4/Source/LowLevelCloth/src/Allocator.cpp new file mode 100644 index 00000000..7a322ce9 --- /dev/null +++ b/PhysX_3.4/Source/LowLevelCloth/src/Allocator.cpp @@ -0,0 +1,46 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. 
All rights reserved. + +#include "Allocator.h" +#include "PsAlignedMalloc.h" + +namespace physx +{ + +void* cloth::allocate(size_t n) +{ + return n ? physx::shdfnd::getAllocator().allocate(n, "", __FILE__, __LINE__) : 0; +} + +void cloth::deallocate(void* ptr) +{ + if(ptr) + physx::shdfnd::getAllocator().deallocate(ptr); +} +} diff --git a/PhysX_3.4/Source/LowLevelCloth/src/Allocator.h b/PhysX_3.4/Source/LowLevelCloth/src/Allocator.h new file mode 100644 index 00000000..d99c1708 --- /dev/null +++ b/PhysX_3.4/Source/LowLevelCloth/src/Allocator.h @@ -0,0 +1,74 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. 
+// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#pragma once + +#include "Types.h" +#include "PsArray.h" +#include "PsAlignedMalloc.h" + +namespace physx +{ +namespace cloth +{ + +void* allocate(size_t); +void deallocate(void*); + +/* templated typedefs for convenience */ + +template <typename T> +struct Vector +{ + typedef physx::shdfnd::Array<T, physx::shdfnd::NonTrackingAllocator> Type; +}; + +template <typename T, size_t alignment> +struct AlignedVector +{ + typedef physx::shdfnd::Array<T, physx::shdfnd::AlignedAllocator<alignment> > Type; +}; + +struct UserAllocated +{ + virtual ~UserAllocated() + { + } + static void* operator new(size_t n) + { + return allocate(n); + } + static void operator delete(void* ptr) + { + deallocate(ptr); + } +}; + +} // namespace cloth +} diff --git a/PhysX_3.4/Source/LowLevelCloth/src/Array.h b/PhysX_3.4/Source/LowLevelCloth/src/Array.h new file mode 100644 index 00000000..75ba2f50 --- /dev/null +++ b/PhysX_3.4/Source/LowLevelCloth/src/Array.h @@ -0,0 +1,69 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". 
NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 
+ +#pragma once + +#include "foundation/PxVec4.h" +#include "foundation/PxQuat.h" +#include "foundation/PxVec3.h" + +namespace physx +{ + +namespace cloth +{ + +inline PxReal (&array(PxVec3& v))[3] +{ + return reinterpret_cast<PxReal(&)[3]>(v); +} +inline const PxReal (&array(const PxVec3& v))[3] +{ + return reinterpret_cast<const PxReal(&)[3]>(v); +} +inline PxReal (&array(PxVec4& v))[4] +{ + return reinterpret_cast<PxReal(&)[4]>(v); +} +inline const PxReal (&array(const PxVec4& v))[4] +{ + return reinterpret_cast<const PxReal(&)[4]>(v); +} +inline PxReal (&array(PxQuat& q))[4] +{ + return reinterpret_cast<PxReal(&)[4]>(q); +} +inline const PxReal (&array(const PxQuat& q))[4] +{ + return reinterpret_cast<const PxReal(&)[4]>(q); +} + +} // namespace cloth + +} // namespace physx diff --git a/PhysX_3.4/Source/LowLevelCloth/src/BoundingBox.h b/PhysX_3.4/Source/LowLevelCloth/src/BoundingBox.h new file mode 100644 index 00000000..bd33affa --- /dev/null +++ b/PhysX_3.4/Source/LowLevelCloth/src/BoundingBox.h @@ -0,0 +1,103 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. 
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#pragma once + +#include "Simd.h" + +namespace physx +{ + +namespace cloth +{ + +template <typename Simd4f> +struct BoundingBox +{ + Simd4f mLower; + Simd4f mUpper; +}; + +template <typename Simd4f> +inline BoundingBox<Simd4f> loadBounds(const float* ptr) +{ + BoundingBox<Simd4f> result; + result.mLower = load(ptr); + result.mUpper = load(ptr + 3); + return result; +} + +template <typename Simd4f> +inline BoundingBox<Simd4f> emptyBounds() +{ + BoundingBox<Simd4f> result; + + result.mLower = gSimd4fFloatMax; + result.mUpper = -result.mLower; + + return result; +} + +template <typename Simd4f> +inline BoundingBox<Simd4f> expandBounds(const BoundingBox<Simd4f>& bounds, const Simd4f* pIt, const Simd4f* pEnd) +{ + BoundingBox<Simd4f> result = bounds; + for(; pIt != pEnd; ++pIt) + { + result.mLower = min(result.mLower, *pIt); + result.mUpper = max(result.mUpper, *pIt); + } + return result; +} + +template <typename Simd4f> +inline BoundingBox<Simd4f> expandBounds(const BoundingBox<Simd4f>& a, const BoundingBox<Simd4f>& b) +{ + BoundingBox<Simd4f> result; + result.mLower = min(a.mLower, b.mLower); + result.mUpper = max(a.mUpper, b.mUpper); + return 
result; +} + +template <typename Simd4f> +inline BoundingBox<Simd4f> intersectBounds(const BoundingBox<Simd4f>& a, const BoundingBox<Simd4f>& b) +{ + BoundingBox<Simd4f> result; + result.mLower = max(a.mLower, b.mLower); + result.mUpper = min(a.mUpper, b.mUpper); + return result; +} + +template <typename Simd4f> +inline bool isEmptyBounds(const BoundingBox<Simd4f>& a) +{ + return anyGreater(a.mLower, a.mUpper) != 0; +} +} +} diff --git a/PhysX_3.4/Source/LowLevelCloth/src/ClothBase.h b/PhysX_3.4/Source/LowLevelCloth/src/ClothBase.h new file mode 100644 index 00000000..fef5090e --- /dev/null +++ b/PhysX_3.4/Source/LowLevelCloth/src/ClothBase.h @@ -0,0 +1,133 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. 
+// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#pragma once + +#include "PsMathUtils.h" + +namespace physx +{ +namespace cloth +{ + +/* helper functions shared between SwCloth and CuCloth */ + +template <typename Cloth> +void initialize(Cloth& cloth, const PxVec4* pIt, const PxVec4* pEnd) +{ + // initialize particles bounding box + PxVec4 lower(FLT_MAX), upper = -lower; + for(; pIt != pEnd; ++pIt) + { + lower = lower.minimum(*pIt); + upper = upper.maximum(*pIt); + } + PxVec4 center = (upper + lower) * 0.5f; + PxVec4 extent = (upper - lower) * 0.5f; + cloth.mParticleBoundsCenter = reinterpret_cast<const PxVec3&>(center); + cloth.mParticleBoundsHalfExtent = reinterpret_cast<const PxVec3&>(extent); + + cloth.mGravity = PxVec3(0.0f); + cloth.mLogDamping = PxVec3(0.0f); + cloth.mLinearLogDrag = PxVec3(0.0f); + cloth.mAngularLogDrag = PxVec3(0.0f); + cloth.mLinearInertia = PxVec3(1.0f); + cloth.mAngularInertia = PxVec3(1.0f); + cloth.mCentrifugalInertia = PxVec3(1.0f); + cloth.mSolverFrequency = 60.0f; + cloth.mStiffnessFrequency = 10.0f; + cloth.mTargetMotion = PxTransform(PxIdentity); + cloth.mCurrentMotion = PxTransform(PxIdentity); + cloth.mLinearVelocity = PxVec3(0.0f); + cloth.mAngularVelocity = PxVec3(0.0f); + cloth.mPrevIterDt = 0.0f; + cloth.mIterDtAvg = MovingAverage(30); + cloth.mTetherConstraintLogStiffness = PxReal(-FLT_MAX_EXP); + cloth.mTetherConstraintScale = 1.0f; + cloth.mMotionConstraintScale = 1.0f; + cloth.mMotionConstraintBias = 0.0f; + cloth.mMotionConstraintLogStiffness = PxReal(-FLT_MAX_EXP); + cloth.mWind = PxVec3(0.0f); + 
cloth.mDragLogCoefficient = 0.0f; + cloth.mLiftLogCoefficient = 0.0f; + cloth.mEnableContinuousCollision = false; + cloth.mCollisionMassScale = 0.0f; + cloth.mFriction = 0.0f; + cloth.mSelfCollisionDistance = 0.0f; + cloth.mSelfCollisionLogStiffness = PxReal(-FLT_MAX_EXP); + cloth.mSleepTestInterval = uint32_t(-1); + cloth.mSleepAfterCount = uint32_t(-1); + cloth.mSleepThreshold = 0.0f; + cloth.mSleepPassCounter = 0; + cloth.mSleepTestCounter = 0; +} + +template <typename DstCloth, typename SrcCloth> +void copy(DstCloth& dstCloth, const SrcCloth& srcCloth) +{ + dstCloth.mParticleBoundsCenter = srcCloth.mParticleBoundsCenter; + dstCloth.mParticleBoundsHalfExtent = srcCloth.mParticleBoundsHalfExtent; + dstCloth.mGravity = srcCloth.mGravity; + dstCloth.mLogDamping = srcCloth.mLogDamping; + dstCloth.mLinearLogDrag = srcCloth.mLinearLogDrag; + dstCloth.mAngularLogDrag = srcCloth.mAngularLogDrag; + dstCloth.mLinearInertia = srcCloth.mLinearInertia; + dstCloth.mAngularInertia = srcCloth.mAngularInertia; + dstCloth.mCentrifugalInertia = srcCloth.mCentrifugalInertia; + dstCloth.mSolverFrequency = srcCloth.mSolverFrequency; + dstCloth.mStiffnessFrequency = srcCloth.mStiffnessFrequency; + dstCloth.mTargetMotion = srcCloth.mTargetMotion; + dstCloth.mCurrentMotion = srcCloth.mCurrentMotion; + dstCloth.mLinearVelocity = srcCloth.mLinearVelocity; + dstCloth.mAngularVelocity = srcCloth.mAngularVelocity; + dstCloth.mPrevIterDt = srcCloth.mPrevIterDt; + dstCloth.mIterDtAvg = srcCloth.mIterDtAvg; + dstCloth.mTetherConstraintLogStiffness = srcCloth.mTetherConstraintLogStiffness; + dstCloth.mTetherConstraintScale = srcCloth.mTetherConstraintScale; + dstCloth.mMotionConstraintScale = srcCloth.mMotionConstraintScale; + dstCloth.mMotionConstraintBias = srcCloth.mMotionConstraintBias; + dstCloth.mMotionConstraintLogStiffness = srcCloth.mMotionConstraintLogStiffness; + dstCloth.mWind = srcCloth.mWind; + dstCloth.mDragLogCoefficient = srcCloth.mDragLogCoefficient; + 
dstCloth.mLiftLogCoefficient = srcCloth.mLiftLogCoefficient; + dstCloth.mEnableContinuousCollision = srcCloth.mEnableContinuousCollision; + dstCloth.mCollisionMassScale = srcCloth.mCollisionMassScale; + dstCloth.mFriction = srcCloth.mFriction; + dstCloth.mSelfCollisionDistance = srcCloth.mSelfCollisionDistance; + dstCloth.mSelfCollisionLogStiffness = srcCloth.mSelfCollisionLogStiffness; + dstCloth.mSleepTestInterval = srcCloth.mSleepTestInterval; + dstCloth.mSleepAfterCount = srcCloth.mSleepAfterCount; + dstCloth.mSleepThreshold = srcCloth.mSleepThreshold; + dstCloth.mSleepPassCounter = srcCloth.mSleepPassCounter; + dstCloth.mSleepTestCounter = srcCloth.mSleepTestCounter; + dstCloth.mUserData = srcCloth.mUserData; +} + +} // namespace cloth +} // namespace physx diff --git a/PhysX_3.4/Source/LowLevelCloth/src/ClothImpl.h b/PhysX_3.4/Source/LowLevelCloth/src/ClothImpl.h new file mode 100644 index 00000000..2cc491c5 --- /dev/null +++ b/PhysX_3.4/Source/LowLevelCloth/src/ClothImpl.h @@ -0,0 +1,1302 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. 
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#pragma once + +#include "Cloth.h" +#include "Fabric.h" +#include "Allocator.h" +#include "PsMathUtils.h" + +namespace physx +{ +namespace cloth +{ + +// SwCloth or CuCloth aggregate implementing the Cloth interface +// Member specializations are implemented in Sw/CuCloth.cpp +template <typename T> +class ClothImpl : public UserAllocated, public Cloth +{ + ClothImpl(const ClothImpl&); + + public: + ClothImpl& operator=(const ClothImpl&); + + typedef T ClothType; + typedef typename ClothType::FactoryType FactoryType; + typedef typename ClothType::FabricType FabricType; + typedef typename ClothType::ContextLockType ContextLockType; + + ClothImpl(Factory&, Fabric&, Range<const PxVec4>); + ClothImpl(Factory&, const ClothImpl&); + + virtual Cloth* clone(Factory& factory) const; + + virtual Fabric& getFabric() const; + virtual Factory& getFactory() const; + + virtual uint32_t getNumParticles() const; + virtual void lockParticles() const; + virtual void unlockParticles() const; + virtual MappedRange<PxVec4> getCurrentParticles(); + virtual MappedRange<const PxVec4> getCurrentParticles() const; + virtual MappedRange<PxVec4> getPreviousParticles(); + 
virtual MappedRange<const PxVec4> getPreviousParticles() const; + virtual GpuParticles getGpuParticles(); + + virtual void setTranslation(const PxVec3& trans); + virtual void setRotation(const PxQuat& rot); + + virtual const PxVec3& getTranslation() const; + virtual const PxQuat& getRotation() const; + + virtual void clearInertia(); + + virtual void teleport(const PxVec3& delta); + + virtual float getPreviousIterationDt() const; + virtual void setGravity(const PxVec3& gravity); + virtual PxVec3 getGravity() const; + virtual void setDamping(const PxVec3& damping); + virtual PxVec3 getDamping() const; + virtual void setLinearDrag(const PxVec3& drag); + virtual PxVec3 getLinearDrag() const; + virtual void setAngularDrag(const PxVec3& drag); + virtual PxVec3 getAngularDrag() const; + virtual void setLinearInertia(const PxVec3& inertia); + virtual PxVec3 getLinearInertia() const; + virtual void setAngularInertia(const PxVec3& inertia); + virtual PxVec3 getAngularInertia() const; + virtual void setCentrifugalInertia(const PxVec3& inertia); + virtual PxVec3 getCentrifugalInertia() const; + + virtual void setSolverFrequency(float frequency); + virtual float getSolverFrequency() const; + + virtual void setStiffnessFrequency(float frequency); + virtual float getStiffnessFrequency() const; + + virtual void setAcceleationFilterWidth(uint32_t); + virtual uint32_t getAccelerationFilterWidth() const; + + virtual void setPhaseConfig(Range<const PhaseConfig> configs); + + virtual void setSpheres(Range<const PxVec4>, uint32_t first, uint32_t last); + virtual uint32_t getNumSpheres() const; + + virtual void setCapsules(Range<const uint32_t>, uint32_t first, uint32_t last); + virtual uint32_t getNumCapsules() const; + + virtual void setPlanes(Range<const PxVec4>, uint32_t first, uint32_t last); + virtual uint32_t getNumPlanes() const; + + virtual void setConvexes(Range<const uint32_t>, uint32_t first, uint32_t last); + virtual uint32_t getNumConvexes() const; + + virtual void 
setTriangles(Range<const PxVec3>, uint32_t first, uint32_t last); + virtual void setTriangles(Range<const PxVec3>, Range<const PxVec3>, uint32_t first); + virtual uint32_t getNumTriangles() const; + + virtual bool isContinuousCollisionEnabled() const; + virtual void enableContinuousCollision(bool); + + virtual float getCollisionMassScale() const; + virtual void setCollisionMassScale(float); + virtual void setFriction(float friction); + virtual float getFriction() const; + + virtual void setVirtualParticles(Range<const uint32_t[4]>, Range<const PxVec3>); + virtual uint32_t getNumVirtualParticles() const; + virtual uint32_t getNumVirtualParticleWeights() const; + + virtual void setTetherConstraintScale(float scale); + virtual float getTetherConstraintScale() const; + virtual void setTetherConstraintStiffness(float stiffness); + virtual float getTetherConstraintStiffness() const; + + virtual Range<PxVec4> getMotionConstraints(); + virtual void clearMotionConstraints(); + virtual uint32_t getNumMotionConstraints() const; + virtual void setMotionConstraintScaleBias(float scale, float bias); + virtual float getMotionConstraintScale() const; + virtual float getMotionConstraintBias() const; + virtual void setMotionConstraintStiffness(float stiffness); + virtual float getMotionConstraintStiffness() const; + + virtual Range<PxVec4> getSeparationConstraints(); + virtual void clearSeparationConstraints(); + virtual uint32_t getNumSeparationConstraints() const; + + virtual void clearInterpolation(); + + virtual Range<PxVec4> getParticleAccelerations(); + virtual void clearParticleAccelerations(); + virtual uint32_t getNumParticleAccelerations() const; + + virtual void setWindVelocity(PxVec3); + virtual PxVec3 getWindVelocity() const; + virtual void setDragCoefficient(float); + virtual float getDragCoefficient() const; + virtual void setLiftCoefficient(float); + virtual float getLiftCoefficient() const; + + virtual void setSelfCollisionDistance(float); + virtual float 
getSelfCollisionDistance() const; + virtual void setSelfCollisionStiffness(float); + virtual float getSelfCollisionStiffness() const; + + virtual void setSelfCollisionIndices(Range<const uint32_t>); + virtual uint32_t getNumSelfCollisionIndices() const; + + virtual void setRestPositions(Range<const PxVec4>); + virtual uint32_t getNumRestPositions() const; + + virtual const PxVec3& getBoundingBoxCenter() const; + virtual const PxVec3& getBoundingBoxScale() const; + + virtual void setSleepThreshold(float); + virtual float getSleepThreshold() const; + virtual void setSleepTestInterval(uint32_t); + virtual uint32_t getSleepTestInterval() const; + virtual void setSleepAfterCount(uint32_t); + virtual uint32_t getSleepAfterCount() const; + virtual uint32_t getSleepPassCount() const; + virtual bool isAsleep() const; + virtual void putToSleep(); + virtual void wakeUp(); + + virtual void setUserData(void*); + virtual void* getUserData() const; + + // helper function + template <typename U> + MappedRange<U> getMappedParticles(U* data) const; + + ClothType mCloth; +}; + +class SwCloth; +typedef ClothImpl<SwCloth> SwClothImpl; + +class CuCloth; +typedef ClothImpl<CuCloth> CuClothImpl; + +class DxCloth; +typedef ClothImpl<DxCloth> DxClothImpl; + +template <typename T> +ClothImpl<T>::ClothImpl(Factory& factory, Fabric& fabric, Range<const PxVec4> particles) +: mCloth(static_cast<FactoryType&>(factory), static_cast<FabricType&>(fabric), particles) +{ + // fabric and cloth need to be created by the same factory + PX_ASSERT(&fabric.getFactory() == &factory); +} + +template <typename T> +ClothImpl<T>::ClothImpl(Factory& factory, const ClothImpl& impl) +: mCloth(static_cast<FactoryType&>(factory), impl.mCloth) +{ +} + +template <typename T> +inline Fabric& ClothImpl<T>::getFabric() const +{ + return mCloth.mFabric; +} + +template <typename T> +inline Factory& ClothImpl<T>::getFactory() const +{ + return mCloth.mFactory; +} + +template <typename T> +inline void 
ClothImpl<T>::setTranslation(const PxVec3& trans) +{ + PxVec3 t = reinterpret_cast<const PxVec3&>(trans); + if(t == mCloth.mTargetMotion.p) + return; + + mCloth.mTargetMotion.p = t; + mCloth.wakeUp(); +} + +template <typename T> +inline void ClothImpl<T>::setRotation(const PxQuat& q) +{ + if((q - mCloth.mTargetMotion.q).magnitudeSquared() == 0.0f) + return; + + mCloth.mTargetMotion.q = q; + mCloth.wakeUp(); +} + +template <typename T> +inline const PxVec3& ClothImpl<T>::getTranslation() const +{ + return mCloth.mTargetMotion.p; +} + +template <typename T> +inline const PxQuat& ClothImpl<T>::getRotation() const +{ + return mCloth.mTargetMotion.q; +} + +template <typename T> +inline void ClothImpl<T>::clearInertia() +{ + mCloth.mCurrentMotion = mCloth.mTargetMotion; + mCloth.mLinearVelocity = PxVec3(0.0f); + mCloth.mAngularVelocity = PxVec3(0.0f); + + mCloth.wakeUp(); +} + +// Fixed 4505:local function has been removed +template <typename T> +inline void ClothImpl<T>::teleport(const PxVec3& delta) +{ + mCloth.mCurrentMotion.p += delta; + mCloth.mTargetMotion.p += delta; +} + +template <typename T> +inline float ClothImpl<T>::getPreviousIterationDt() const +{ + return mCloth.mPrevIterDt; +} + +template <typename T> +inline void ClothImpl<T>::setGravity(const PxVec3& gravity) +{ + PxVec3 value = gravity; + if(value == mCloth.mGravity) + return; + + mCloth.mGravity = value; + mCloth.wakeUp(); +} + +template <typename T> +inline PxVec3 ClothImpl<T>::getGravity() const +{ + return mCloth.mGravity; +} + +inline float safeLog2(float x) +{ + return x ? 
shdfnd::log2(x) : -FLT_MAX_EXP; +} + +inline PxVec3 safeLog2(const PxVec3& v) +{ + return PxVec3(safeLog2(v.x), safeLog2(v.y), safeLog2(v.z)); +} + +inline float safeExp2(float x) +{ + if(x <= -FLT_MAX_EXP) + return 0.0f; + else + return shdfnd::exp2(x); +} + +inline PxVec3 safeExp2(const PxVec3& v) +{ + return PxVec3(safeExp2(v.x), safeExp2(v.y), safeExp2(v.z)); +} + +template <typename T> +inline void ClothImpl<T>::setDamping(const PxVec3& damping) +{ + PxVec3 value = safeLog2(PxVec3(1.f) - damping); + if(value == mCloth.mLogDamping) + return; + + mCloth.mLogDamping = value; + mCloth.wakeUp(); +} + +template <typename T> +inline PxVec3 ClothImpl<T>::getDamping() const +{ + return PxVec3(1.f) - safeExp2(mCloth.mLogDamping); +} + +template <typename T> +inline void ClothImpl<T>::setLinearDrag(const PxVec3& drag) +{ + PxVec3 value = safeLog2(PxVec3(1.f) - drag); + if(value == mCloth.mLinearLogDrag) + return; + + mCloth.mLinearLogDrag = value; + mCloth.wakeUp(); +} + +template <typename T> +inline PxVec3 ClothImpl<T>::getLinearDrag() const +{ + return PxVec3(1.f) - safeExp2(mCloth.mLinearLogDrag); +} + +template <typename T> +inline void ClothImpl<T>::setAngularDrag(const PxVec3& drag) +{ + PxVec3 value = safeLog2(PxVec3(1.f) - drag); + if(value == mCloth.mAngularLogDrag) + return; + + mCloth.mAngularLogDrag = value; + mCloth.wakeUp(); +} + +template <typename T> +inline PxVec3 ClothImpl<T>::getAngularDrag() const +{ + return PxVec3(1.f) - safeExp2(mCloth.mAngularLogDrag); +} + +template <typename T> +inline void ClothImpl<T>::setLinearInertia(const PxVec3& inertia) +{ + PxVec3 value = inertia; + if(value == mCloth.mLinearInertia) + return; + + mCloth.mLinearInertia = value; + mCloth.wakeUp(); +} + +template <typename T> +inline PxVec3 ClothImpl<T>::getLinearInertia() const +{ + return mCloth.mLinearInertia; +} + +template <typename T> +inline void ClothImpl<T>::setAngularInertia(const PxVec3& inertia) +{ + PxVec3 value = inertia; + if(value == 
namespace physx
{
namespace cloth
{

// Moves the subarray [first, last) of 'it' so it starts at index 'result'.
// Handles overlapping source/destination like memmove: copies backwards when
// shifting right, forwards when shifting left or staying put.
template <typename Iter>
void move(Iter it, uint32_t first, uint32_t last, uint32_t result)
{
	if(result > first)
	{
		// shifting right: walk from the back so the source isn't clobbered
		result += last - first;
		while(last > first)
			it[--result] = it[--last];
	}
	else
	{
		// shifting left (or no-op): forward copy is safe
		while(first < last)
			it[result++] = it[first++];
	}
}

} // namespace cloth
} // namespace physx
int32_t(first); +} + +template <typename T> +inline void ClothImpl<T>::setSpheres(Range<const PxVec4> spheres, uint32_t first, uint32_t last) +{ + uint32_t oldSize = uint32_t(mCloth.mStartCollisionSpheres.size()); + uint32_t newSize = uint32_t(spheres.size()) + oldSize - last + first; + + PX_ASSERT(newSize <= 32); + PX_ASSERT(first <= oldSize); + PX_ASSERT(last <= oldSize); + +#if PX_DEBUG + for(const PxVec4* it = spheres.begin(); it < spheres.end(); ++it) + PX_ASSERT(it->w >= 0.0f); +#endif + + if(!oldSize && !newSize) + return; + + if(!oldSize) + { + ContextLockType contextLock(mCloth.mFactory); + mCloth.mStartCollisionSpheres.assign(spheres.begin(), spheres.end()); + mCloth.notifyChanged(); + } + else + { + if(PxMax(oldSize, newSize) > + PxMin(mCloth.mStartCollisionSpheres.capacity(), mCloth.mTargetCollisionSpheres.capacity())) + { + ContextLockType contextLock(mCloth.mFactory); + mCloth.mStartCollisionSpheres.reserve(newSize); + mCloth.mTargetCollisionSpheres.reserve(PxMax(oldSize, newSize)); + } + + typename T::MappedVec4fVectorType start = mCloth.mStartCollisionSpheres; + typename T::MappedVec4fVectorType target = mCloth.mTargetCollisionSpheres; + + // fill target from start + for(uint32_t i = target.size(); i < oldSize; ++i) + target.pushBack(start[i]); + + // resize to larger of oldSize and newSize + start.resize(PxMax(oldSize, newSize), PxVec4(0.0f)); + target.resize(PxMax(oldSize, newSize), PxVec4(0.0f)); + + if(int32_t delta = int32_t(newSize - oldSize)) + { + // move past-range elements to new place + move(start.begin(), last, oldSize, last + delta); + move(target.begin(), last, oldSize, last + delta); + + // fill new elements from spheres + for(uint32_t i = last; i < last + delta; ++i) + start[i] = spheres[i - first]; + + // adjust capsule indices + typename T::MappedIndexVectorType indices = mCloth.mCapsuleIndices; + Vector<IndexPair>::Type::Iterator cIt, cEnd = indices.end(); + for(cIt = indices.begin(); cIt != cEnd;) + { + bool removed = false; + 
removed |= updateIndex(cIt->first, last + PxMin(0, delta), int32_t(delta)); + removed |= updateIndex(cIt->second, last + PxMin(0, delta), int32_t(delta)); + if(!removed) + ++cIt; + else + { + indices.replaceWithLast(cIt); + cEnd = indices.end(); + } + } + + start.resize(newSize); + target.resize(newSize); + + mCloth.notifyChanged(); + } + + // fill target elements with spheres + for(uint32_t i = 0; i < spheres.size(); ++i) + target[first + i] = spheres[i]; + } + + mCloth.wakeUp(); +} + +template <typename T> +inline uint32_t ClothImpl<T>::getNumSpheres() const +{ + return uint32_t(mCloth.mStartCollisionSpheres.size()); +} + +// Fixed 4505:local function has been removed +template <typename T> +inline void ClothImpl<T>::setCapsules(Range<const uint32_t> capsules, uint32_t first, uint32_t last) +{ + uint32_t oldSize = mCloth.mCapsuleIndices.size(); + uint32_t newSize = uint32_t(capsules.size() / 2) + oldSize - last + first; + + PX_ASSERT(newSize <= 32); + PX_ASSERT(first <= oldSize); + PX_ASSERT(last <= oldSize); + + const IndexPair* srcIndices = reinterpret_cast<const IndexPair*>(capsules.begin()); + + if(mCloth.mCapsuleIndices.capacity() < newSize) + { + ContextLockType contextLock(mCloth.mFactory); + mCloth.mCapsuleIndices.reserve(newSize); + } + + // resize to larger of oldSize and newSize + mCloth.mCapsuleIndices.resize(PxMax(oldSize, newSize)); + + typename T::MappedIndexVectorType dstIndices = mCloth.mCapsuleIndices; + + if(uint32_t delta = newSize - oldSize) + { + // move past-range elements to new place + move(dstIndices.begin(), last, oldSize, last + delta); + + // fill new elements from capsules + for(uint32_t i = last; i < last + delta; ++i) + dstIndices[i] = srcIndices[i - first]; + + dstIndices.resize(newSize); + mCloth.notifyChanged(); + } + + // fill existing elements from capsules + for(uint32_t i = first; i < last; ++i) + dstIndices[i] = srcIndices[i - first]; + + mCloth.wakeUp(); +} + +template <typename T> +inline uint32_t 
ClothImpl<T>::getNumCapsules() const +{ + return uint32_t(mCloth.mCapsuleIndices.size()); +} + +template <typename T> +inline void ClothImpl<T>::setPlanes(Range<const PxVec4> planes, uint32_t first, uint32_t last) +{ + uint32_t oldSize = uint32_t(mCloth.mStartCollisionPlanes.size()); + uint32_t newSize = uint32_t(planes.size()) + oldSize - last + first; + + PX_ASSERT(newSize <= 32); + PX_ASSERT(first <= oldSize); + PX_ASSERT(last <= oldSize); + + if(!oldSize && !newSize) + return; + + if(!oldSize) + { + ContextLockType contextLock(mCloth.mFactory); + mCloth.mStartCollisionPlanes.assign(planes.begin(), planes.end()); + mCloth.notifyChanged(); + } + else + { + if(PxMax(oldSize, newSize) > + PxMin(mCloth.mStartCollisionPlanes.capacity(), mCloth.mTargetCollisionPlanes.capacity())) + { + ContextLockType contextLock(mCloth.mFactory); + mCloth.mStartCollisionPlanes.reserve(newSize); + mCloth.mTargetCollisionPlanes.reserve(PxMax(oldSize, newSize)); + } + + // fill target from start + for(uint32_t i = mCloth.mTargetCollisionPlanes.size(); i < oldSize; ++i) + mCloth.mTargetCollisionPlanes.pushBack(mCloth.mStartCollisionPlanes[i]); + + // resize to larger of oldSize and newSize + mCloth.mStartCollisionPlanes.resize(PxMax(oldSize, newSize), PxZero); + mCloth.mTargetCollisionPlanes.resize(PxMax(oldSize, newSize), PxZero); + + if(int32_t delta = int32_t(newSize - oldSize)) + { + // move past-range elements to new place + move(mCloth.mStartCollisionPlanes.begin(), last, oldSize, last + delta); + move(mCloth.mTargetCollisionPlanes.begin(), last, oldSize, last + delta); + + // fill new elements from planes + for(uint32_t i = last; i < last + delta; ++i) + mCloth.mStartCollisionPlanes[i] = planes[i - first]; + + // adjust convex indices + uint32_t mask = (uint32_t(1) << (last + PxMin(delta, 0))) - 1; + Vector<uint32_t>::Type::Iterator cIt, cEnd = mCloth.mConvexMasks.end(); + for(cIt = mCloth.mConvexMasks.begin(); cIt != cEnd;) + { + uint32_t convex = (*cIt & mask); + if(delta < 0) + 
convex |= *cIt >> -delta & ~mask; + else + convex |= (*cIt & ~mask) << delta; + if(convex) + *cIt++ = convex; + else + { + mCloth.mConvexMasks.replaceWithLast(cIt); + cEnd = mCloth.mConvexMasks.end(); + } + } + + mCloth.mStartCollisionPlanes.resize(newSize); + mCloth.mTargetCollisionPlanes.resize(newSize); + + mCloth.notifyChanged(); + } + + // fill target elements with planes + for(uint32_t i = 0; i < planes.size(); ++i) + mCloth.mTargetCollisionPlanes[first + i] = planes[i]; + } + + mCloth.wakeUp(); +} + +template <typename T> +inline uint32_t ClothImpl<T>::getNumPlanes() const +{ + return uint32_t(mCloth.mStartCollisionPlanes.size()); +} + +template <typename T> +inline void ClothImpl<T>::setConvexes(Range<const uint32_t> convexes, uint32_t first, uint32_t last) +{ + uint32_t oldSize = mCloth.mConvexMasks.size(); + uint32_t newSize = uint32_t(convexes.size()) + oldSize - last + first; + + PX_ASSERT(newSize <= 32); + PX_ASSERT(first <= oldSize); + PX_ASSERT(last <= oldSize); + + if(mCloth.mConvexMasks.capacity() < newSize) + { + ContextLockType contextLock(mCloth.mFactory); + mCloth.mConvexMasks.reserve(newSize); + } + + // resize to larger of oldSize and newSize + mCloth.mConvexMasks.resize(PxMax(oldSize, newSize)); + + if(uint32_t delta = newSize - oldSize) + { + // move past-range elements to new place + move(mCloth.mConvexMasks.begin(), last, oldSize, last + delta); + + // fill new elements from capsules + for(uint32_t i = last; i < last + delta; ++i) + mCloth.mConvexMasks[i] = convexes[i - first]; + + mCloth.mConvexMasks.resize(newSize); + mCloth.notifyChanged(); + } + + mCloth.wakeUp(); +} + +template <typename T> +inline uint32_t ClothImpl<T>::getNumConvexes() const +{ + return uint32_t(mCloth.mConvexMasks.size()); +} + +template <typename T> +inline void ClothImpl<T>::setTriangles(Range<const PxVec3> triangles, uint32_t first, uint32_t last) +{ + // convert from triangle to vertex count + first *= 3; + last *= 3; + + triangles = 
mCloth.clampTriangleCount(triangles, last - first); + PX_ASSERT(0 == triangles.size() % 3); + + uint32_t oldSize = uint32_t(mCloth.mStartCollisionTriangles.size()); + uint32_t newSize = uint32_t(triangles.size()) + oldSize - last + first; + + PX_ASSERT(first <= oldSize); + PX_ASSERT(last <= oldSize); + + if(!oldSize && !newSize) + return; + + if(!oldSize) + { + ContextLockType contextLock(mCloth.mFactory); + mCloth.mStartCollisionTriangles.assign(triangles.begin(), triangles.end()); + mCloth.notifyChanged(); + } + else + { + if(PxMax(oldSize, newSize) > + PxMin(mCloth.mStartCollisionTriangles.capacity(), mCloth.mTargetCollisionTriangles.capacity())) + { + ContextLockType contextLock(mCloth.mFactory); + mCloth.mStartCollisionTriangles.reserve(newSize); + mCloth.mTargetCollisionTriangles.reserve(PxMax(oldSize, newSize)); + } + + // fill target from start + for(uint32_t i = mCloth.mTargetCollisionTriangles.size(); i < oldSize; ++i) + mCloth.mTargetCollisionTriangles.pushBack(mCloth.mStartCollisionTriangles[i]); + + // resize to larger of oldSize and newSize + mCloth.mStartCollisionTriangles.resize(PxMax(oldSize, newSize)); + mCloth.mTargetCollisionTriangles.resize(PxMax(oldSize, newSize)); + + if(uint32_t delta = newSize - oldSize) + { + // move past-range elements to new place + move(mCloth.mStartCollisionTriangles.begin(), last, oldSize, last + delta); + move(mCloth.mTargetCollisionTriangles.begin(), last, oldSize, last + delta); + + // fill new elements from triangles + for(uint32_t i = last; i < last + delta; ++i) + mCloth.mStartCollisionTriangles[i] = triangles[i - first]; + + mCloth.mStartCollisionTriangles.resize(newSize); + mCloth.mTargetCollisionTriangles.resize(newSize); + + mCloth.notifyChanged(); + } + + // fill target elements with triangles + for(uint32_t i = 0; i < triangles.size(); ++i) + mCloth.mTargetCollisionTriangles[first + i] = triangles[i]; + } + + mCloth.wakeUp(); +} + +template <typename T> +inline void ClothImpl<T>::setTriangles(Range<const 
PxVec3> startTriangles, Range<const PxVec3> targetTriangles, + uint32_t first) +{ + PX_ASSERT(startTriangles.size() == targetTriangles.size()); + + // convert from triangle to vertex count + first *= 3; + + uint32_t last = uint32_t(mCloth.mStartCollisionTriangles.size()); + + startTriangles = mCloth.clampTriangleCount(startTriangles, last - first); + targetTriangles = mCloth.clampTriangleCount(targetTriangles, last - first); + + uint32_t oldSize = uint32_t(mCloth.mStartCollisionTriangles.size()); + uint32_t newSize = uint32_t(startTriangles.size()) + oldSize - last + first; + + PX_ASSERT(first <= oldSize); + PX_ASSERT(last == oldSize); // this path only supports replacing the tail + + if(!oldSize && !newSize) + return; + + if(newSize > PxMin(mCloth.mStartCollisionTriangles.capacity(), mCloth.mTargetCollisionTriangles.capacity())) + { + ContextLockType contextLock(mCloth.mFactory); + mCloth.mStartCollisionTriangles.reserve(newSize); + mCloth.mTargetCollisionTriangles.reserve(newSize); + } + + uint32_t retainSize = oldSize - last + first; + mCloth.mStartCollisionTriangles.resize(retainSize); + mCloth.mTargetCollisionTriangles.resize(retainSize); + + for(uint32_t i = 0, n = startTriangles.size(); i < n; ++i) + { + mCloth.mStartCollisionTriangles.pushBack(startTriangles[i]); + mCloth.mTargetCollisionTriangles.pushBack(targetTriangles[i]); + } + + if(newSize - oldSize) + mCloth.notifyChanged(); + + mCloth.wakeUp(); +} + +template <typename T> +inline uint32_t ClothImpl<T>::getNumTriangles() const +{ + return uint32_t(mCloth.mStartCollisionTriangles.size()) / 3; +} + +template <typename T> +inline bool ClothImpl<T>::isContinuousCollisionEnabled() const +{ + return mCloth.mEnableContinuousCollision; +} + +template <typename T> +inline void ClothImpl<T>::enableContinuousCollision(bool enable) +{ + if(enable == mCloth.mEnableContinuousCollision) + return; + + mCloth.mEnableContinuousCollision = enable; + mCloth.notifyChanged(); + mCloth.wakeUp(); +} + +template <typename T> 
+inline float ClothImpl<T>::getCollisionMassScale() const +{ + return mCloth.mCollisionMassScale; +} + +template <typename T> +inline void ClothImpl<T>::setCollisionMassScale(float scale) +{ + if(scale == mCloth.mCollisionMassScale) + return; + + mCloth.mCollisionMassScale = scale; + mCloth.notifyChanged(); + mCloth.wakeUp(); +} + +template <typename T> +inline void ClothImpl<T>::setFriction(float friction) +{ + mCloth.mFriction = friction; + mCloth.wakeUp(); +} + +template <typename T> +inline float ClothImpl<T>::getFriction() const +{ + return mCloth.mFriction; +} + +template <typename T> +inline uint32_t ClothImpl<T>::getNumVirtualParticleWeights() const +{ + return uint32_t(mCloth.mVirtualParticleWeights.size()); +} + +template <typename T> +inline void ClothImpl<T>::setTetherConstraintScale(float scale) +{ + if(scale == mCloth.mTetherConstraintScale) + return; + + mCloth.mTetherConstraintScale = scale; + mCloth.notifyChanged(); + mCloth.wakeUp(); +} + +template <typename T> +inline float ClothImpl<T>::getTetherConstraintScale() const +{ + return mCloth.mTetherConstraintScale; +} + +template <typename T> +inline void ClothImpl<T>::setTetherConstraintStiffness(float stiffness) +{ + float value = safeLog2(1 - stiffness); + if(value == mCloth.mTetherConstraintLogStiffness) + return; + + mCloth.mTetherConstraintLogStiffness = value; + mCloth.notifyChanged(); + mCloth.wakeUp(); +} + +template <typename T> +inline float ClothImpl<T>::getTetherConstraintStiffness() const +{ + return 1 - safeExp2(mCloth.mTetherConstraintLogStiffness); +} + +template <typename T> +inline Range<PxVec4> ClothImpl<T>::getMotionConstraints() +{ + mCloth.wakeUp(); + return mCloth.push(mCloth.mMotionConstraints); +} + +template <typename T> +inline void ClothImpl<T>::clearMotionConstraints() +{ + mCloth.clear(mCloth.mMotionConstraints); + mCloth.wakeUp(); +} + +template <typename T> +inline uint32_t ClothImpl<T>::getNumMotionConstraints() const +{ + return 
uint32_t(mCloth.mMotionConstraints.mStart.size()); +} + +template <typename T> +inline void ClothImpl<T>::setMotionConstraintScaleBias(float scale, float bias) +{ + if(scale == mCloth.mMotionConstraintScale && bias == mCloth.mMotionConstraintBias) + return; + + mCloth.mMotionConstraintScale = scale; + mCloth.mMotionConstraintBias = bias; + mCloth.notifyChanged(); + mCloth.wakeUp(); +} + +template <typename T> +inline float ClothImpl<T>::getMotionConstraintScale() const +{ + return mCloth.mMotionConstraintScale; +} + +template <typename T> +inline float ClothImpl<T>::getMotionConstraintBias() const +{ + return mCloth.mMotionConstraintBias; +} + +template <typename T> +inline void ClothImpl<T>::setMotionConstraintStiffness(float stiffness) +{ + float value = safeLog2(1 - stiffness); + if(value == mCloth.mMotionConstraintLogStiffness) + return; + + mCloth.mMotionConstraintLogStiffness = value; + mCloth.notifyChanged(); + mCloth.wakeUp(); +} + +template <typename T> +inline float ClothImpl<T>::getMotionConstraintStiffness() const +{ + return 1 - safeExp2(mCloth.mMotionConstraintLogStiffness); +} + +template <typename T> +inline Range<PxVec4> ClothImpl<T>::getSeparationConstraints() +{ + mCloth.wakeUp(); + return mCloth.push(mCloth.mSeparationConstraints); +} + +template <typename T> +inline void ClothImpl<T>::clearSeparationConstraints() +{ + mCloth.clear(mCloth.mSeparationConstraints); + mCloth.wakeUp(); +} + +template <typename T> +inline void ClothImpl<T>::clearInterpolation() +{ + if(!mCloth.mTargetCollisionSpheres.empty()) + { + physx::shdfnd::swap(mCloth.mStartCollisionSpheres, mCloth.mTargetCollisionSpheres); + mCloth.mTargetCollisionSpheres.resize(0); + } + mCloth.mMotionConstraints.pop(); + mCloth.mSeparationConstraints.pop(); + mCloth.wakeUp(); +} + +template <typename T> +inline uint32_t ClothImpl<T>::getNumSeparationConstraints() const +{ + return uint32_t(mCloth.mSeparationConstraints.mStart.size()); +} + +template <typename T> +inline uint32_t 
ClothImpl<T>::getNumParticleAccelerations() const +{ + return uint32_t(mCloth.mParticleAccelerations.size()); +} + +template <typename T> +inline void ClothImpl<T>::setWindVelocity(PxVec3 wind) +{ + if(wind == mCloth.mWind) + return; + + mCloth.mWind = wind; + mCloth.notifyChanged(); + mCloth.wakeUp(); +} + +template <typename T> +inline PxVec3 ClothImpl<T>::getWindVelocity() const +{ + return mCloth.mWind; +} + +template <typename T> +inline void ClothImpl<T>::setDragCoefficient(float coefficient) +{ + float value = safeLog2(1 - coefficient); + if(value == mCloth.mDragLogCoefficient) + return; + + mCloth.mDragLogCoefficient = value; + mCloth.notifyChanged(); + mCloth.wakeUp(); +} + +template <typename T> +inline float ClothImpl<T>::getDragCoefficient() const +{ + return 1 - safeExp2(mCloth.mDragLogCoefficient); +} + +template <typename T> +inline void ClothImpl<T>::setLiftCoefficient(float coefficient) +{ + float value = safeLog2(1 - coefficient); + if(value == mCloth.mLiftLogCoefficient) + return; + + mCloth.mLiftLogCoefficient = value; + mCloth.notifyChanged(); + mCloth.wakeUp(); +} + +template <typename T> +inline float ClothImpl<T>::getLiftCoefficient() const +{ + return 1 - safeExp2(mCloth.mLiftLogCoefficient); +} + +template <typename T> +inline uint32_t ClothImpl<T>::getNumSelfCollisionIndices() const +{ + return uint32_t(mCloth.mSelfCollisionIndices.size()); +} + +// Fixed 4505:local function has been removed +template <typename T> +inline void ClothImpl<T>::setRestPositions(Range<const PxVec4> restPositions) +{ + PX_ASSERT(restPositions.empty() || restPositions.size() == getNumParticles()); + ContextLockType contextLock(mCloth.mFactory); + mCloth.mRestPositions.assign(restPositions.begin(), restPositions.end()); + mCloth.wakeUp(); +} + +template <typename T> +inline uint32_t ClothImpl<T>::getNumRestPositions() const +{ + return uint32_t(mCloth.mRestPositions.size()); +} + +template <typename T> +inline void ClothImpl<T>::setSelfCollisionDistance(float 
distance) +{ + if(distance == mCloth.mSelfCollisionDistance) + return; + + mCloth.mSelfCollisionDistance = distance; + mCloth.notifyChanged(); + mCloth.wakeUp(); +} + +template <typename T> +inline float ClothImpl<T>::getSelfCollisionDistance() const +{ + return mCloth.mSelfCollisionDistance; +} + +template <typename T> +inline void ClothImpl<T>::setSelfCollisionStiffness(float stiffness) +{ + float value = safeLog2(1 - stiffness); + if(value == mCloth.mSelfCollisionLogStiffness) + return; + + mCloth.mSelfCollisionLogStiffness = value; + mCloth.notifyChanged(); + mCloth.wakeUp(); +} + +template <typename T> +inline float ClothImpl<T>::getSelfCollisionStiffness() const +{ + return 1 - safeExp2(mCloth.mSelfCollisionLogStiffness); +} + +template <typename T> +inline const PxVec3& ClothImpl<T>::getBoundingBoxCenter() const +{ + return mCloth.mParticleBoundsCenter; +} + +template <typename T> +inline const PxVec3& ClothImpl<T>::getBoundingBoxScale() const +{ + return mCloth.mParticleBoundsHalfExtent; +} + +template <typename T> +inline void ClothImpl<T>::setSleepThreshold(float threshold) +{ + if(threshold == mCloth.mSleepThreshold) + return; + + mCloth.mSleepThreshold = threshold; + mCloth.notifyChanged(); + mCloth.wakeUp(); +} + +template <typename T> +inline float ClothImpl<T>::getSleepThreshold() const +{ + return mCloth.mSleepThreshold; +} + +template <typename T> +inline void ClothImpl<T>::setSleepTestInterval(uint32_t interval) +{ + if(interval == mCloth.mSleepTestInterval) + return; + + mCloth.mSleepTestInterval = interval; + mCloth.notifyChanged(); + mCloth.wakeUp(); +} + +template <typename T> +inline uint32_t ClothImpl<T>::getSleepTestInterval() const +{ + return mCloth.mSleepTestInterval; +} + +template <typename T> +inline void ClothImpl<T>::setSleepAfterCount(uint32_t afterCount) +{ + if(afterCount == mCloth.mSleepAfterCount) + return; + + mCloth.mSleepAfterCount = afterCount; + mCloth.notifyChanged(); + mCloth.wakeUp(); +} + +template <typename T> +inline 
uint32_t ClothImpl<T>::getSleepAfterCount() const +{ + return mCloth.mSleepAfterCount; +} + +template <typename T> +inline uint32_t ClothImpl<T>::getSleepPassCount() const +{ + return mCloth.mSleepPassCounter; +} + +template <typename T> +inline bool ClothImpl<T>::isAsleep() const +{ + return mCloth.isSleeping(); +} + +template <typename T> +inline void ClothImpl<T>::putToSleep() +{ + mCloth.mSleepPassCounter = mCloth.mSleepAfterCount; +} + +template <typename T> +inline void ClothImpl<T>::wakeUp() +{ + mCloth.wakeUp(); +} + +template <typename T> +inline void ClothImpl<T>::setUserData(void* data) +{ + mCloth.mUserData = data; +} + +template <typename T> +inline void* ClothImpl<T>::getUserData() const +{ + return mCloth.mUserData; +} + +template <typename T> +template <typename U> +inline MappedRange<U> ClothImpl<T>::getMappedParticles(U* data) const +{ + return MappedRange<U>(data, data + getNumParticles(), *this, &Cloth::lockParticles, &Cloth::unlockParticles); +} + +} // namespace cloth + +} // namespace physx diff --git a/PhysX_3.4/Source/LowLevelCloth/src/Factory.cpp b/PhysX_3.4/Source/LowLevelCloth/src/Factory.cpp new file mode 100644 index 00000000..834093fa --- /dev/null +++ b/PhysX_3.4/Source/LowLevelCloth/src/Factory.cpp @@ -0,0 +1,71 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". 
NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 
+ +#include "SwFactory.h" +#include "PxPhysXConfig.h" + +// Factory.cpp gets included in both PhysXGPU and LowLevelCloth projects +// CuFactory can only be created in PhysXGPU project +#if defined(PX_PHYSX_GPU_EXPORTS) || PX_XBOXONE +#define ENABLE_CUFACTORY PX_SUPPORT_GPU_PHYSX +#else +#define ENABLE_CUFACTORY 0 +#endif + +#if ENABLE_CUFACTORY +#include "CuFactory.h" +#endif + +namespace physx +{ +namespace cloth +{ +uint32_t getNextFabricId() +{ + static uint32_t sNextFabricId = 0; + return sNextFabricId++; +} +} +} + +using namespace physx; + +cloth::Factory* cloth::Factory::createFactory(Platform platform, void* contextManager) +{ + PX_UNUSED(contextManager); + + if(platform == Factory::CPU) + return new SwFactory; + +#if ENABLE_CUFACTORY + if(platform == Factory::CUDA) + return new CuFactory((physx::PxCudaContextManager*)contextManager); +#endif + return 0; +} diff --git a/PhysX_3.4/Source/LowLevelCloth/src/IndexPair.h b/PhysX_3.4/Source/LowLevelCloth/src/IndexPair.h new file mode 100644 index 00000000..78f153b1 --- /dev/null +++ b/PhysX_3.4/Source/LowLevelCloth/src/IndexPair.h @@ -0,0 +1,46 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. 
//
// Information and code furnished is believed to be accurate and reliable.
// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
// information or for any infringement of patents or other rights of third parties that may
// result from its use. No license is granted by implication or otherwise under any patent
// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
// This code supersedes and replaces all information previously supplied.
// NVIDIA Corporation products are not authorized for use as critical
// components in life support devices or systems without express written approval of
// NVIDIA Corporation.
//
// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.

#pragma once

#include "Types.h"

namespace physx
{
namespace cloth
{

// A pair of indices into the cloth's collision-sphere array; each pair defines
// one capsule. ClothImpl::setCapsules() reinterprets a flat uint32_t range as
// IndexPair values, and ClothImpl::setSpheres() remaps (or removes) pairs when
// spheres are inserted or deleted.
struct IndexPair
{
	uint32_t first;  // index of the capsule's first sphere
	uint32_t second; // index of the capsule's second sphere
};

} // namespace cloth
} // namespace physx
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/IterationState.h b/PhysX_3.4/Source/LowLevelCloth/src/IterationState.h new file mode 100644 index 00000000..c9ad9293 --- /dev/null +++ b/PhysX_3.4/Source/LowLevelCloth/src/IterationState.h @@ -0,0 +1,403 @@
// This code contains NVIDIA Confidential Information and is disclosed to you
// under a form of NVIDIA software license agreement provided separately to you.
//
// Notice
// NVIDIA Corporation and its licensors retain all intellectual property and
// proprietary rights in and to this software and related documentation and
// any modifications thereto. Any use, reproduction, disclosure, or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA Corporation is strictly prohibited.
//
// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.".
// NVIDIA MAKES
// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
//
// Information and code furnished is believed to be accurate and reliable.
// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
// information or for any infringement of patents or other rights of third parties that may
// result from its use. No license is granted by implication or otherwise under any patent
// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
// This code supersedes and replaces all information previously supplied.
// NVIDIA Corporation products are not authorized for use as critical
// components in life support devices or systems without express written approval of
// NVIDIA Corporation.
//
// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.

#pragma once

#include "foundation/PxTransform.h"
#include "foundation/PxMat44.h"
#include "Types.h"
#include "Array.h"
#include "Simd.h"
#include "PsMathUtils.h"

namespace physx
{

/* function object to perform solver iterations on one cloth */

// todo: performance optimization: cache this object and test if velocity/iterDt has changed
// c'tor takes about 5% of the iteration time of a 20x20 cloth

namespace cloth
{

/* helper functions */

// squared value
template <typename T>
T sqr(const T& x)
{
	return x * x;
}

// Rotation-vector logarithm of a unit quaternion: returns axis * half-angle
// (inverse of exp() below, so exp(log(q)) == q for unit q). The fsel flips the
// sign when q.w < 0 so the shorter of the two equivalent arcs is used.
inline PxVec3 log(const PxQuat& q)
{
	float theta = q.getImaginaryPart().magnitude();
	float scale = theta > PX_EPS_REAL ? PxAsin(theta) / theta : 1.0f;
	scale = intrinsics::fsel(q.w, scale, -scale);
	return PxVec3(q.x * scale, q.y * scale, q.z * scale);
}

// Rotation-vector exponential: maps v (axis * half-angle) back to a unit quaternion.
inline PxQuat exp(const PxVec3& v)
{
	float theta = v.magnitude();
	float scale = theta > PX_EPS_REAL ? PxSin(theta) / theta : 1.0f;
	return PxQuat(v.x * scale, v.y * scale, v.z * scale, PxCos(theta));
}

// Load the first N columns of a PxMat44 into an array of SIMD registers.
template <typename Simd4f, uint32_t N>
inline void assign(Simd4f (&columns)[N], const PxMat44& matrix)
{
	for(uint32_t i = 0; i < N; ++i)
		columns[i] = load(array(matrix[i]));
}

// Apply the 3x3 linear transform stored as SIMD columns to vec (uses x,y,z of vec).
template <typename Simd4f>
inline Simd4f transform(const Simd4f (&columns)[3], const Simd4f& vec)
{
	return splat<0>(vec) * columns[0] + splat<1>(vec) * columns[1] + splat<2>(vec) * columns[2];
}

// Affine version: translate plus the linear transform of vec.
template <typename Simd4f>
inline Simd4f transform(const Simd4f (&columns)[3], const Simd4f& translate, const Simd4f& vec)
{
	return translate + splat<0>(vec) * columns[0] + splat<1>(vec) * columns[1] + splat<2>(vec) * columns[2];
}

template <typename>
struct IterationState; // forward declaration

// Computes per-frame iteration constants (iteration count, per-iteration dt,
// frame motion) from a cloth and the frame delta time. NOTE: the constructor
// also advances the cloth's per-frame state - velocities, previous/averaged
// iteration dt, and current motion - so construct it exactly once per frame.
struct IterationStateFactory
{
	template <typename MyCloth>
	IterationStateFactory(MyCloth& cloth, float frameDt);

	template <typename Simd4f, typename MyCloth>
	IterationState<Simd4f> create(MyCloth const& cloth) const;

	// |v|^2 over the first three lanes
	template <typename Simd4f>
	static Simd4f lengthSqr(Simd4f const& v)
	{
		return dot3(v, v);
	}

	// read the first three SIMD lanes as a PxVec3
	template <typename Simd4f>
	static PxVec3 castToPxVec3(const Simd4f& v)
	{
		return *reinterpret_cast<const PxVec3*>(reinterpret_cast<const char*>(&v));
	}

	int mNumIterations;                        // solver iterations this frame (>= 1)
	float mInvNumIterations;                   // 1 / mNumIterations
	float mIterDt, mIterDtRatio, mIterDtAverage; // per-iteration dt, ratio to previous frame, filtered average
	PxQuat mCurrentRotation;                   // frame rotation at the start of this frame
	PxVec3 mPrevLinearVelocity;                // frame velocities of the previous frame
	PxVec3 mPrevAngularVelocity;
};

/* solver iterations helper functor */
// Per-iteration solver state for one cloth: local-space biases, wind step,
// inter-frame rotation matrices, and the iteration countdown.
template <typename Simd4f>
struct IterationState
{
	// call after each iteration
	void update();

	inline float getCurrentAlpha() const;
	inline float getPreviousAlpha() const;

  public:
	Simd4f mRotationMatrix[3]; // should rename to 'mRotation'

	Simd4f mCurBias;  // in local space
	Simd4f mPrevBias; // in local space
	Simd4f mWind;     // delta position per iteration

	Simd4f mPrevMatrix[3];
	Simd4f mCurMatrix[3];
	Simd4f mDampScaleUpdate;

	// iteration counter
	uint32_t mRemainingIterations;

	// reciprocal total number of iterations
	float mInvNumIterations;

	// time step size per iteration
	float mIterDt;

	bool mIsTurning; // if false, mPositionScale = mPrevMatrix[0]
};

} // namespace cloth

// fraction of the frame completed after the current iteration
template <typename Simd4f>
inline float cloth::IterationState<Simd4f>::getCurrentAlpha() const
{
	return getPreviousAlpha() + mInvNumIterations;
}

// fraction of the frame completed before the current iteration
template <typename Simd4f>
inline float cloth::IterationState<Simd4f>::getPreviousAlpha() const
{
	return 1.0f - mRemainingIterations * mInvNumIterations;
}

template <typename MyCloth>
cloth::IterationStateFactory::IterationStateFactory(MyCloth& cloth, float frameDt)
{
	// iterations = round(frameDt * solverFrequency), at least one
	mNumIterations = PxMax(1, int(frameDt * cloth.mSolverFrequency + 0.5f));
	mInvNumIterations = 1.0f / mNumIterations;
	mIterDt = frameDt * mInvNumIterations;

	mIterDtRatio = cloth.mPrevIterDt ? mIterDt / cloth.mPrevIterDt : 1.0f;
	mIterDtAverage = cloth.mIterDtAvg.empty() ? mIterDt : cloth.mIterDtAvg.average();

	// snapshot frame state before advancing it below
	mCurrentRotation = cloth.mCurrentMotion.q;
	mPrevLinearVelocity = cloth.mLinearVelocity;
	mPrevAngularVelocity = cloth.mAngularVelocity;

	// update cloth: derive this frame's velocities from current -> target motion
	float invFrameDt = 1.0f / frameDt;
	cloth.mLinearVelocity = invFrameDt * (cloth.mTargetMotion.p - cloth.mCurrentMotion.p);
	PxQuat dq = cloth.mTargetMotion.q * cloth.mCurrentMotion.q.getConjugate();
	cloth.mAngularVelocity = log(dq) * invFrameDt;

	cloth.mPrevIterDt = mIterDt;
	cloth.mIterDtAvg.push(static_cast<uint32_t>(mNumIterations), mIterDt);
	cloth.mCurrentMotion = cloth.mTargetMotion;
}

/*
momentum conservation:
m2*x2 - m1*x1 = m1*x1 - m0*x0 + g*dt2, m = r+t
r2*x2+t2 = 2(r1*x1+t1) - (r0*x0+t0) + g*dt2
r2*x2 = r1*x1 + r1*x1 - r0*x0 - (t2-2t1+t0) + g*dt2
substitute r1*x1 - r0*x0 = r1*(x1-x0) + (r1-r0)*x0
and r1*x1 = r2*x1 - (r2-r1)*x1

x2 = x1 + r2'*g*dt2
	 + r2'r1*(x1-x0) //< damp
	 + (r2'r1-r2'r0)*x0 - (1-r2'r1)*x1 - r2'*(t2-2t1+t0) //< inertia
	 + (1-r2'r1)x1 + t2-t1 //< drag (not momentum conserving)

x2 = x0 + a0*x0 + a1*x1 + b with
a0 = (inertia-damp)*r2'r1 - inertia*r2'r0 - eye
a1 = (1-inertia-drag)*eye + (damp+inertia+drag)*r2'r1
b = r2'*(g*dt2 - (inertia+drag)*(t2-t1) + inertia*(t1-t0))

Velocities are used to deal with multiple iterations and varying dt. Only b needs
to be updated from one iteration to the next. Specifically, it is multiplied
by (r2'r1)^1/numIterations. a0 and a1 are unaffected by that multiplication.

The centrifugal and coriolis forces of non-inertial (turning) reference frame are
not generally captured in these formulas. The 'inertia' term above contains radial
acceleration plus centrifugal and coriolis force for a single iteration.
For multiple iterations, or when the centrifugal forces are scaled differently
than angular inertia, we need to add explicit centrifugal and coriolis forces.
We only use them to correct the above formula because their discretization is
not accurate.
+ +Possible improvements: multiply coriolis and centrifugal matrix by curInvRotation +from the left. Do the alpha trick of linearInertia also for angularInertia, write +prevParticle after multiplying it with matrix. + +If you change anything in this function, make sure that ClothCustomFloating and +ClothInertia haven't regressed for any choice of solver frequency. +*/ + +template <typename Simd4f, typename MyCloth> +cloth::IterationState<Simd4f> cloth::IterationStateFactory::create(MyCloth const& cloth) const +{ + IterationState<Simd4f> result; + + result.mRemainingIterations = static_cast<uint32_t>(mNumIterations); + result.mInvNumIterations = mInvNumIterations; + result.mIterDt = mIterDt; + + Simd4f curLinearVelocity = load(array(cloth.mLinearVelocity)); + Simd4f prevLinearVelocity = load(array(mPrevLinearVelocity)); + + Simd4f iterDt = simd4f(mIterDt); + Simd4f dampExponent = simd4f(cloth.mStiffnessFrequency) * iterDt; + + Simd4f translation = iterDt * curLinearVelocity; + + // gravity delta per iteration + Simd4f gravity = load(array(cloth.mGravity)) * static_cast<Simd4f>(simd4f(sqr(mIterDtAverage))); + + // scale of local particle velocity per iteration + Simd4f dampScale = exp2(load(array(cloth.mLogDamping)) * dampExponent); + // adjust for the change in time step during the first iteration + Simd4f firstDampScale = dampScale * simd4f(mIterDtRatio); + + // portion of negative frame velocity to transfer to particle + Simd4f linearDrag = (gSimd4fOne - exp2(load(array(cloth.mLinearLogDrag)) * dampExponent)) * translation; + + // portion of frame acceleration to transfer to particle + Simd4f linearInertia = load(array(cloth.mLinearInertia)) * iterDt * (prevLinearVelocity - curLinearVelocity); + + // for inertia, we want to violate newton physics to + // match velocity and position as given by the user, which means: + // vt = v0 + a*t and xt = x0 + v0*t + (!) 
a*t^2
+	// this is achieved by applying a different portion to cur and prev
+	// position, compared to the normal +0.5 and -0.5 for '... 1/2 a*t^2'.
+	// specifically, the portion is alpha=(n+1)/2n and 1-alpha.
+
+	float linearAlpha = (mNumIterations + 1) * 0.5f * mInvNumIterations;
+	Simd4f curLinearInertia = linearInertia * simd4f(linearAlpha);
+
+	// rotate to local space (use mRotationMatrix temporarily to hold matrix)
+	PxMat44 invRotation(mCurrentRotation.getConjugate());
+	assign(result.mRotationMatrix, invRotation);
+
+	Simd4f maskXYZ = simd4f(simd4i(~0, ~0, ~0, 0));
+
+	// Previously, we split the bias between previous and current position to
+	// get correct discretized position and velocity. However, this made a
+	// hanging cloth experience a downward velocity, which is problematic
+	// when scaled by the iterDt ratio and results in jitter under variable
+	// timesteps. Instead, we now apply the entire bias to current position
+	// and accept a less noticeable error for a free falling cloth.
+ + Simd4f bias = gravity - linearDrag; + result.mCurBias = transform(result.mRotationMatrix, curLinearInertia + bias) & maskXYZ; + result.mPrevBias = transform(result.mRotationMatrix, linearInertia - curLinearInertia) & maskXYZ; + + Simd4f wind = load(array(cloth.mWind)) * iterDt; + result.mWind = transform(result.mRotationMatrix, translation - wind) & maskXYZ; + + result.mIsTurning = mPrevAngularVelocity.magnitudeSquared() + cloth.mAngularVelocity.magnitudeSquared() > 0.0f; + + if(result.mIsTurning) + { + Simd4f curAngularVelocity = load(array(invRotation.rotate(cloth.mAngularVelocity))); + Simd4f prevAngularVelocity = load(array(invRotation.rotate(mPrevAngularVelocity))); + + // rotation for one iteration in local space + Simd4f curInvAngle = -iterDt * curAngularVelocity; + Simd4f prevInvAngle = -iterDt * prevAngularVelocity; + + PxQuat curInvRotation = exp(castToPxVec3(curInvAngle)); + PxQuat prevInvRotation = exp(castToPxVec3(prevInvAngle)); + + PxMat44 curMatrix(curInvRotation); + PxMat44 prevMatrix(prevInvRotation * curInvRotation); + + assign(result.mRotationMatrix, curMatrix); + + Simd4f angularDrag = gSimd4fOne - exp2(load(array(cloth.mAngularLogDrag)) * dampExponent); + Simd4f centrifugalInertia = load(array(cloth.mCentrifugalInertia)); + Simd4f angularInertia = load(array(cloth.mAngularInertia)); + Simd4f angularAcceleration = curAngularVelocity - prevAngularVelocity; + + Simd4f epsilon = simd4f(PxSqrt(FLT_MIN)); // requirement: sqr(epsilon) > 0 + Simd4f velocityLengthSqr = lengthSqr(curAngularVelocity) + epsilon; + Simd4f dragLengthSqr = lengthSqr(Simd4f(curAngularVelocity * angularDrag)) + epsilon; + Simd4f centrifugalLengthSqr = lengthSqr(Simd4f(curAngularVelocity * centrifugalInertia)) + epsilon; + Simd4f accelerationLengthSqr = lengthSqr(angularAcceleration) + epsilon; + Simd4f inertiaLengthSqr = lengthSqr(Simd4f(angularAcceleration * angularInertia)) + epsilon; + + float dragScale = array(rsqrt(velocityLengthSqr * dragLengthSqr) * 
dragLengthSqr)[0]; + float inertiaScale = + mInvNumIterations * array(rsqrt(accelerationLengthSqr * inertiaLengthSqr) * inertiaLengthSqr)[0]; + + // magic factor found by comparing to global space simulation: + // some centrifugal force is in inertia part, remainder is 2*(n-1)/n + // after scaling the inertia part, we get for centrifugal: + float centrifugalAlpha = (2 * mNumIterations - 1) * mInvNumIterations; + float centrifugalScale = + centrifugalAlpha * array(rsqrt(velocityLengthSqr * centrifugalLengthSqr) * centrifugalLengthSqr)[0] - + inertiaScale; + + // slightly better in ClothCustomFloating than curInvAngle alone + Simd4f centrifugalVelocity = (prevInvAngle + curInvAngle) * simd4f(0.5f); + const Simd4f data = lengthSqr(centrifugalVelocity); + float centrifugalSqrLength = array(data)[0] * centrifugalScale; + + Simd4f coriolisVelocity = centrifugalVelocity * simd4f(centrifugalScale); + PxMat33 coriolisMatrix = shdfnd::star(castToPxVec3(coriolisVelocity)); + + const float* dampScalePtr = array(firstDampScale); + const float* centrifugalPtr = array(centrifugalVelocity); + + for(unsigned int j = 0; j < 3; ++j) + { + float centrifugalJ = -centrifugalPtr[j] * centrifugalScale; + for(unsigned int i = 0; i < 3; ++i) + { + float damping = dampScalePtr[j]; + float coriolis = coriolisMatrix(i, j); + float centrifugal = centrifugalPtr[i] * centrifugalJ; + + prevMatrix(i, j) = centrifugal - coriolis + curMatrix(i, j) * (inertiaScale - damping) - + prevMatrix(i, j) * inertiaScale; + curMatrix(i, j) = centrifugal + coriolis + curMatrix(i, j) * (inertiaScale + damping + dragScale); + } + curMatrix(j, j) += centrifugalSqrLength - inertiaScale - dragScale; + prevMatrix(j, j) += centrifugalSqrLength; + } + + assign(result.mPrevMatrix, prevMatrix); + assign(result.mCurMatrix, curMatrix); + } + else + { + Simd4f minusOne = -static_cast<Simd4f>(gSimd4fOne); + result.mRotationMatrix[0] = minusOne; + result.mPrevMatrix[0] = select(maskXYZ, firstDampScale, minusOne); + } + + // 
difference of damp scale between first and other iterations + result.mDampScaleUpdate = (dampScale - firstDampScale) & maskXYZ; + + return result; +} + +template <typename Simd4f> +void cloth::IterationState<Simd4f>::update() +{ + if(mIsTurning) + { + // only need to turn bias, matrix is unaffected (todo: verify) + mCurBias = transform(mRotationMatrix, mCurBias); + mPrevBias = transform(mRotationMatrix, mPrevBias); + mWind = transform(mRotationMatrix, mWind); + } + + // remove time step ratio in damp scale after first iteration + for(uint32_t i = 0; i < 3; ++i) + { + mPrevMatrix[i] = mPrevMatrix[i] - mRotationMatrix[i] * mDampScaleUpdate; + mCurMatrix[i] = mCurMatrix[i] + mRotationMatrix[i] * mDampScaleUpdate; + } + mDampScaleUpdate = gSimd4fZero; // only once + + --mRemainingIterations; +} + +} // namespace physx diff --git a/PhysX_3.4/Source/LowLevelCloth/src/MovingAverage.h b/PhysX_3.4/Source/LowLevelCloth/src/MovingAverage.h new file mode 100644 index 00000000..45d33322 --- /dev/null +++ b/PhysX_3.4/Source/LowLevelCloth/src/MovingAverage.h @@ -0,0 +1,145 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. 
+// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#pragma once + +#include "Allocator.h" + +namespace physx +{ +namespace cloth +{ + +struct MovingAverage +{ + struct Element + { + uint32_t mCount; + float mValue; + }; + + public: + MovingAverage(uint32_t n = 1) : mCount(0), mSize(n) + { + } + + bool empty() const + { + return mData.empty(); + } + + uint32_t size() const + { + return mSize; + } + + void resize(uint32_t n) + { + PX_ASSERT(n); + mSize = n; + trim(); + } + + void reset() + { + mData.resize(0); + mCount = 0; + } + + void push(uint32_t n, float value) + { + n = PxMin(n, mSize); + + if(mData.empty() || mData.back().mValue != value) + { + Element element = { n, value }; + mData.pushBack(element); + } + else + { + mData.back().mCount += n; + } + + mCount += n; + trim(); + } + + float average() const + { + PX_ASSERT(!mData.empty()); + + float sum = 0.0f; + Vector<Element>::Type::ConstIterator it = mData.begin(), end = mData.end(); + for(; it != end; ++it) + sum += it->mCount * it->mValue; + + // linear weight ramps at both ends for smoother average + uint32_t n = mCount / 8; + float ramp = 0.0f, temp = 0.0f; 
+ uint32_t countLo = (it = mData.begin())->mCount; + uint32_t countHi = (--end)->mCount; + for(uint32_t i = 0; i < n; ++i) + { + if(i == countLo) + countLo += (++it)->mCount; + if(i == countHi) + countHi += (--end)->mCount; + + temp += it->mValue + end->mValue; + ramp += temp; + } + + uint32_t num = (mCount - n) * (n + 1); + return (sum * (n + 1) - ramp) / num; + } + + private: + // remove oldest (front) values until mCount<=mSize + void trim() + { + Vector<Element>::Type::Iterator it = mData.begin(); + for(uint32_t k = mSize; k < mCount; it += k <= mCount) + { + k += it->mCount; + it->mCount = k - mCount; + } + + if(it != mData.begin()) + mData.assign(it, mData.end()); + + mCount = PxMin(mCount, mSize); + } + + Vector<Element>::Type mData; + + uint32_t mCount; + uint32_t mSize; +}; +} +} diff --git a/PhysX_3.4/Source/LowLevelCloth/src/PhaseConfig.cpp b/PhysX_3.4/Source/LowLevelCloth/src/PhaseConfig.cpp new file mode 100644 index 00000000..354f445e --- /dev/null +++ b/PhysX_3.4/Source/LowLevelCloth/src/PhaseConfig.cpp @@ -0,0 +1,75 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. 
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#include "PhaseConfig.h" +#include "PsMathUtils.h" + +namespace physx +{ +namespace cloth +{ +PhaseConfig transform(const PhaseConfig&); +} +} + +using namespace physx; + +namespace +{ +float safeLog2(float x) +{ + float saturated = PxMax(0.0f, PxMin(x, 1.0f)); + return saturated ? 
shdfnd::log2(saturated) : -FLT_MAX_EXP; +} +} + +cloth::PhaseConfig::PhaseConfig(uint16_t index) +: mPhaseIndex(index) +, mPadding(0xffff) +, mStiffness(1.0f) +, mStiffnessMultiplier(1.0f) +, mCompressionLimit(1.0f) +, mStretchLimit(1.0f) +{ +} + +// convert from user input to solver format +cloth::PhaseConfig cloth::transform(const PhaseConfig& config) +{ + PhaseConfig result(config.mPhaseIndex); + + result.mStiffness = safeLog2(1.0f - config.mStiffness); + result.mStiffnessMultiplier = safeLog2(config.mStiffnessMultiplier); + + // negative for compression, positive for stretch + result.mCompressionLimit = 1 - 1 / config.mCompressionLimit; + result.mStretchLimit = 1 - 1 / config.mStretchLimit; + + return result; +} diff --git a/PhysX_3.4/Source/LowLevelCloth/src/PointInterpolator.h b/PhysX_3.4/Source/LowLevelCloth/src/PointInterpolator.h new file mode 100644 index 00000000..b86c7442 --- /dev/null +++ b/PhysX_3.4/Source/LowLevelCloth/src/PointInterpolator.h @@ -0,0 +1,168 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. 
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#include "Types.h"
+
+namespace physx
+{
+
+namespace cloth
+{
+
+// acts as a poor man's random access iterator
+template <typename Simd4f, typename BaseIterator>
+class LerpIterator
+{
+
+	LerpIterator& operator=(const LerpIterator&); // not implemented
+
+  public:
+	LerpIterator(BaseIterator start, BaseIterator target, float alpha)
+	: mAlpha(simd4f(alpha)), mStart(start), mTarget(target)
+	{
+	}
+
+	// return the interpolated point at a given index
+	inline Simd4f operator[](size_t index) const
+	{
+		return mStart[index] + (mTarget[index] - mStart[index]) * mAlpha;
+	}
+
+	inline Simd4f operator*() const
+	{
+		return (*this)[0];
+	}
+
+	// prefix increment only
+	inline LerpIterator& operator++()
+	{
+		++mStart;
+		++mTarget;
+		return *this;
+	}
+
+  private:
+	// interpolation parameter
+	const Simd4f mAlpha;
+
+	BaseIterator mStart;
+	BaseIterator mTarget;
+};
+
+template <typename Simd4f, size_t Stride>
+class UnalignedIterator
+{
+
+	UnalignedIterator& operator=(const UnalignedIterator&); // not implemented
+
+  public:
+	UnalignedIterator(const float* pointer) : mPointer(pointer)
+	{
+	}
+
+	inline Simd4f operator[](size_t
index) const + { + return load(mPointer + index * Stride); + } + + inline Simd4f operator*() const + { + return (*this)[0]; + } + + // prefix increment only + inline UnalignedIterator& operator++() + { + mPointer += Stride; + return *this; + } + + private: + const float* mPointer; +}; + +// acts as an iterator but returns a constant +template <typename Simd4f> +class ConstantIterator +{ + public: + ConstantIterator(const Simd4f& value) : mValue(value) + { + } + + inline Simd4f operator*() const + { + return mValue; + } + + inline ConstantIterator& operator++() + { + return *this; + } + + private: + ConstantIterator& operator=(const ConstantIterator&); + const Simd4f mValue; +}; + +// wraps an iterator with constant scale and bias +template <typename Simd4f, typename BaseIterator> +class ScaleBiasIterator +{ + public: + ScaleBiasIterator(BaseIterator base, const Simd4f& scale, const Simd4f& bias) + : mScale(scale), mBias(bias), mBaseIterator(base) + { + } + + inline Simd4f operator*() const + { + return (*mBaseIterator) * mScale + mBias; + } + + inline ScaleBiasIterator& operator++() + { + ++mBaseIterator; + return *this; + } + + private: + ScaleBiasIterator& operator=(const ScaleBiasIterator&); + + const Simd4f mScale; + const Simd4f mBias; + + BaseIterator mBaseIterator; +}; + +} // namespace cloth + +} // namespace physx diff --git a/PhysX_3.4/Source/LowLevelCloth/src/Simd.h b/PhysX_3.4/Source/LowLevelCloth/src/Simd.h new file mode 100644 index 00000000..299ea2a9 --- /dev/null +++ b/PhysX_3.4/Source/LowLevelCloth/src/Simd.h @@ -0,0 +1,43 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. 
Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#pragma once + +// cloth solver is 50% slower (!) 
on MSVC 11 and earlier when Simd4f lives in a namespace +#define NV_SIMD_USE_NAMESPACE 0 + +#include "NvSimd4f.h" +#include "NvSimd4i.h" + +namespace physx +{ +#if NV_SIMD_USE_NAMESPACE +using namespace nvidia::simd; +#endif +} diff --git a/PhysX_3.4/Source/LowLevelCloth/src/StackAllocator.h b/PhysX_3.4/Source/LowLevelCloth/src/StackAllocator.h new file mode 100644 index 00000000..eb8d8679 --- /dev/null +++ b/PhysX_3.4/Source/LowLevelCloth/src/StackAllocator.h @@ -0,0 +1,155 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. 
+// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#pragma once + +#include "foundation/PxAssert.h" + +#if PX_LINUX_FAMILY +#include <stdint.h> // intptr_t +#endif + +template <size_t align> +class StackAllocator +{ + typedef unsigned char byte; + + // todo: switch to offsets so size is consistent on x64 + // mSize is just for book keeping so could be 4 bytes + struct Header + { + Header* mPrev; + size_t mSize : 31; + size_t mFree : 1; + }; + + StackAllocator(const StackAllocator&); + StackAllocator& operator=(const StackAllocator&); + + public: + StackAllocator(void* buffer, size_t bufferSize) + : mBuffer(reinterpret_cast<byte*>(buffer)), mBufferSize(bufferSize), mFreeStart(mBuffer), mTop(0) + { + } + + ~StackAllocator() + { + PX_ASSERT(userBytes() == 0); + } + + void* allocate(size_t numBytes) + { + // this is non-standard + if(!numBytes) + return 0; + + uintptr_t unalignedStart = uintptr_t(mFreeStart) + sizeof(Header); + + byte* allocStart = reinterpret_cast<byte*>((unalignedStart + (align - 1)) & ~(align - 1)); + byte* allocEnd = allocStart + numBytes; + + // ensure there is space for the alloc + PX_ASSERT(allocEnd <= mBuffer + mBufferSize); + + Header* h = getHeader(allocStart); + h->mPrev = mTop; + h->mSize = numBytes; + h->mFree = false; + + mTop = h; + mFreeStart = allocEnd; + + return allocStart; + } + + void deallocate(void* p) + { + if(!p) + return; + + Header* h = getHeader(p); + h->mFree = true; + + // unwind the stack to the next live alloc + while(mTop && mTop->mFree) + { + mFreeStart = reinterpret_cast<byte*>(mTop); + mTop = mTop->mPrev; + } + } + + private: + // return the header for an allocation + inline Header* 
getHeader(void* p) const + { + PX_ASSERT((reinterpret_cast<uintptr_t>(p) & (align - 1)) == 0); + PX_ASSERT(reinterpret_cast<byte*>(p) >= mBuffer + sizeof(Header)); + PX_ASSERT(reinterpret_cast<byte*>(p) < mBuffer + mBufferSize); + + return reinterpret_cast<Header*>(p) - 1; + } + + public: + // total user-allocated bytes not including any overhead + size_t userBytes() const + { + size_t total = 0; + Header* iter = mTop; + while(iter) + { + total += iter->mSize; + iter = iter->mPrev; + } + + return total; + } + + // total user-allocated bytes + overhead + size_t totalUsedBytes() const + { + return mFreeStart - mBuffer; + } + + size_t remainingBytes() const + { + return mBufferSize - totalUsedBytes(); + } + + size_t wastedBytes() const + { + return totalUsedBytes() - userBytes(); + } + + private: + byte* const mBuffer; + const size_t mBufferSize; + + byte* mFreeStart; // start of free space + Header* mTop; // top allocation header +}; diff --git a/PhysX_3.4/Source/LowLevelCloth/src/SwCloth.cpp b/PhysX_3.4/Source/LowLevelCloth/src/SwCloth.cpp new file mode 100644 index 00000000..1f3d4c90 --- /dev/null +++ b/PhysX_3.4/Source/LowLevelCloth/src/SwCloth.cpp @@ -0,0 +1,305 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". 
NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 
+ +#include "SwCloth.h" +#include "SwFabric.h" +#include "SwFactory.h" +#include "TripletScheduler.h" +#include "ClothBase.h" +#include "PsUtilities.h" + +namespace physx +{ +namespace cloth +{ +PhaseConfig transform(const PhaseConfig&); // from PhaseConfig.cpp +} +} + +using namespace physx; +using namespace shdfnd; + +cloth::SwCloth::SwCloth(SwFactory& factory, SwFabric& fabric, Range<const PxVec4> particles) +: mFactory(factory), mFabric(fabric), mNumVirtualParticles(0), mUserData(0) +{ + PX_ASSERT(!particles.empty()); + + initialize(*this, particles.begin(), particles.end()); + +#if PX_WINDOWS + const uint32_t kSimdWidth = 8; // avx +#else + const uint32_t kSimdWidth = 4; // sse +#endif + + mCurParticles.reserve(particles.size() + kSimdWidth - 1); + mCurParticles.assign(reinterpret_cast<const PxVec4*>(particles.begin()), + reinterpret_cast<const PxVec4*>(particles.end())); + + // 7 dummy particles used in SIMD solver + mCurParticles.resize(particles.size() + kSimdWidth - 1, PxVec4(0.0f)); + mPrevParticles = mCurParticles; + + mCurParticles.resize(particles.size()); + mPrevParticles.resize(particles.size()); + + mFabric.incRefCount(); +} + +namespace +{ +// copy vector and make same capacity +void copyVector(cloth::Vec4fAlignedVector& dst, const cloth::Vec4fAlignedVector& src) +{ + dst.reserve(src.capacity()); + dst.assign(src.begin(), src.end()); + + // ensure valid dummy data + dst.resize(src.capacity(), PxVec4(0.0f)); + dst.resize(src.size()); +} +} + +// copy constructor, supports rebinding to a different factory +cloth::SwCloth::SwCloth(SwFactory& factory, const SwCloth& cloth) +: mFactory(factory) +, mFabric(cloth.mFabric) +, mPhaseConfigs(cloth.mPhaseConfigs) +, mCapsuleIndices(cloth.mCapsuleIndices) +, mStartCollisionSpheres(cloth.mStartCollisionSpheres) +, mTargetCollisionSpheres(cloth.mTargetCollisionSpheres) +, mStartCollisionPlanes(cloth.mStartCollisionPlanes) +, mTargetCollisionPlanes(cloth.mTargetCollisionPlanes) +, 
mStartCollisionTriangles(cloth.mStartCollisionTriangles) +, mTargetCollisionTriangles(cloth.mTargetCollisionTriangles) +, mVirtualParticleIndices(cloth.mVirtualParticleIndices) +, mVirtualParticleWeights(cloth.mVirtualParticleWeights) +, mNumVirtualParticles(cloth.mNumVirtualParticles) +, mSelfCollisionIndices(cloth.mSelfCollisionIndices) +, mRestPositions(cloth.mRestPositions) +{ + copy(*this, cloth); + + // carry over capacity (using as dummy particles) + copyVector(mCurParticles, cloth.mCurParticles); + copyVector(mPrevParticles, cloth.mPrevParticles); + copyVector(mMotionConstraints.mStart, cloth.mMotionConstraints.mStart); + copyVector(mMotionConstraints.mTarget, cloth.mMotionConstraints.mTarget); + copyVector(mSeparationConstraints.mStart, cloth.mSeparationConstraints.mStart); + copyVector(mSeparationConstraints.mTarget, cloth.mSeparationConstraints.mTarget); + copyVector(mParticleAccelerations, cloth.mParticleAccelerations); + + mFabric.incRefCount(); +} + +cloth::SwCloth::~SwCloth() +{ + mFabric.decRefCount(); +} + +cloth::Range<PxVec4> cloth::SwCloth::push(SwConstraints& constraints) +{ + uint32_t n = mCurParticles.size(); + + if(!constraints.mTarget.capacity()) + constraints.mTarget.resize((n + 3) & ~3, PxVec4(0.0f)); // reserve multiple of 4 for SIMD + + constraints.mTarget.resizeUninitialized(n); + PxVec4* data = &constraints.mTarget.front(); + Range<PxVec4> result(data, data + constraints.mTarget.size()); + + if(constraints.mStart.empty()) // initialize start first + constraints.mStart.swap(constraints.mTarget); + + return result; +} + +void cloth::SwCloth::clear(SwConstraints& constraints) +{ + Vec4fAlignedVector().swap(constraints.mStart); + Vec4fAlignedVector().swap(constraints.mTarget); +} + +cloth::Range<const PxVec3> cloth::SwCloth::clampTriangleCount(Range<const PxVec3> range, uint32_t) +{ + return range; +} + +#include "ClothImpl.h" + +namespace physx +{ +namespace cloth +{ + +template <> +Cloth* ClothImpl<SwCloth>::clone(Factory& factory) 
const +{ + return factory.clone(*this); +} + +template <> +uint32_t ClothImpl<SwCloth>::getNumParticles() const +{ + return mCloth.mCurParticles.size(); +} + +template <> +void ClothImpl<SwCloth>::lockParticles() const +{ +} + +template <> +void ClothImpl<SwCloth>::unlockParticles() const +{ +} + +template <> +MappedRange<PxVec4> ClothImpl<SwCloth>::getCurrentParticles() +{ + return getMappedParticles(&mCloth.mCurParticles.front()); +} + +template <> +MappedRange<const PxVec4> ClothImpl<SwCloth>::getCurrentParticles() const +{ + return getMappedParticles(&mCloth.mCurParticles.front()); +} + +template <> +MappedRange<PxVec4> ClothImpl<SwCloth>::getPreviousParticles() +{ + return getMappedParticles(&mCloth.mPrevParticles.front()); +} + +template <> +MappedRange<const PxVec4> ClothImpl<SwCloth>::getPreviousParticles() const +{ + return getMappedParticles(&mCloth.mPrevParticles.front()); +} + +template <> +GpuParticles ClothImpl<SwCloth>::getGpuParticles() +{ + GpuParticles result = { 0, 0, 0 }; + return result; +} + +template <> +void ClothImpl<SwCloth>::setPhaseConfig(Range<const PhaseConfig> configs) +{ + mCloth.mPhaseConfigs.resize(0); + + // transform phase config to use in solver + for(; !configs.empty(); configs.popFront()) + if(configs.front().mStiffness > 0.0f) + mCloth.mPhaseConfigs.pushBack(transform(configs.front())); + + mCloth.wakeUp(); +} + +template <> +void ClothImpl<SwCloth>::setSelfCollisionIndices(Range<const uint32_t> indices) +{ + ContextLockType lock(mCloth.mFactory); + mCloth.mSelfCollisionIndices.assign(indices.begin(), indices.end()); + mCloth.notifyChanged(); + mCloth.wakeUp(); +} + +template <> +uint32_t ClothImpl<SwCloth>::getNumVirtualParticles() const +{ + return uint32_t(mCloth.mNumVirtualParticles); +} + +template <> +Range<PxVec4> ClothImpl<SwCloth>::getParticleAccelerations() +{ + if(mCloth.mParticleAccelerations.empty()) + { + uint32_t n = mCloth.mCurParticles.size(); + mCloth.mParticleAccelerations.resize(n, PxVec4(0.0f)); + } + + 
mCloth.wakeUp(); + + PxVec4* data = &mCloth.mParticleAccelerations.front(); + return Range<PxVec4>(data, data + mCloth.mParticleAccelerations.size()); +} + +template <> +void ClothImpl<SwCloth>::clearParticleAccelerations() +{ + Vec4fAlignedVector().swap(mCloth.mParticleAccelerations); + mCloth.wakeUp(); +} + +template <> +void ClothImpl<SwCloth>::setVirtualParticles(Range<const uint32_t[4]> indices, Range<const PxVec3> weights) +{ + mCloth.mNumVirtualParticles = 0; + + // shuffle indices to form independent SIMD sets + uint16_t numParticles = uint16_t(mCloth.mCurParticles.size()); + TripletScheduler scheduler(indices); + scheduler.simd(numParticles, 4); + + // convert indices to byte offset + Vec4us dummy(numParticles, uint16_t(numParticles + 1), uint16_t(numParticles + 2), 0); + Vector<uint32_t>::Type::ConstIterator sIt = scheduler.mSetSizes.begin(); + Vector<uint32_t>::Type::ConstIterator sEnd = scheduler.mSetSizes.end(); + TripletScheduler::ConstTripletIter tIt = scheduler.mTriplets.begin(), tLast; + mCloth.mVirtualParticleIndices.resize(0); + mCloth.mVirtualParticleIndices.reserve(indices.size() + 3 * uint32_t(sEnd - sIt)); + for(; sIt != sEnd; ++sIt) + { + uint32_t setSize = *sIt; + for(tLast = tIt + setSize; tIt != tLast; ++tIt, ++mCloth.mNumVirtualParticles) + mCloth.mVirtualParticleIndices.pushBack(Vec4us(*tIt)); + mCloth.mVirtualParticleIndices.resize((mCloth.mVirtualParticleIndices.size() + 3) & ~3, dummy); + } + Vector<Vec4us>::Type(mCloth.mVirtualParticleIndices.begin(), mCloth.mVirtualParticleIndices.end()) + .swap(mCloth.mVirtualParticleIndices); + + // precompute 1/dot(w,w) + Vec4fAlignedVector().swap(mCloth.mVirtualParticleWeights); + mCloth.mVirtualParticleWeights.reserve(weights.size()); + for(; !weights.empty(); weights.popFront()) + { + PxVec3 w = reinterpret_cast<const PxVec3&>(weights.front()); + PxReal scale = 1 / w.magnitudeSquared(); + mCloth.mVirtualParticleWeights.pushBack(PxVec4(w.x, w.y, w.z, scale)); + } + + mCloth.notifyChanged(); +} 
+ +} // namespace cloth +} // namespace physx diff --git a/PhysX_3.4/Source/LowLevelCloth/src/SwCloth.h b/PhysX_3.4/Source/LowLevelCloth/src/SwCloth.h new file mode 100644 index 00000000..05db19d2 --- /dev/null +++ b/PhysX_3.4/Source/LowLevelCloth/src/SwCloth.h @@ -0,0 +1,210 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. 
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#pragma once + +#include "foundation/PxTransform.h" +#include "Cloth.h" +#include "Range.h" +#include "MovingAverage.h" +#include "PhaseConfig.h" +#include "IndexPair.h" +#include "Vec4T.h" +#include "Array.h" + +namespace physx +{ + +namespace cloth +{ + +class SwFabric; +class SwFactory; + +typedef AlignedVector<PxVec4, 16>::Type Vec4fAlignedVector; + +struct SwConstraints +{ + void pop() + { + if(!mTarget.empty()) + { + mStart.swap(mTarget); + mTarget.resize(0); + } + } + + Vec4fAlignedVector mStart; + Vec4fAlignedVector mTarget; +}; + +class SwCloth +{ + SwCloth& operator=(const SwCloth&); // not implemented + struct SwContextLock + { + SwContextLock(const SwFactory&) + { + } + }; + + public: + typedef SwFactory FactoryType; + typedef SwFabric FabricType; + typedef SwContextLock ContextLockType; + + typedef Vec4fAlignedVector& MappedVec4fVectorType; + typedef Vector<IndexPair>::Type& MappedIndexVectorType; + + SwCloth(SwFactory&, SwFabric&, Range<const PxVec4>); + SwCloth(SwFactory&, const SwCloth&); + ~SwCloth(); // not virtual on purpose + + public: + bool isSleeping() const + { + return mSleepPassCounter >= mSleepAfterCount; + } + void wakeUp() + { + mSleepPassCounter = 0; + } + + void notifyChanged() + { + } + + void setParticleBounds(const float*); + + Range<PxVec4> push(SwConstraints&); + static void clear(SwConstraints&); + + static Range<const PxVec3> clampTriangleCount(Range<const PxVec3>, uint32_t); + + public: + SwFactory& mFactory; + SwFabric& mFabric; + + // current and previous-iteration particle positions + Vec4fAlignedVector mCurParticles; + Vec4fAlignedVector mPrevParticles; + + PxVec3 mParticleBoundsCenter; + PxVec3 mParticleBoundsHalfExtent; + + PxVec3 mGravity; + PxVec3 mLogDamping; + PxVec3 mLinearLogDrag; + PxVec3 mAngularLogDrag; + PxVec3 mLinearInertia; + PxVec3 mAngularInertia; + PxVec3 mCentrifugalInertia; + float mSolverFrequency; + float mStiffnessFrequency; + + 
PxTransform mTargetMotion; + PxTransform mCurrentMotion; + PxVec3 mLinearVelocity; + PxVec3 mAngularVelocity; + + float mPrevIterDt; + MovingAverage mIterDtAvg; + + Vector<PhaseConfig>::Type mPhaseConfigs; // transformed! + + // tether constraints stuff + float mTetherConstraintLogStiffness; + float mTetherConstraintScale; + + // motion constraints stuff + SwConstraints mMotionConstraints; + float mMotionConstraintScale; + float mMotionConstraintBias; + float mMotionConstraintLogStiffness; + + // separation constraints stuff + SwConstraints mSeparationConstraints; + + // particle acceleration stuff + Vec4fAlignedVector mParticleAccelerations; + + // wind + PxVec3 mWind; + float mDragLogCoefficient; + float mLiftLogCoefficient; + + // collision stuff + Vector<IndexPair>::Type mCapsuleIndices; + Vec4fAlignedVector mStartCollisionSpheres; + Vec4fAlignedVector mTargetCollisionSpheres; + Vector<uint32_t>::Type mConvexMasks; + Vec4fAlignedVector mStartCollisionPlanes; + Vec4fAlignedVector mTargetCollisionPlanes; + Vector<PxVec3>::Type mStartCollisionTriangles; + Vector<PxVec3>::Type mTargetCollisionTriangles; + bool mEnableContinuousCollision; + float mCollisionMassScale; + float mFriction; + + // virtual particles + Vector<Vec4us>::Type mVirtualParticleIndices; + Vec4fAlignedVector mVirtualParticleWeights; + uint32_t mNumVirtualParticles; + + // self collision + float mSelfCollisionDistance; + float mSelfCollisionLogStiffness; + + Vector<uint32_t>::Type mSelfCollisionIndices; + + Vec4fAlignedVector mRestPositions; + + // sleeping + uint32_t mSleepTestInterval; // how often to test for movement + uint32_t mSleepAfterCount; // number of tests to pass before sleep + float mSleepThreshold; // max movement delta to pass test + uint32_t mSleepPassCounter; // how many tests passed + uint32_t mSleepTestCounter; // how many iterations since tested + + void* mUserData; + +} PX_ALIGN_SUFFIX(16); + +} // namespace cloth + +// bounds = lower[3], upper[3] +inline void 
cloth::SwCloth::setParticleBounds(const float* bounds) +{ + for(uint32_t i = 0; i < 3; ++i) + { + mParticleBoundsCenter[i] = (bounds[3 + i] + bounds[i]) * 0.5f; + mParticleBoundsHalfExtent[i] = (bounds[3 + i] - bounds[i]) * 0.5f; + } +} +} diff --git a/PhysX_3.4/Source/LowLevelCloth/src/SwClothData.cpp b/PhysX_3.4/Source/LowLevelCloth/src/SwClothData.cpp new file mode 100644 index 00000000..ce44f8d0 --- /dev/null +++ b/PhysX_3.4/Source/LowLevelCloth/src/SwClothData.cpp @@ -0,0 +1,154 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. 
+// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#include "SwClothData.h" +#include "SwCloth.h" +#include "SwFabric.h" +#include "PsUtilities.h" +#include "PsMathUtils.h" +#include "CmPhysXCommon.h" + +using namespace physx; + +cloth::SwClothData::SwClothData(SwCloth& cloth, const SwFabric& fabric) +{ + mNumParticles = uint32_t(cloth.mCurParticles.size()); + mCurParticles = array(cloth.mCurParticles.front()); + mPrevParticles = array(cloth.mPrevParticles.front()); + + const float* center = array(cloth.mParticleBoundsCenter); + const float* extent = array(cloth.mParticleBoundsHalfExtent); + for(uint32_t i = 0; i < 3; ++i) + { + mCurBounds[i] = center[i] - extent[i]; + mCurBounds[i + 3] = center[i] + extent[i]; + } + + // avoid reading uninitialized data into mCurBounds, even though it's never used. + mPrevBounds[0] = 0.0f; + + mConfigBegin = cloth.mPhaseConfigs.empty() ? 
0 : &cloth.mPhaseConfigs.front(); + mConfigEnd = mConfigBegin + cloth.mPhaseConfigs.size(); + + mPhases = &fabric.mPhases.front(); + mNumPhases = uint32_t(fabric.mPhases.size()); + + mSets = &fabric.mSets.front(); + mNumSets = uint32_t(fabric.mSets.size()); + + mRestvalues = &fabric.mRestvalues.front(); + mNumRestvalues = uint32_t(fabric.mRestvalues.size()); + + mIndices = &fabric.mIndices.front(); + mNumIndices = uint32_t(fabric.mIndices.size()); + + float stiffnessExponent = cloth.mStiffnessFrequency * cloth.mPrevIterDt * 0.69314718055994531f; // logf(2.0f); + + mTethers = fabric.mTethers.begin(); + mNumTethers = uint32_t(fabric.mTethers.size()); + mTetherConstraintStiffness = 1.0f - Ps::exp(stiffnessExponent * cloth.mTetherConstraintLogStiffness); + mTetherConstraintScale = cloth.mTetherConstraintScale * fabric.mTetherLengthScale; + + mTriangles = fabric.mTriangles.begin(); + mNumTriangles = uint32_t(fabric.mTriangles.size()) / 3; + mDragCoefficient = 1.0f - Ps::exp(stiffnessExponent * cloth.mDragLogCoefficient); + mLiftCoefficient = 1.0f - Ps::exp(stiffnessExponent * cloth.mLiftLogCoefficient); + + mStartMotionConstraints = cloth.mMotionConstraints.mStart.size() ? array(cloth.mMotionConstraints.mStart.front()) : 0; + mTargetMotionConstraints = + !cloth.mMotionConstraints.mTarget.empty() ? array(cloth.mMotionConstraints.mTarget.front()) : 0; + mMotionConstraintStiffness = 1.0f - Ps::exp(stiffnessExponent * cloth.mMotionConstraintLogStiffness); + + mStartSeparationConstraints = + cloth.mSeparationConstraints.mStart.size() ? array(cloth.mSeparationConstraints.mStart.front()) : 0; + mTargetSeparationConstraints = + !cloth.mSeparationConstraints.mTarget.empty() ? array(cloth.mSeparationConstraints.mTarget.front()) : 0; + + mParticleAccelerations = cloth.mParticleAccelerations.size() ? array(cloth.mParticleAccelerations.front()) : 0; + + mStartCollisionSpheres = cloth.mStartCollisionSpheres.empty() ? 
0 : array(cloth.mStartCollisionSpheres.front()); + mTargetCollisionSpheres = + cloth.mTargetCollisionSpheres.empty() ? mStartCollisionSpheres : array(cloth.mTargetCollisionSpheres.front()); + mNumSpheres = uint32_t(cloth.mStartCollisionSpheres.size()); + + mCapsuleIndices = cloth.mCapsuleIndices.empty() ? 0 : &cloth.mCapsuleIndices.front(); + mNumCapsules = uint32_t(cloth.mCapsuleIndices.size()); + + mStartCollisionPlanes = cloth.mStartCollisionPlanes.empty() ? 0 : array(cloth.mStartCollisionPlanes.front()); + mTargetCollisionPlanes = + cloth.mTargetCollisionPlanes.empty() ? mStartCollisionPlanes : array(cloth.mTargetCollisionPlanes.front()); + mNumPlanes = uint32_t(cloth.mStartCollisionPlanes.size()); + + mConvexMasks = cloth.mConvexMasks.empty() ? 0 : &cloth.mConvexMasks.front(); + mNumConvexes = uint32_t(cloth.mConvexMasks.size()); + + mStartCollisionTriangles = cloth.mStartCollisionTriangles.empty() ? 0 : array(cloth.mStartCollisionTriangles.front()); + mTargetCollisionTriangles = cloth.mTargetCollisionTriangles.empty() ? mStartCollisionTriangles + : array(cloth.mTargetCollisionTriangles.front()); + mNumCollisionTriangles = uint32_t(cloth.mStartCollisionTriangles.size()) / 3; + + mVirtualParticlesBegin = cloth.mVirtualParticleIndices.empty() ? 0 : array(cloth.mVirtualParticleIndices.front()); + mVirtualParticlesEnd = mVirtualParticlesBegin + 4 * cloth.mVirtualParticleIndices.size(); + mVirtualParticleWeights = cloth.mVirtualParticleWeights.empty() ? 0 : array(cloth.mVirtualParticleWeights.front()); + mNumVirtualParticleWeights = uint32_t(cloth.mVirtualParticleWeights.size()); + + mEnableContinuousCollision = cloth.mEnableContinuousCollision; + mCollisionMassScale = cloth.mCollisionMassScale; + mFrictionScale = cloth.mFriction; + + mSelfCollisionDistance = cloth.mSelfCollisionDistance; + mSelfCollisionStiffness = 1.0f - Ps::exp(stiffnessExponent * cloth.mSelfCollisionLogStiffness); + + mSelfCollisionIndices = cloth.mSelfCollisionIndices.empty() ? 
0 : cloth.mSelfCollisionIndices.begin(); + mNumSelfCollisionIndices = mSelfCollisionIndices ? cloth.mSelfCollisionIndices.size() : mNumParticles; + + mRestPositions = cloth.mRestPositions.size() ? array(cloth.mRestPositions.front()) : 0; + + mSleepPassCounter = cloth.mSleepPassCounter; + mSleepTestCounter = cloth.mSleepTestCounter; +} + +void cloth::SwClothData::reconcile(SwCloth& cloth) const +{ + cloth.setParticleBounds(mCurBounds); + cloth.mSleepTestCounter = mSleepTestCounter; + cloth.mSleepPassCounter = mSleepPassCounter; +} + +void cloth::SwClothData::verify() const +{ + // checks needs to be run after the constructor because + // data isn't immediately available on SPU at that stage + // perhaps a good reason to construct SwClothData on PPU instead + + PX_ASSERT(!mNumCapsules || + mNumSpheres > *shdfnd::maxElement(&mCapsuleIndices->first, &(mCapsuleIndices + mNumCapsules)->first)); + + PX_ASSERT(!mNumConvexes || (1u << mNumPlanes) - 1 >= *shdfnd::maxElement(mConvexMasks, mConvexMasks + mNumConvexes)); +} diff --git a/PhysX_3.4/Source/LowLevelCloth/src/SwClothData.h b/PhysX_3.4/Source/LowLevelCloth/src/SwClothData.h new file mode 100644 index 00000000..e3f503ca --- /dev/null +++ b/PhysX_3.4/Source/LowLevelCloth/src/SwClothData.h @@ -0,0 +1,151 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". 
NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 
+ +#pragma once + +#include "foundation/Px.h" +#include "Types.h" + +namespace physx +{ +namespace simd +{ +} +} + +namespace physx +{ +namespace cloth +{ + +class SwCloth; +class SwFabric; +struct PhaseConfig; +struct IndexPair; +struct SwTether; + +// reference to cloth instance bulk data (POD) +struct SwClothData +{ + SwClothData(SwCloth&, const SwFabric&); + void reconcile(SwCloth&) const; + void verify() const; + + // particle data + uint32_t mNumParticles; + float* mCurParticles; + float* mPrevParticles; + + float mCurBounds[6]; // lower[3], upper[3] + float mPrevBounds[6]; + float mPadding; // write as simd + + // distance constraints + const PhaseConfig* mConfigBegin; + const PhaseConfig* mConfigEnd; + + const uint32_t* mPhases; + uint32_t mNumPhases; + + const uint32_t* mSets; + uint32_t mNumSets; + + const float* mRestvalues; + uint32_t mNumRestvalues; + + const uint16_t* mIndices; + uint32_t mNumIndices; + + const SwTether* mTethers; + uint32_t mNumTethers; + float mTetherConstraintStiffness; + float mTetherConstraintScale; + + // wind data + const uint16_t* mTriangles; + uint32_t mNumTriangles; + float mDragCoefficient; + float mLiftCoefficient; + + // motion constraint data + const float* mStartMotionConstraints; + const float* mTargetMotionConstraints; + float mMotionConstraintStiffness; + + // separation constraint data + const float* mStartSeparationConstraints; + const float* mTargetSeparationConstraints; + + // particle acceleration data + const float* mParticleAccelerations; + + // collision stuff + const float* mStartCollisionSpheres; + const float* mTargetCollisionSpheres; + uint32_t mNumSpheres; + + const IndexPair* mCapsuleIndices; + uint32_t mNumCapsules; + + const float* mStartCollisionPlanes; + const float* mTargetCollisionPlanes; + uint32_t mNumPlanes; + + const uint32_t* mConvexMasks; + uint32_t mNumConvexes; + + const float* mStartCollisionTriangles; + const float* mTargetCollisionTriangles; + uint32_t mNumCollisionTriangles; + + const 
uint16_t* mVirtualParticlesBegin; + const uint16_t* mVirtualParticlesEnd; + + const float* mVirtualParticleWeights; + uint32_t mNumVirtualParticleWeights; + + bool mEnableContinuousCollision; + float mFrictionScale; + float mCollisionMassScale; + + float mSelfCollisionDistance; + float mSelfCollisionStiffness; + + uint32_t mNumSelfCollisionIndices; + const uint32_t* mSelfCollisionIndices; + + float* mRestPositions; + + // sleep data + uint32_t mSleepPassCounter; + uint32_t mSleepTestCounter; + +} PX_ALIGN_SUFFIX(16); +} +} diff --git a/PhysX_3.4/Source/LowLevelCloth/src/SwCollision.cpp b/PhysX_3.4/Source/LowLevelCloth/src/SwCollision.cpp new file mode 100644 index 00000000..e505289f --- /dev/null +++ b/PhysX_3.4/Source/LowLevelCloth/src/SwCollision.cpp @@ -0,0 +1,1935 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. 
No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#include "foundation/PxProfiler.h" +#include "foundation/PxAssert.h" +#include "SwCollision.h" +#include "SwCloth.h" +#include "SwClothData.h" +#include "IterationState.h" +#include "BoundingBox.h" +#include "PointInterpolator.h" +#include "SwCollisionHelpers.h" +#include <cstring> // for memset + +using namespace physx; + +// the particle trajectory needs to penetrate more than 0.2 * radius to trigger continuous collision +template <typename Simd4f> +const Simd4f cloth::SwCollision<Simd4f>::sSkeletonWidth = simd4f(cloth::sqr(1 - 0.2f) - 1); + +#if NV_SIMD_SSE2 +const Simd4i cloth::Gather<Simd4i>::sIntSignBit = simd4i(0x80000000); +const Simd4i cloth::Gather<Simd4i>::sSignedMask = sIntSignBit | simd4i(0x7); +#elif NV_SIMD_NEON +const Simd4i cloth::Gather<Simd4i>::sPack = simd4i(0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c); +const Simd4i cloth::Gather<Simd4i>::sOffset = simd4i(0x03020100); +const Simd4i cloth::Gather<Simd4i>::sShift = simd4i(2); +const Simd4i cloth::Gather<Simd4i>::sMask = simd4i(7); +#endif + +namespace +{ +const Simd4fTupleFactory sMaskX = simd4f(simd4i(~0, 0, 0, 0)); +const Simd4fTupleFactory sMaskZ = simd4f(simd4i(0, 0, ~0, 0)); +const Simd4fTupleFactory sMaskW = simd4f(simd4i(0, 0, 0, ~0)); +const Simd4fTupleFactory gSimd4fOneXYZ = simd4f(1.0f, 1.0f, 1.0f, 0.0f); +const Simd4fScalarFactory sGridLength = simd4f(8 - 1e-3f); // 
sGridSize +const Simd4fScalarFactory sGridExpand = simd4f(1e-4f); +const Simd4fTupleFactory sMinusFloatMaxXYZ = simd4f(-FLT_MAX, -FLT_MAX, -FLT_MAX, 0.0f); + +#if PX_PROFILE || PX_DEBUG +template <typename Simd4f> +uint32_t horizontalSum(const Simd4f& x) +{ + const float* p = array(x); + return uint32_t(0.5f + p[0] + p[1] + p[2] + p[3]); +} +#endif + +// 7 elements are written to ptr! +template <typename Simd4f> +void storeBounds(float* ptr, const cloth::BoundingBox<Simd4f>& bounds) +{ + store(ptr, bounds.mLower); + store(ptr + 3, bounds.mUpper); +} +} + +struct cloth::SphereData +{ + PxVec3 center; + float radius; +}; + +struct cloth::ConeData +{ + PxVec3 center; + float radius; // cone radius at center + PxVec3 axis; + float slope; // tan(alpha) + + float sqrCosine; // cos^2(alpha) + float halfLength; + + uint32_t firstMask; + uint32_t bothMask; +}; + +struct cloth::TriangleData +{ + PxVec3 base; + float edge0DotEdge1; + + PxVec3 edge0; + float edge0SqrLength; + + PxVec3 edge1; + float edge1SqrLength; + + PxVec3 normal; + float padding; + + float det; + float denom; + + float edge0InvSqrLength; + float edge1InvSqrLength; +}; + +namespace physx +{ +namespace cloth +{ +template <typename Simd4f> +BoundingBox<Simd4f> expandBounds(const BoundingBox<Simd4f>& bbox, const SphereData* sIt, const SphereData* sEnd) +{ + BoundingBox<Simd4f> result = bbox; + for(; sIt != sEnd; ++sIt) + { + Simd4f p = loadAligned(array(sIt->center)); + Simd4f r = splat<3>(p); + result.mLower = min(result.mLower, p - r); + result.mUpper = max(result.mUpper, p + r); + } + return result; +} +} +} + +namespace +{ +template <typename Simd4f, typename SrcIterator> +void generateSpheres(Simd4f* dIt, const SrcIterator& src, uint32_t count) +{ + // have to copy out iterator to ensure alignment is maintained + for(SrcIterator sIt = src; 0 < count--; ++sIt, ++dIt) + *dIt = max(sMinusFloatMaxXYZ, *sIt); // clamp radius to 0 +} + +void generateCones(cloth::ConeData* dst, const cloth::SphereData* 
sourceSpheres, const cloth::IndexPair* capsuleIndices, + uint32_t numCones) +{ + cloth::ConeData* cIt = dst; + for(const cloth::IndexPair* iIt = capsuleIndices, *iEnd = iIt + numCones; iIt != iEnd; ++iIt, ++cIt) + { + PxVec4 first = reinterpret_cast<const PxVec4&>(sourceSpheres[iIt->first]); + PxVec4 second = reinterpret_cast<const PxVec4&>(sourceSpheres[iIt->second]); + + PxVec4 center = (second + first) * 0.5f; + PxVec4 axis = (second - first) * 0.5f; + + float sqrAxisLength = axis.x * axis.x + axis.y * axis.y + axis.z * axis.z; + float sqrConeLength = sqrAxisLength - cloth::sqr(axis.w); + + float invAxisLength = 1 / sqrtf(sqrAxisLength); + float invConeLength = 1 / sqrtf(sqrConeLength); + + if(sqrConeLength <= 0.0f) + invAxisLength = invConeLength = 0.0f; + + float axisLength = sqrAxisLength * invAxisLength; + float slope = axis.w * invConeLength; + + cIt->center = PxVec3(center.x, center.y, center.z); + cIt->radius = (axis.w + first.w) * invConeLength * axisLength; + cIt->axis = PxVec3(axis.x, axis.y, axis.z) * invAxisLength; + cIt->slope = slope; + + cIt->sqrCosine = 1.0f - cloth::sqr(axis.w * invAxisLength); + cIt->halfLength = axisLength; + + uint32_t firstMask = 0x1u << iIt->first; + cIt->firstMask = firstMask; + cIt->bothMask = firstMask | 0x1u << iIt->second; + } +} + +template <typename Simd4f, typename SrcIterator> +void generatePlanes(Simd4f* dIt, const SrcIterator& src, uint32_t count) +{ + // have to copy out iterator to ensure alignment is maintained + for(SrcIterator sIt = src; 0 < count--; ++sIt, ++dIt) + *dIt = *sIt; +} + +template <typename Simd4f, typename SrcIterator> +void generateTriangles(cloth::TriangleData* dIt, const SrcIterator& src, uint32_t count) +{ + // have to copy out iterator to ensure alignment is maintained + for(SrcIterator sIt = src; 0 < count--; ++dIt) + { + Simd4f p0 = *sIt; + ++sIt; + Simd4f p1 = *sIt; + ++sIt; + Simd4f p2 = *sIt; + ++sIt; + + Simd4f edge0 = p1 - p0; + Simd4f edge1 = p2 - p0; + Simd4f normal = 
cross3(edge0, edge1); + + Simd4f edge0SqrLength = dot3(edge0, edge0); + Simd4f edge1SqrLength = dot3(edge1, edge1); + Simd4f edge0DotEdge1 = dot3(edge0, edge1); + Simd4f normalInvLength = rsqrt(dot3(normal, normal)); + + Simd4f det = edge0SqrLength * edge1SqrLength - edge0DotEdge1 * edge0DotEdge1; + Simd4f denom = edge0SqrLength + edge1SqrLength - edge0DotEdge1 - edge0DotEdge1; + + // there are definitely faster ways... + Simd4f aux = select(sMaskX, det, denom); + aux = select(sMaskZ, edge0SqrLength, aux); + aux = select(sMaskW, edge1SqrLength, aux); + + storeAligned(&dIt->base.x, select(sMaskW, edge0DotEdge1, p0)); + storeAligned(&dIt->edge0.x, select(sMaskW, edge0SqrLength, edge0)); + storeAligned(&dIt->edge1.x, select(sMaskW, edge1SqrLength, edge1)); + storeAligned(&dIt->normal.x, normal * normalInvLength); + storeAligned(&dIt->det, recip<1>(aux)); + } +} + +} // namespace + +template <typename Simd4f> +cloth::SwCollision<Simd4f>::CollisionData::CollisionData() +: mSpheres(0), mCones(0) +{ +} + +template <typename Simd4f> +cloth::SwCollision<Simd4f>::SwCollision(SwClothData& clothData, SwKernelAllocator& alloc) +: mClothData(clothData), mAllocator(alloc) +{ + allocate(mCurData); + + if(mClothData.mEnableContinuousCollision || mClothData.mFrictionScale > 0.0f) + { + allocate(mPrevData); + + generateSpheres(reinterpret_cast<Simd4f*>(mPrevData.mSpheres), + reinterpret_cast<const Simd4f*>(clothData.mStartCollisionSpheres), clothData.mNumSpheres); + + generateCones(mPrevData.mCones, mPrevData.mSpheres, clothData.mCapsuleIndices, clothData.mNumCapsules); + } +} + +template <typename Simd4f> +cloth::SwCollision<Simd4f>::~SwCollision() +{ + deallocate(mCurData); + deallocate(mPrevData); +} + +template <typename Simd4f> +void cloth::SwCollision<Simd4f>::operator()(const IterationState<Simd4f>& state) +{ + mNumCollisions = 0; + + collideConvexes(state); // discrete convex collision, no friction + collideTriangles(state); // discrete triangle collision, no friction + + 
computeBounds(); + + if(!mClothData.mNumSpheres) + return; + + bool lastIteration = state.mRemainingIterations == 1; + + const Simd4f* targetSpheres = reinterpret_cast<const Simd4f*>(mClothData.mTargetCollisionSpheres); + + // generate sphere and cone collision data + if(!lastIteration) + { + // interpolate spheres + LerpIterator<Simd4f, const Simd4f*> pIter(reinterpret_cast<const Simd4f*>(mClothData.mStartCollisionSpheres), + targetSpheres, state.getCurrentAlpha()); + generateSpheres(reinterpret_cast<Simd4f*>(mCurData.mSpheres), pIter, mClothData.mNumSpheres); + } + else + { + // otherwise use the target spheres directly + generateSpheres(reinterpret_cast<Simd4f*>(mCurData.mSpheres), targetSpheres, mClothData.mNumSpheres); + } + + // generate cones even if test below fails because + // continuous collision might need it in next iteration + generateCones(mCurData.mCones, mCurData.mSpheres, mClothData.mCapsuleIndices, mClothData.mNumCapsules); + + if(buildAcceleration()) + { + if(mClothData.mEnableContinuousCollision) + collideContinuousParticles(); + + mergeAcceleration(reinterpret_cast<uint32_t*>(mSphereGrid)); + mergeAcceleration(reinterpret_cast<uint32_t*>(mConeGrid)); + + if(!mClothData.mEnableContinuousCollision) + collideParticles(); + + collideVirtualParticles(); + } + + if(mPrevData.mSpheres) + shdfnd::swap(mCurData, mPrevData); +} + +template <typename Simd4f> +size_t cloth::SwCollision<Simd4f>::estimateTemporaryMemory(const SwCloth& cloth) +{ + size_t numTriangles = cloth.mStartCollisionTriangles.size(); + size_t numPlanes = cloth.mStartCollisionPlanes.size(); + + const size_t kTriangleDataSize = sizeof(TriangleData) * numTriangles; + const size_t kPlaneDataSize = sizeof(PxVec4) * numPlanes * 2; + + return PxMax(kTriangleDataSize, kPlaneDataSize); +} + +template <typename Simd4f> +size_t cloth::SwCollision<Simd4f>::estimatePersistentMemory(const SwCloth& cloth) +{ + size_t numCapsules = cloth.mCapsuleIndices.size(); + size_t numSpheres = 
	    cloth.mStartCollisionSpheres.size();

	size_t sphereDataSize = sizeof(SphereData) * numSpheres * 2;
	size_t coneDataSize = sizeof(ConeData) * numCapsules * 2;

	return sphereDataSize + coneDataSize;
}

// allocate one CollisionData set (spheres + cones) from the kernel allocator
template <typename Simd4f>
void cloth::SwCollision<Simd4f>::allocate(CollisionData& data)
{
	data.mSpheres = static_cast<SphereData*>(mAllocator.allocate(sizeof(SphereData) * mClothData.mNumSpheres));

	data.mCones = static_cast<ConeData*>(mAllocator.allocate(sizeof(ConeData) * mClothData.mNumCapsules));
}

template <typename Simd4f>
void cloth::SwCollision<Simd4f>::deallocate(const CollisionData& data)
{
	mAllocator.deallocate(data.mSpheres);
	mAllocator.deallocate(data.mCones);
}

// compute current particle bounds and sanitize particles whose components
// exceed float range by restoring the previous position
template <typename Simd4f>
void cloth::SwCollision<Simd4f>::computeBounds()
{
	PX_PROFILE_ZONE("cloth::SwSolverKernel::computeBounds", 0);

	Simd4f* prevIt = reinterpret_cast<Simd4f*>(mClothData.mPrevParticles);
	Simd4f* curIt = reinterpret_cast<Simd4f*>(mClothData.mCurParticles);
	Simd4f* curEnd = curIt + mClothData.mNumParticles;
	Simd4f floatMaxXYZ = -static_cast<Simd4f>(sMinusFloatMaxXYZ);

	Simd4f lower = simd4f(FLT_MAX), upper = -lower;
	for(; curIt < curEnd; ++curIt, ++prevIt)
	{
		Simd4f current = *curIt;
		lower = min(lower, current);
		upper = max(upper, current);
		// if(current.w > 0) current.w = previous.w
		*curIt = select(current > floatMaxXYZ, *prevIt, current);
	}

	BoundingBox<Simd4f> curBounds;
	curBounds.mLower = lower;
	curBounds.mUpper = upper;

	// don't change this order, storeBounds writes 7 floats
	BoundingBox<Simd4f> prevBounds = loadBounds<Simd4f>(mClothData.mCurBounds);
	storeBounds(mClothData.mCurBounds, curBounds);
	storeBounds(mClothData.mPrevBounds, prevBounds);
}

namespace
{
// per-lane test: true where left has no bits outside right (left & ~right == 0)
template <typename Simd4i>
Simd4i andNotIsZero(const Simd4i& left, const Simd4i& right)
{
	return (left & ~right) == gSimd4iZero;
}
}

// build per-axis mask arrays of spheres on the right/left of grid cell
template <typename
          Simd4f>
void cloth::SwCollision<Simd4f>::buildSphereAcceleration(const SphereData* sIt)
{
	static const int maxIndex = sGridSize - 1;

	// one bit per sphere; ORed into every grid cell the sphere can touch
	const SphereData* sEnd = sIt + mClothData.mNumSpheres;
	for(uint32_t mask = 0x1; sIt != sEnd; ++sIt, mask <<= 1)
	{
		Simd4f sphere = loadAligned(array(sIt->center));
		Simd4f radius = splat<3>(sphere);

		// first/last grid cell overlapped by the sphere, per axis, clamped to the grid
		Simd4i first = intFloor(max((sphere - radius) * mGridScale + mGridBias, gSimd4fZero));
		Simd4i last = intFloor(min((sphere + radius) * mGridScale + mGridBias, sGridLength));

		const int* firstIdx = array(first);
		const int* lastIdx = array(last);

		uint32_t* firstIt = reinterpret_cast<uint32_t*>(mSphereGrid);
		uint32_t* lastIt = firstIt + 3 * sGridSize;

		// mark the sphere in the suffix (firstIt) and prefix (lastIt) arrays of
		// each of the three axes; their AND later gives the exact overlap range
		for(uint32_t i = 0; i < 3; ++i, firstIt += sGridSize, lastIt += sGridSize)
		{
			for(int j = firstIdx[i]; j <= maxIndex; ++j)
				firstIt[j] |= mask;

			for(int j = lastIdx[i]; j >= 0; --j)
				lastIt[j] |= mask;
		}
	}
}

// generate cone masks from sphere masks
template <typename Simd4f>
void cloth::SwCollision<Simd4f>::buildConeAcceleration()
{
	const ConeData* coneIt = mCurData.mCones;
	const ConeData* coneEnd = coneIt + mClothData.mNumCapsules;
	for(uint32_t coneMask = 0x1; coneIt != coneEnd; ++coneIt, coneMask <<= 1)
	{
		// degenerate cone: nothing to collide against
		if(coneIt->radius == 0.0f)
			continue;

		uint32_t spheresMask = coneIt->bothMask;

		// a cone occupies every cell either of its two spheres occupies
		uint32_t* sphereIt = reinterpret_cast<uint32_t*>(mSphereGrid);
		uint32_t* sphereEnd = sphereIt + 6 * sGridSize;
		uint32_t* gridIt = reinterpret_cast<uint32_t*>(mConeGrid);
		for(; sphereIt != sphereEnd; ++sphereIt, ++gridIt)
			if(*sphereIt & spheresMask)
				*gridIt |= coneMask;
	}
}

// convert right/left mask arrays into single overlap array
template <typename Simd4f>
void cloth::SwCollision<Simd4f>::mergeAcceleration(uint32_t* firstIt)
{
	// in-place AND of the suffix array with the prefix array of each axis
	uint32_t* firstEnd = firstIt + 3 * sGridSize;
	uint32_t* lastIt = firstEnd;
	for(; firstIt != firstEnd; ++firstIt, ++lastIt)
		*firstIt &= *lastIt;
}

// build mask of spheres/cones touching a regular grid along each axis
// (see above: regular grid along each axis)
// returns false when particle and shape bounds do not overlap (no collision possible)
template <typename Simd4f>
bool cloth::SwCollision<Simd4f>::buildAcceleration()
{
	// determine sphere bbox
	BoundingBox<Simd4f> sphereBounds =
	    expandBounds(emptyBounds<Simd4f>(), mCurData.mSpheres, mCurData.mSpheres + mClothData.mNumSpheres);
	BoundingBox<Simd4f> particleBounds = loadBounds<Simd4f>(mClothData.mCurBounds);
	if(mClothData.mEnableContinuousCollision)
	{
		// ccd sweeps from the previous pose, so previous shapes/particles count too
		sphereBounds = expandBounds(sphereBounds, mPrevData.mSpheres, mPrevData.mSpheres + mClothData.mNumSpheres);
		particleBounds = expandBounds(particleBounds, loadBounds<Simd4f>(mClothData.mPrevBounds));
	}

	BoundingBox<Simd4f> bounds = intersectBounds(sphereBounds, particleBounds);
	Simd4f edgeLength = (bounds.mUpper - bounds.mLower) & ~static_cast<Simd4f>(sMaskW);
	if(!allGreaterEqual(edgeLength, gSimd4fZero))
		return false; // empty intersection: nothing can collide

	// calculate an expanded bounds to account for numerical inaccuracy
	const Simd4f expandedLower = bounds.mLower - abs(bounds.mLower) * sGridExpand;
	const Simd4f expandedUpper = bounds.mUpper + abs(bounds.mUpper) * sGridExpand;
	const Simd4f expandedEdgeLength = max(expandedUpper - expandedLower, gSimd4fEpsilon);

	// make grid minimal thickness and strict upper bound of spheres
	mGridScale = sGridLength * recip<1>(expandedEdgeLength);
	mGridBias = -expandedLower * mGridScale;
	array(mGridBias)[3] = 1.0f; // needed for collideVirtualParticles()

	PX_ASSERT(allTrue(((bounds.mLower * mGridScale + mGridBias) >= simd4f(0.0f)) | sMaskW));
	PX_ASSERT(allTrue(((bounds.mUpper * mGridScale + mGridBias) < simd4f(8.0f)) | sMaskW));

	memset(mSphereGrid, 0, sizeof(uint32_t) * 6 * (sGridSize));
	if(mClothData.mEnableContinuousCollision)
		buildSphereAcceleration(mPrevData.mSpheres);
	buildSphereAcceleration(mCurData.mSpheres);

	memset(mConeGrid, 0, sizeof(uint32_t) * 6 * (sGridSize));
	buildConeAcceleration();

	return true;
}

#ifdef _MSC_VER
#define FORCE_INLINE __forceinline
#else
#define FORCE_INLINE inline __attribute__((always_inline))
#endif

// NOTE(review): identical to the implicitly-generated assignment; presumably
// kept explicit for the FORCE_INLINE attribute — confirm before removing
template <typename Simd4f>
FORCE_INLINE typename cloth::SwCollision<Simd4f>::ShapeMask& cloth::SwCollision<Simd4f>::ShapeMask::
operator=(const ShapeMask& right)
{
	mCones = right.mCones;
	mSpheres = right.mSpheres;
	return *this;
}

// component-wise AND of the cone and sphere bit masks
template <typename Simd4f>
FORCE_INLINE typename cloth::SwCollision<Simd4f>::ShapeMask& cloth::SwCollision<Simd4f>::ShapeMask::
operator&=(const ShapeMask& right)
{
	mCones = mCones & right.mCones;
	mSpheres = mSpheres & right.mSpheres;
	return *this;
}

// gather the shape masks of the grid cells at the (already scaled) positions
template <typename Simd4f>
FORCE_INLINE typename cloth::SwCollision<Simd4f>::ShapeMask
cloth::SwCollision<Simd4f>::getShapeMask(const Simd4f& position, const Simd4i* __restrict sphereGrid,
                                         const Simd4i* __restrict coneGrid)
{
	Gather<Simd4i> gather(intFloor(position));

	ShapeMask result;
	result.mCones = gather(coneGrid);
	result.mSpheres = gather(sphereGrid);
	return result;
}

// lookup acceleration structure and return mask of potential intersectors
template <typename Simd4f>
FORCE_INLINE typename cloth::SwCollision<Simd4f>::ShapeMask
cloth::SwCollision<Simd4f>::getShapeMask(const Simd4f* __restrict positions) const
{
	// map world positions to grid coordinates, then AND the per-axis masks
	Simd4f posX = positions[0] * splat<0>(mGridScale) + splat<0>(mGridBias);
	Simd4f posY = positions[1] * splat<1>(mGridScale) + splat<1>(mGridBias);
	Simd4f posZ = positions[2] * splat<2>(mGridScale) + splat<2>(mGridBias);

	ShapeMask result = getShapeMask(posX, mSphereGrid, mConeGrid);
	result &= getShapeMask(posY, mSphereGrid + 2, mConeGrid + 2);
	result &= getShapeMask(posZ, mSphereGrid + 4, mConeGrid + 4);

	return result;
}

// lookup acceleration structure and return mask of potential intersectors
// (swept version: uses the prev->cur motion's per-axis min/max cells)
template <typename Simd4f>
FORCE_INLINE typename cloth::SwCollision<Simd4f>::ShapeMask
cloth::SwCollision<Simd4f>::getShapeMask(const Simd4f* __restrict prevPos, const Simd4f* __restrict curPos) const
{
	Simd4f scaleX = splat<0>(mGridScale);
	Simd4f scaleY =
	                splat<1>(mGridScale);
	Simd4f scaleZ = splat<2>(mGridScale);

	Simd4f biasX = splat<0>(mGridBias);
	Simd4f biasY = splat<1>(mGridBias);
	Simd4f biasZ = splat<2>(mGridBias);

	Simd4f prevX = prevPos[0] * scaleX + biasX;
	Simd4f prevY = prevPos[1] * scaleY + biasY;
	Simd4f prevZ = prevPos[2] * scaleZ + biasZ;

	Simd4f curX = curPos[0] * scaleX + biasX;
	Simd4f curY = curPos[1] * scaleY + biasY;
	Simd4f curZ = curPos[2] * scaleZ + biasZ;

	// max corner of the swept AABB, clamped to the grid
	Simd4f maxX = min(max(prevX, curX), sGridLength);
	Simd4f maxY = min(max(prevY, curY), sGridLength);
	Simd4f maxZ = min(max(prevZ, curZ), sGridLength);

	// suffix arrays (offset 0/2/4): shapes at-or-right-of the max cell
	ShapeMask result = getShapeMask(maxX, mSphereGrid, mConeGrid);
	result &= getShapeMask(maxY, mSphereGrid + 2, mConeGrid + 2);
	result &= getShapeMask(maxZ, mSphereGrid + 4, mConeGrid + 4);

	Simd4f zero = gSimd4fZero;
	Simd4f minX = max(min(prevX, curX), zero);
	Simd4f minY = max(min(prevY, curY), zero);
	Simd4f minZ = max(min(prevZ, curZ), zero);

	// prefix arrays (offset 6/8/10): shapes at-or-left-of the min cell
	result &= getShapeMask(minX, mSphereGrid + 6, mConeGrid + 6);
	result &= getShapeMask(minY, mSphereGrid + 8, mConeGrid + 8);
	result &= getShapeMask(minZ, mSphereGrid + 10, mConeGrid + 10);

	return result;
}

// accumulates per-lane collision impulses and shape velocities for 4 particles
// processed in SoA form; mNumCollisions counts contacts per lane (seeded with
// epsilon so the later anyGreater(mNumCollisions, epsilon) test is well-defined)
template <typename Simd4f>
struct cloth::SwCollision<Simd4f>::ImpulseAccumulator
{
	ImpulseAccumulator()
	: mDeltaX(gSimd4fZero)
	, mDeltaY(mDeltaX)
	, mDeltaZ(mDeltaX)
	, mVelX(mDeltaX)
	, mVelY(mDeltaX)
	, mVelZ(mDeltaX)
	, mNumCollisions(gSimd4fEpsilon)
	{
	}

	void add(const Simd4f& x, const Simd4f& y, const Simd4f& z, const Simd4f& scale, const Simd4f& mask)
	{
		// x == x style asserts catch NaNs in the masked lanes
		PX_ASSERT(allTrue((mask & x) == (mask & x)));
		PX_ASSERT(allTrue((mask & y) == (mask & y)));
		PX_ASSERT(allTrue((mask & z) == (mask & z)));
		PX_ASSERT(allTrue((mask & scale) == (mask & scale)));

		Simd4f maskedScale = scale & mask;
		mDeltaX = mDeltaX + x * maskedScale;
		mDeltaY = mDeltaY + y * maskedScale;
		mDeltaZ = mDeltaZ + z * maskedScale;
		mNumCollisions = mNumCollisions + (gSimd4fOne & mask);
	}

	// accumulate shape velocity for friction, only in contacting lanes
	void addVelocity(const Simd4f& vx, const Simd4f& vy, const Simd4f& vz, const Simd4f& mask)
	{
		// NaN checks on the masked lanes
		PX_ASSERT(allTrue((mask & vx) == (mask & vx)));
		PX_ASSERT(allTrue((mask & vy) == (mask & vy)));
		PX_ASSERT(allTrue((mask & vz) == (mask & vz)));

		mVelX = mVelX + (vx & mask);
		mVelY = mVelY + (vy & mask);
		mVelZ = mVelZ + (vz & mask);
	}

	// same as add() but with negated direction
	void subtract(const Simd4f& x, const Simd4f& y, const Simd4f& z, const Simd4f& scale, const Simd4f& mask)
	{
		PX_ASSERT(allTrue((mask & x) == (mask & x)));
		PX_ASSERT(allTrue((mask & y) == (mask & y)));
		PX_ASSERT(allTrue((mask & z) == (mask & z)));
		PX_ASSERT(allTrue((mask & scale) == (mask & scale)));

		Simd4f maskedScale = scale & mask;
		mDeltaX = mDeltaX - x * maskedScale;
		mDeltaY = mDeltaY - y * maskedScale;
		mDeltaZ = mDeltaZ - z * maskedScale;
		mNumCollisions = mNumCollisions + (gSimd4fOne & mask);
	}

	Simd4f mDeltaX, mDeltaY, mDeltaZ;
	Simd4f mVelX, mVelY, mVelZ;
	Simd4f mNumCollisions;
};

// discrete sphere collision of 4 particles (SoA); sphereMask selects the
// candidate spheres (bit i = sphere i), typically pre-filtered by collideCones
template <typename Simd4f>
FORCE_INLINE void cloth::SwCollision<Simd4f>::collideSpheres(const Simd4i& sphereMask, const Simd4f* positions,
                                                             ImpulseAccumulator& accum) const
{
	const float* __restrict spherePtr = array(mCurData.mSpheres->center);

	bool frictionEnabled = mClothData.mFrictionScale > 0.0f;

	// iterate over set bits: mask & ~(mask-1) isolates the lowest one
	Simd4i mask4 = horizontalOr(sphereMask);
	uint32_t mask = uint32_t(array(mask4)[0]);
	while(mask)
	{
		uint32_t test = mask - 1;
		uint32_t offset = findBitSet(mask & ~test) * sizeof(SphereData);
		mask = mask & test;

		Simd4f sphere = loadAligned(spherePtr, offset);

		Simd4f deltaX = positions[0] - splat<0>(sphere);
		Simd4f deltaY = positions[1] - splat<1>(sphere);
		Simd4f deltaZ = positions[2] - splat<2>(sphere);

		// epsilon keeps rsqrt finite when the particle sits on the center
		Simd4f sqrDistance = gSimd4fEpsilon + deltaX * deltaX + deltaY * deltaY + deltaZ * deltaZ;
		Simd4f negativeScale = gSimd4fOne - rsqrt(sqrDistance) * splat<3>(sphere);

		// negativeScale < 0 means the particle is inside the sphere
		Simd4f contactMask;
		if(!anyGreater(gSimd4fZero, negativeScale, contactMask))
			continue;

		accum.subtract(deltaX, deltaY, deltaZ, negativeScale, contactMask);

		if(frictionEnabled)
		{
			// load previous sphere pos
			const float* __restrict prevSpherePtr = array(mPrevData.mSpheres->center);

			Simd4f prevSphere = loadAligned(prevSpherePtr, offset);
			Simd4f velocity = sphere - prevSphere;

			accum.addVelocity(splat<0>(velocity), splat<1>(velocity), splat<2>(velocity), contactMask);
		}
	}
}

// discrete capsule (tapered cone) collision of 4 particles (SoA); returns the
// sphere mask with bits cleared for spheres that cannot contact either
template <typename Simd4f>
FORCE_INLINE typename cloth::SwCollision<Simd4f>::Simd4i
cloth::SwCollision<Simd4f>::collideCones(const Simd4f* __restrict positions, ImpulseAccumulator& accum) const
{
	const float* __restrict centerPtr = array(mCurData.mCones->center);
	const float* __restrict axisPtr = array(mCurData.mCones->axis);
	const int32_t* __restrict auxiliaryPtr = reinterpret_cast<const int32_t*>(&mCurData.mCones->sqrCosine);

	bool frictionEnabled = mClothData.mFrictionScale > 0.0f;

	ShapeMask shapeMask = getShapeMask(positions);
	Simd4i mask4 = horizontalOr(shapeMask.mCones);
	uint32_t mask = uint32_t(array(mask4)[0]);
	while(mask)
	{
		// extract the lowest set bit = next candidate cone
		uint32_t test = mask - 1;
		uint32_t coneIndex = findBitSet(mask & ~test);
		uint32_t offset = coneIndex * sizeof(ConeData);
		mask = mask & test;

		// culled: lanes where this cone was not a candidate for that particle
		Simd4i test4 = mask4 - gSimd4iOne;
		Simd4f culled = simd4f(andNotIsZero(shapeMask.mCones, test4));
		mask4 = mask4 & test4;

		Simd4f center = loadAligned(centerPtr, offset);

		Simd4f deltaX = positions[0] - splat<0>(center);
		Simd4f deltaY = positions[1] - splat<1>(center);
		Simd4f deltaZ = positions[2] - splat<2>(center);

		Simd4f axis = loadAligned(axisPtr, offset);

		Simd4f axisX = splat<0>(axis);
		Simd4f axisY = splat<1>(axis);
		Simd4f axisZ = splat<2>(axis);
		Simd4f slope = splat<3>(axis);

		// radius of the tapered cone at the particle's axial position
		Simd4f dot = deltaX * axisX + deltaY * axisY + deltaZ * axisZ;
		Simd4f radius = dot * slope + splat<3>(center);

		// set radius to zero if cone is culled
		radius = max(radius, gSimd4fZero) & ~culled;

		Simd4f sqrDistance = deltaX * deltaX +
		                     deltaY * deltaY + deltaZ * deltaZ - dot * dot;

		Simd4i auxiliary = loadAligned(auxiliaryPtr, offset);
		Simd4i bothMask = splat<3>(auxiliary);

		Simd4f contactMask;
		if(!anyGreater(radius * radius, sqrDistance, contactMask))
		{
			// cone only culled when spheres culled, ok to clear those too
			shapeMask.mSpheres = shapeMask.mSpheres & ~bothMask;
			continue;
		}

		// clamp to a small positive epsilon to avoid numerical error
		// making sqrDistance negative when point lies on the cone axis
		sqrDistance = max(sqrDistance, gSimd4fEpsilon);

		Simd4f invDistance = rsqrt(sqrDistance);
		Simd4f base = dot + slope * sqrDistance * invDistance;

		// force left/rightMask to false if not inside cone
		base = base & contactMask;

		// particles beyond either cap are handled by sphere collision instead
		Simd4f halfLength = splat<1>(simd4f(auxiliary));
		Simd4i leftMask = simd4i(base < -halfLength);
		Simd4i rightMask = simd4i(base > halfLength);

		// we use both mask because of the early out above.
		Simd4i firstMask = splat<2>(auxiliary);
		Simd4i secondMask = firstMask ^ bothMask;
		shapeMask.mSpheres = shapeMask.mSpheres & ~(firstMask & ~leftMask);
		shapeMask.mSpheres = shapeMask.mSpheres & ~(secondMask & ~rightMask);

		// push-out direction: perpendicular offset from the cone axis
		deltaX = deltaX - base * axisX;
		deltaY = deltaY - base * axisY;
		deltaZ = deltaZ - base * axisZ;

		Simd4f sqrCosine = splat<0>(simd4f(auxiliary));
		Simd4f scale = radius * invDistance * sqrCosine - sqrCosine;

		contactMask = contactMask & ~simd4f(leftMask | rightMask);

		if(!anyTrue(contactMask))
			continue;

		accum.add(deltaX, deltaY, deltaZ, scale, contactMask);

		if(frictionEnabled)
		{
			uint32_t s0 = mClothData.mCapsuleIndices[coneIndex].first;
			uint32_t s1 = mClothData.mCapsuleIndices[coneIndex].second;

			float* prevSpheres = reinterpret_cast<float*>(mPrevData.mSpheres);
			float* curSpheres = reinterpret_cast<float*>(mCurData.mSpheres);

			// todo: could pre-compute sphere velocities or it might be
			// faster to compute cur/prev sphere positions directly
			Simd4f s0p0 = loadAligned(prevSpheres, s0 * sizeof(SphereData));
			Simd4f s0p1 = loadAligned(curSpheres, s0 * sizeof(SphereData));

			Simd4f s1p0 = loadAligned(prevSpheres, s1 * sizeof(SphereData));
			Simd4f s1p1 = loadAligned(curSpheres, s1 * sizeof(SphereData));

			Simd4f v0 = s0p1 - s0p0;
			Simd4f v1 = s1p1 - s1p0;
			Simd4f vd = v1 - v0;

			// dot is in the range -1 to 1, scale and bias to 0 to 1
			dot = dot * gSimd4fHalf + gSimd4fHalf;

			// interpolate velocity at contact points
			Simd4f vx = splat<0>(v0) + dot * splat<0>(vd);
			Simd4f vy = splat<1>(v0) + dot * splat<1>(vd);
			Simd4f vz = splat<2>(v0) + dot * splat<2>(vd);

			accum.addVelocity(vx, vy, vz, contactMask);
		}
	}

	return shapeMask.mSpheres;
}

// continuous (swept) sphere collision: solves the quadratic for the time of
// impact between the moving particle and the moving/scaling sphere, rewinds
// the particle, then applies the usual discrete push-out
template <typename Simd4f>
FORCE_INLINE void cloth::SwCollision<Simd4f>::collideSpheres(const Simd4i& sphereMask, const Simd4f* __restrict prevPos,
                                                             Simd4f* __restrict curPos, ImpulseAccumulator& accum) const
{
	const float* __restrict prevSpheres = array(mPrevData.mSpheres->center);
	const float* __restrict curSpheres = array(mCurData.mSpheres->center);

	bool frictionEnabled = mClothData.mFrictionScale > 0.0f;

	// iterate over candidate spheres, lowest set bit first
	Simd4i mask4 = horizontalOr(sphereMask);
	uint32_t mask = uint32_t(array(mask4)[0]);
	while(mask)
	{
		uint32_t test = mask - 1;
		uint32_t offset = findBitSet(mask & ~test) * sizeof(SphereData);
		mask = mask & test;

		// particle position relative to the sphere, at begin and end of step
		Simd4f prevSphere = loadAligned(prevSpheres, offset);
		Simd4f prevX = prevPos[0] - splat<0>(prevSphere);
		Simd4f prevY = prevPos[1] - splat<1>(prevSphere);
		Simd4f prevZ = prevPos[2] - splat<2>(prevSphere);
		Simd4f prevRadius = splat<3>(prevSphere);

		Simd4f curSphere = loadAligned(curSpheres, offset);
		Simd4f curX = curPos[0] - splat<0>(curSphere);
		Simd4f curY = curPos[1] - splat<1>(curSphere);
		Simd4f curZ = curPos[2] - splat<2>(curSphere);
		Simd4f curRadius = splat<3>(curSphere);

		Simd4f sqrDistance = gSimd4fEpsilon + curX * curX + curY * curY + curZ * curZ;

		Simd4f dotPrevPrev = prevX * prevX + prevY
		                     * prevY + prevZ * prevZ - prevRadius * prevRadius;
		Simd4f dotPrevCur = prevX * curX + prevY * curY + prevZ * curZ - prevRadius * curRadius;
		Simd4f dotCurCur = sqrDistance - curRadius * curRadius;

		// quadratic in toi: a*t^2 + 2*halfB*t + c, written via the dot products
		Simd4f discriminant = dotPrevCur * dotPrevCur - dotCurCur * dotPrevPrev;
		Simd4f sqrtD = sqrt(discriminant);
		Simd4f halfB = dotPrevCur - dotPrevPrev;
		Simd4f minusA = dotPrevCur - dotCurCur + halfB;

		// time of impact or 0 if prevPos inside sphere
		Simd4f toi = recip(minusA) * min(gSimd4fZero, halfB + sqrtD);
		Simd4f collisionMask = (toi < gSimd4fOne) & (halfB < sqrtD);

		// skip continuous collision if the (un-clamped) particle
		// trajectory only touches the outer skin of the cone.
		// NOTE(review): comment says "cone" but this is the sphere routine —
		// presumably copied from the cone version; verify intent
		Simd4f rMin = prevRadius + halfB * minusA * (curRadius - prevRadius);
		collisionMask = collisionMask & (discriminant > minusA * rMin * rMin * sSkeletonWidth);

		// a is negative when one sphere is contained in the other,
		// which is already handled by discrete collision.
		collisionMask = collisionMask & (minusA < -static_cast<Simd4f>(gSimd4fEpsilon));

		if(!allEqual(collisionMask, gSimd4fZero))
		{
			Simd4f deltaX = prevX - curX;
			Simd4f deltaY = prevY - curY;
			Simd4f deltaZ = prevZ - curZ;

			Simd4f oneMinusToi = (gSimd4fOne - toi) & collisionMask;

			// reduce ccd impulse if (clamped) particle trajectory stays in sphere skin,
			// i.e. scale by exp2(-k) or 1/(1+k) with k = (tmin - toi) / (1 - toi)
			Simd4f minusK = sqrtD * recip(minusA * oneMinusToi) & (oneMinusToi > gSimd4fEpsilon);
			oneMinusToi = oneMinusToi * recip(gSimd4fOne - minusK);

			// rewind the colliding lanes back toward the impact position
			curX = curX + deltaX * oneMinusToi;
			curY = curY + deltaY * oneMinusToi;
			curZ = curZ + deltaZ * oneMinusToi;

			curPos[0] = splat<0>(curSphere) + curX;
			curPos[1] = splat<1>(curSphere) + curY;
			curPos[2] = splat<2>(curSphere) + curZ;

			sqrDistance = gSimd4fEpsilon + curX * curX + curY * curY + curZ * curZ;
		}

		// discrete push-out on the (possibly rewound) position
		Simd4f negativeScale = gSimd4fOne - rsqrt(sqrDistance) * curRadius;

		Simd4f contactMask;
		if(!anyGreater(gSimd4fZero, negativeScale, contactMask))
			continue;

		accum.subtract(curX, curY, curZ, negativeScale, contactMask);

		if(frictionEnabled)
		{
			Simd4f velocity = curSphere - prevSphere;
			accum.addVelocity(splat<0>(velocity), splat<1>(velocity), splat<2>(velocity), contactMask);
		}
	}
}

// continuous (swept) capsule/cone collision: like the swept sphere version,
// but in the cross-product space of the (interpolated) cone axis
template <typename Simd4f>
FORCE_INLINE typename cloth::SwCollision<Simd4f>::Simd4i
cloth::SwCollision<Simd4f>::collideCones(const Simd4f* __restrict prevPos, Simd4f* __restrict curPos,
                                         ImpulseAccumulator& accum) const
{
	const float* __restrict prevCenterPtr = array(mPrevData.mCones->center);
	const float* __restrict prevAxisPtr = array(mPrevData.mCones->axis);
	const int32_t* __restrict prevAuxiliaryPtr = reinterpret_cast<const int32_t*>(&mPrevData.mCones->sqrCosine);

	const float* __restrict curCenterPtr = array(mCurData.mCones->center);
	const float* __restrict curAxisPtr = array(mCurData.mCones->axis);
	const int32_t* __restrict curAuxiliaryPtr = reinterpret_cast<const int32_t*>(&mCurData.mCones->sqrCosine);

	bool frictionEnabled = mClothData.mFrictionScale > 0.0f;

	// swept-AABB grid lookup for candidate cones
	ShapeMask shapeMask = getShapeMask(prevPos, curPos);
	Simd4i mask4 = horizontalOr(shapeMask.mCones);
	uint32_t mask = uint32_t(array(mask4)[0]);
	while(mask)
	{
		uint32_t test = mask - 1;
		uint32_t coneIndex = findBitSet(mask & ~test);
		uint32_t
		         offset = coneIndex * sizeof(ConeData);
		mask = mask & test;

		// lanes for which this cone was not a candidate
		Simd4i test4 = mask4 - gSimd4iOne;
		Simd4f culled = simd4f(andNotIsZero(shapeMask.mCones, test4));
		mask4 = mask4 & test4;

		Simd4f prevCenter = loadAligned(prevCenterPtr, offset);
		Simd4f prevAxis = loadAligned(prevAxisPtr, offset);
		Simd4f prevAxisX = splat<0>(prevAxis);
		Simd4f prevAxisY = splat<1>(prevAxis);
		Simd4f prevAxisZ = splat<2>(prevAxis);
		Simd4f prevSlope = splat<3>(prevAxis);

		// previous relative position; (T,U,V) = delta x axis measures the
		// distance from the axis, dot the position along it
		Simd4f prevX = prevPos[0] - splat<0>(prevCenter);
		Simd4f prevY = prevPos[1] - splat<1>(prevCenter);
		Simd4f prevZ = prevPos[2] - splat<2>(prevCenter);
		Simd4f prevT = prevY * prevAxisZ - prevZ * prevAxisY;
		Simd4f prevU = prevZ * prevAxisX - prevX * prevAxisZ;
		Simd4f prevV = prevX * prevAxisY - prevY * prevAxisX;
		Simd4f prevDot = prevX * prevAxisX + prevY * prevAxisY + prevZ * prevAxisZ;
		Simd4f prevRadius = prevDot * prevSlope + splat<3>(prevCenter);

		Simd4f curCenter = loadAligned(curCenterPtr, offset);
		Simd4f curAxis = loadAligned(curAxisPtr, offset);
		Simd4f curAxisX = splat<0>(curAxis);
		Simd4f curAxisY = splat<1>(curAxis);
		Simd4f curAxisZ = splat<2>(curAxis);
		Simd4f curSlope = splat<3>(curAxis);
		Simd4i curAuxiliary = loadAligned(curAuxiliaryPtr, offset);

		// same quantities at the end of the step
		Simd4f curX = curPos[0] - splat<0>(curCenter);
		Simd4f curY = curPos[1] - splat<1>(curCenter);
		Simd4f curZ = curPos[2] - splat<2>(curCenter);
		Simd4f curT = curY * curAxisZ - curZ * curAxisY;
		Simd4f curU = curZ * curAxisX - curX * curAxisZ;
		Simd4f curV = curX * curAxisY - curY * curAxisX;
		Simd4f curDot = curX * curAxisX + curY * curAxisY + curZ * curAxisZ;
		Simd4f curRadius = curDot * curSlope + splat<3>(curCenter);

		Simd4f curSqrDistance = gSimd4fEpsilon + curT * curT + curU * curU + curV * curV;

		// set radius to zero if cone is culled
		prevRadius = max(prevRadius, gSimd4fZero) & ~culled;
		curRadius = max(curRadius, gSimd4fZero) & ~culled;

		// quadratic coefficients for the time-of-impact solve
		Simd4f dotPrevPrev = prevT * prevT + prevU * prevU +
		                     prevV * prevV - prevRadius * prevRadius;
		Simd4f dotPrevCur = prevT * curT + prevU * curU + prevV * curV - prevRadius * curRadius;
		Simd4f dotCurCur = curSqrDistance - curRadius * curRadius;

		Simd4f discriminant = dotPrevCur * dotPrevCur - dotCurCur * dotPrevPrev;
		Simd4f sqrtD = sqrt(discriminant);
		Simd4f halfB = dotPrevCur - dotPrevPrev;
		Simd4f minusA = dotPrevCur - dotCurCur + halfB;

		// time of impact or 0 if prevPos inside cone
		Simd4f toi = recip(minusA) * min(gSimd4fZero, halfB + sqrtD);
		Simd4f collisionMask = (toi < gSimd4fOne) & (halfB < sqrtD);

		// skip continuous collision if the (un-clamped) particle
		// trajectory only touches the outer skin of the cone.
		Simd4f rMin = prevRadius + halfB * minusA * (curRadius - prevRadius);
		collisionMask = collisionMask & (discriminant > minusA * rMin * rMin * sSkeletonWidth);

		// a is negative when one cone is contained in the other,
		// which is already handled by discrete collision.
		collisionMask = collisionMask & (minusA < -static_cast<Simd4f>(gSimd4fEpsilon));

		// test if any particle hits infinite cone (and 0<time of impact<1)
		if(!allEqual(collisionMask, gSimd4fZero))
		{
			Simd4f deltaX = prevX - curX;
			Simd4f deltaY = prevY - curY;
			Simd4f deltaZ = prevZ - curZ;

			// interpolate delta at toi
			Simd4f posX = prevX - deltaX * toi;
			Simd4f posY = prevY - deltaY * toi;
			Simd4f posZ = prevZ - deltaZ * toi;

			Simd4f curScaledAxis = curAxis * splat<1>(simd4f(curAuxiliary));
			Simd4i prevAuxiliary = loadAligned(prevAuxiliaryPtr, offset);
			Simd4f deltaScaledAxis = curScaledAxis - prevAxis * splat<1>(simd4f(prevAuxiliary));

			Simd4f oneMinusToi = gSimd4fOne - toi;

			// interpolate axis at toi
			Simd4f axisX = splat<0>(curScaledAxis) - splat<0>(deltaScaledAxis) * oneMinusToi;
			Simd4f axisY = splat<1>(curScaledAxis) - splat<1>(deltaScaledAxis) * oneMinusToi;
			Simd4f axisZ = splat<2>(curScaledAxis) - splat<2>(deltaScaledAxis) * oneMinusToi;
			Simd4f slope = (prevSlope *
			                oneMinusToi + curSlope * toi);

			Simd4f sqrHalfLength = axisX * axisX + axisY * axisY + axisZ * axisZ;
			Simd4f invHalfLength = rsqrt(sqrHalfLength);
			Simd4f dot = (posX * axisX + posY * axisY + posZ * axisZ) * invHalfLength;

			Simd4f sqrDistance = posX * posX + posY * posY + posZ * posZ - dot * dot;
			// mask keeps invDistance finite (zero) when the point lies on the axis
			Simd4f invDistance = rsqrt(sqrDistance) & (sqrDistance > gSimd4fZero);

			Simd4f base = dot + slope * sqrDistance * invDistance;
			Simd4f scale = base * invHalfLength & collisionMask;

			Simd4f cullMask = (abs(scale) < gSimd4fOne) & collisionMask;

			// test if any impact position is in cone section
			if(!allEqual(cullMask, gSimd4fZero))
			{
				deltaX = deltaX + splat<0>(deltaScaledAxis) * scale;
				deltaY = deltaY + splat<1>(deltaScaledAxis) * scale;
				deltaZ = deltaZ + splat<2>(deltaScaledAxis) * scale;

				oneMinusToi = oneMinusToi & cullMask;

				// reduce ccd impulse if (clamped) particle trajectory stays in cone skin,
				// i.e. scale by exp2(-k) or 1/(1+k) with k = (tmin - toi) / (1 - toi)
				// oneMinusToi = oneMinusToi * recip(gSimd4fOne - sqrtD * recip(minusA * oneMinusToi));
				Simd4f minusK = sqrtD * recip(minusA * oneMinusToi) & (oneMinusToi > gSimd4fEpsilon);
				oneMinusToi = oneMinusToi * recip(gSimd4fOne - minusK);

				// rewind colliding lanes toward the impact position
				curX = curX + deltaX * oneMinusToi;
				curY = curY + deltaY * oneMinusToi;
				curZ = curZ + deltaZ * oneMinusToi;

				// re-derive the discrete-collision quantities from the rewound position
				curDot = curX * curAxisX + curY * curAxisY + curZ * curAxisZ;
				curRadius = curDot * curSlope + splat<3>(curCenter);
				curRadius = max(curRadius, gSimd4fZero) & ~culled;
				curSqrDistance = curX * curX + curY * curY + curZ * curZ - curDot * curDot;

				curPos[0] = splat<0>(curCenter) + curX;
				curPos[1] = splat<1>(curCenter) + curY;
				curPos[2] = splat<2>(curCenter) + curZ;
			}
		}

		// curPos inside cone (discrete collision)
		Simd4f contactMask;
		int anyContact = anyGreater(curRadius * curRadius, curSqrDistance, contactMask);

		Simd4i bothMask = splat<3>(curAuxiliary);

		// instead of culling continuous collision for ~collisionMask, and discrete
		// collision for ~contactMask, disable both if ~collisionMask & ~contactMask
		Simd4i cullMask = bothMask & ~simd4i(collisionMask | contactMask);
		shapeMask.mSpheres = shapeMask.mSpheres & ~cullMask;

		if(!anyContact)
			continue;

		Simd4f invDistance = rsqrt(curSqrDistance) & (curSqrDistance > gSimd4fZero);
		Simd4f base = curDot + curSlope * curSqrDistance * invDistance;

		Simd4f halfLength = splat<1>(simd4f(curAuxiliary));
		Simd4i leftMask = simd4i(base < -halfLength);
		Simd4i rightMask = simd4i(base > halfLength);

		// can only skip continuous sphere collision if post-ccd position
		// is on cone side *and* particle had cone-ccd collision.
		Simd4i firstMask = splat<2>(curAuxiliary);
		Simd4i secondMask = firstMask ^ bothMask;
		cullMask = (firstMask & ~leftMask) | (secondMask & ~rightMask);
		shapeMask.mSpheres = shapeMask.mSpheres & ~(cullMask & simd4i(collisionMask));

		// push-out direction: perpendicular offset from the cone axis
		Simd4f deltaX = curX - base * curAxisX;
		Simd4f deltaY = curY - base * curAxisY;
		Simd4f deltaZ = curZ - base * curAxisZ;

		Simd4f sqrCosine = splat<0>(simd4f(curAuxiliary));
		Simd4f scale = curRadius * invDistance * sqrCosine - sqrCosine;

		contactMask = contactMask & ~simd4f(leftMask | rightMask);

		if(!anyTrue(contactMask))
			continue;

		accum.add(deltaX, deltaY, deltaZ, scale, contactMask);

		if(frictionEnabled)
		{
			uint32_t s0 = mClothData.mCapsuleIndices[coneIndex].first;
			uint32_t s1 = mClothData.mCapsuleIndices[coneIndex].second;

			float* prevSpheres = reinterpret_cast<float*>(mPrevData.mSpheres);
			float* curSpheres = reinterpret_cast<float*>(mCurData.mSpheres);

			// todo: could pre-compute sphere velocities or it might be
			// faster to compute cur/prev sphere positions directly
			Simd4f s0p0 = loadAligned(prevSpheres, s0 * sizeof(SphereData));
			Simd4f s0p1 = loadAligned(curSpheres, s0 * sizeof(SphereData));

			Simd4f s1p0 = loadAligned(prevSpheres, s1 * sizeof(SphereData));
			Simd4f s1p1 = loadAligned(curSpheres,
			                          s1 * sizeof(SphereData));

			Simd4f v0 = s0p1 - s0p0;
			Simd4f v1 = s1p1 - s1p0;
			Simd4f vd = v1 - v0;

			// dot is in the range -1 to 1, scale and bias to 0 to 1
			curDot = curDot * gSimd4fHalf + gSimd4fHalf;

			// interpolate velocity at contact points
			Simd4f vx = splat<0>(v0) + curDot * splat<0>(vd);
			Simd4f vy = splat<1>(v0) + curDot * splat<1>(vd);
			Simd4f vz = splat<2>(v0) + curDot * splat<2>(vd);

			accum.addVelocity(vx, vy, vz, contactMask);
		}
	}

	return shapeMask.mSpheres;
}

namespace
{

// compute the tangential friction impulse for 4 particles (SoA) from the
// accumulated collision normal (delta*), the shape velocity (vel*), and the
// particle velocity (curPos - prevPos); output in impulse[0..2]
template <typename Simd4f>
PX_INLINE void calculateFrictionImpulse(const Simd4f& deltaX, const Simd4f& deltaY, const Simd4f& deltaZ,
                                        const Simd4f& velX, const Simd4f& velY, const Simd4f& velZ,
                                        const Simd4f* curPos, const Simd4f* prevPos, const Simd4f& scale,
                                        const Simd4f& coefficient, const Simd4f& mask, Simd4f* impulse)
{
	// calculate collision normal
	Simd4f deltaSq = deltaX * deltaX + deltaY * deltaY + deltaZ * deltaZ;

	Simd4f rcpDelta = rsqrt(deltaSq + gSimd4fEpsilon);

	Simd4f nx = deltaX * rcpDelta;
	Simd4f ny = deltaY * rcpDelta;
	Simd4f nz = deltaZ * rcpDelta;

	// calculate relative velocity scaled by number of collisions
	Simd4f rvx = curPos[0] - prevPos[0] - velX * scale;
	Simd4f rvy = curPos[1] - prevPos[1] - velY * scale;
	Simd4f rvz = curPos[2] - prevPos[2] - velZ * scale;

	// calculate magnitude of relative normal velocity
	Simd4f rvn = rvx * nx + rvy * ny + rvz * nz;

	// calculate relative tangential velocity
	Simd4f rvtx = rvx - rvn * nx;
	Simd4f rvty = rvy - rvn * ny;
	Simd4f rvtz = rvz - rvn * nz;

	// calculate magnitude of vt
	Simd4f rcpVt = rsqrt(rvtx * rvtx + rvty * rvty + rvtz * rvtz + gSimd4fEpsilon);

	// magnitude of friction impulse (cannot be greater than -vt)
	Simd4f j = max(-coefficient * deltaSq * rcpDelta * rcpVt, gSimd4fMinusOne) & mask;

	impulse[0] = rvtx * j;
	impulse[1] = rvty * j;
	impulse[2] = rvtz * j;
}

} // anonymous namespace

// discrete particle collision: processes particles 4 at a time in SoA form,
// applies averaged impulses, optional friction (moves prev positions) and
// optional collision mass scaling (scales inverse mass in w)
template <typename Simd4f>
void cloth::SwCollision<Simd4f>::collideParticles()
{
	const bool massScalingEnabled = mClothData.mCollisionMassScale > 0.0f;
	const Simd4f massScale = simd4f(mClothData.mCollisionMassScale);

	const bool frictionEnabled = mClothData.mFrictionScale > 0.0f;
	const Simd4f frictionScale = simd4f(mClothData.mFrictionScale);

	Simd4f curPos[4];
	Simd4f prevPos[4];

	float* __restrict prevIt = mClothData.mPrevParticles;
	float* __restrict pIt = mClothData.mCurParticles;
	float* __restrict pEnd = pIt + mClothData.mNumParticles * 4;
	for(; pIt < pEnd; pIt += 16, prevIt += 16)
	{
		// load 4 xyzw particles and transpose to SoA (x4, y4, z4, w4)
		curPos[0] = loadAligned(pIt, 0);
		curPos[1] = loadAligned(pIt, 16);
		curPos[2] = loadAligned(pIt, 32);
		curPos[3] = loadAligned(pIt, 48);
		transpose(curPos[0], curPos[1], curPos[2], curPos[3]);

		ImpulseAccumulator accum;
		Simd4i sphereMask = collideCones(curPos, accum);
		collideSpheres(sphereMask, curPos, accum);

		// mNumCollisions was seeded with epsilon; greater-than means real contacts
		Simd4f mask;
		if(!anyGreater(accum.mNumCollisions, gSimd4fEpsilon, mask))
			continue;

		Simd4f invNumCollisions = recip(accum.mNumCollisions);

		if(frictionEnabled)
		{
			prevPos[0] = loadAligned(prevIt, 0);
			prevPos[1] = loadAligned(prevIt, 16);
			prevPos[2] = loadAligned(prevIt, 32);
			prevPos[3] = loadAligned(prevIt, 48);
			transpose(prevPos[0], prevPos[1], prevPos[2], prevPos[3]);

			Simd4f frictionImpulse[3];
			calculateFrictionImpulse(accum.mDeltaX, accum.mDeltaY, accum.mDeltaZ, accum.mVelX, accum.mVelY, accum.mVelZ,
			                         curPos, prevPos, invNumCollisions, frictionScale, mask, frictionImpulse);

			// friction is applied by shifting the previous position (changes velocity)
			prevPos[0] = prevPos[0] - frictionImpulse[0];
			prevPos[1] = prevPos[1] - frictionImpulse[1];
			prevPos[2] = prevPos[2] - frictionImpulse[2];

			transpose(prevPos[0], prevPos[1], prevPos[2], prevPos[3]);
			storeAligned(prevIt, 0, prevPos[0]);
			storeAligned(prevIt, 16, prevPos[1]);
			storeAligned(prevIt, 32, prevPos[2]);
			storeAligned(prevIt, 48, prevPos[3]);
		}

		if(massScalingEnabled)
		{
			// calculate the inverse mass scale based on the collision
impulse magnitude + Simd4f dSq = invNumCollisions * invNumCollisions * + (accum.mDeltaX * accum.mDeltaX + accum.mDeltaY * accum.mDeltaY + accum.mDeltaZ * accum.mDeltaZ); + + Simd4f scale = recip(gSimd4fOne + massScale * dSq); + + // scale invmass + curPos[3] = select(mask, curPos[3] * scale, curPos[3]); + } + + curPos[0] = curPos[0] + accum.mDeltaX * invNumCollisions; + curPos[1] = curPos[1] + accum.mDeltaY * invNumCollisions; + curPos[2] = curPos[2] + accum.mDeltaZ * invNumCollisions; + + transpose(curPos[0], curPos[1], curPos[2], curPos[3]); + storeAligned(pIt, 0, curPos[0]); + storeAligned(pIt, 16, curPos[1]); + storeAligned(pIt, 32, curPos[2]); + storeAligned(pIt, 48, curPos[3]); + +#if PX_PROFILE || PX_DEBUG + mNumCollisions += horizontalSum(accum.mNumCollisions); +#endif + } +} + +template <typename Simd4f> +void cloth::SwCollision<Simd4f>::collideVirtualParticles() +{ + const bool massScalingEnabled = mClothData.mCollisionMassScale > 0.0f; + const Simd4f massScale = simd4f(mClothData.mCollisionMassScale); + + const bool frictionEnabled = mClothData.mFrictionScale > 0.0f; + const Simd4f frictionScale = simd4f(mClothData.mFrictionScale); + + Simd4f curPos[3]; + + const float* __restrict weights = mClothData.mVirtualParticleWeights; + float* __restrict particles = mClothData.mCurParticles; + float* __restrict prevParticles = mClothData.mPrevParticles; + + // move dummy particles outside of collision range + Simd4f* __restrict dummy = mClothData.mNumParticles + reinterpret_cast<Simd4f*>(mClothData.mCurParticles); + Simd4f invGridScale = recip(mGridScale) & (mGridScale > gSimd4fEpsilon); + dummy[0] = dummy[1] = dummy[2] = invGridScale * mGridBias - invGridScale; + + const uint16_t* __restrict vpIt = mClothData.mVirtualParticlesBegin; + const uint16_t* __restrict vpEnd = mClothData.mVirtualParticlesEnd; + for(; vpIt != vpEnd; vpIt += 16) + { + // load 12 particles and 4 weights + Simd4f p0v0 = loadAligned(particles, vpIt[0] * sizeof(PxVec4)); + Simd4f p0v1 = 
loadAligned(particles, vpIt[1] * sizeof(PxVec4)); + Simd4f p0v2 = loadAligned(particles, vpIt[2] * sizeof(PxVec4)); + Simd4f w0 = loadAligned(weights, vpIt[3] * sizeof(PxVec4)); + + Simd4f p1v0 = loadAligned(particles, vpIt[4] * sizeof(PxVec4)); + Simd4f p1v1 = loadAligned(particles, vpIt[5] * sizeof(PxVec4)); + Simd4f p1v2 = loadAligned(particles, vpIt[6] * sizeof(PxVec4)); + Simd4f w1 = loadAligned(weights, vpIt[7] * sizeof(PxVec4)); + + Simd4f p2v0 = loadAligned(particles, vpIt[8] * sizeof(PxVec4)); + Simd4f p2v1 = loadAligned(particles, vpIt[9] * sizeof(PxVec4)); + Simd4f p2v2 = loadAligned(particles, vpIt[10] * sizeof(PxVec4)); + Simd4f w2 = loadAligned(weights, vpIt[11] * sizeof(PxVec4)); + + Simd4f p3v1 = loadAligned(particles, vpIt[13] * sizeof(PxVec4)); + Simd4f p3v0 = loadAligned(particles, vpIt[12] * sizeof(PxVec4)); + Simd4f p3v2 = loadAligned(particles, vpIt[14] * sizeof(PxVec4)); + Simd4f w3 = loadAligned(weights, vpIt[15] * sizeof(PxVec4)); + + // interpolate particles and transpose + Simd4f px = p0v0 * splat<0>(w0) + p0v1 * splat<1>(w0) + p0v2 * splat<2>(w0); + Simd4f py = p1v0 * splat<0>(w1) + p1v1 * splat<1>(w1) + p1v2 * splat<2>(w1); + Simd4f pz = p2v0 * splat<0>(w2) + p2v1 * splat<1>(w2) + p2v2 * splat<2>(w2); + Simd4f pw = p3v0 * splat<0>(w3) + p3v1 * splat<1>(w3) + p3v2 * splat<2>(w3); + transpose(px, py, pz, pw); + + curPos[0] = px; + curPos[1] = py; + curPos[2] = pz; + + ImpulseAccumulator accum; + Simd4i sphereMask = collideCones(curPos, accum); + collideSpheres(sphereMask, curPos, accum); + + Simd4f mask; + if(!anyGreater(accum.mNumCollisions, gSimd4fEpsilon, mask)) + continue; + + Simd4f invNumCollisions = recip(accum.mNumCollisions); + + // displacement and transpose back + Simd4f d0 = accum.mDeltaX * invNumCollisions; + Simd4f d1 = accum.mDeltaY * invNumCollisions; + Simd4f d2 = accum.mDeltaZ * invNumCollisions; + Simd4f d3 = gSimd4fZero; + transpose(d0, d1, d2, d3); + + // scale weights by 1/dot(w,w) + Simd4f rw0 = w0 * splat<3>(w0); + 
Simd4f rw1 = w1 * splat<3>(w1); + Simd4f rw2 = w2 * splat<3>(w2); + Simd4f rw3 = w3 * splat<3>(w3); + + if(frictionEnabled) + { + Simd4f q0v0 = loadAligned(prevParticles, vpIt[0] * sizeof(PxVec4)); + Simd4f q0v1 = loadAligned(prevParticles, vpIt[1] * sizeof(PxVec4)); + Simd4f q0v2 = loadAligned(prevParticles, vpIt[2] * sizeof(PxVec4)); + + Simd4f q1v0 = loadAligned(prevParticles, vpIt[4] * sizeof(PxVec4)); + Simd4f q1v1 = loadAligned(prevParticles, vpIt[5] * sizeof(PxVec4)); + Simd4f q1v2 = loadAligned(prevParticles, vpIt[6] * sizeof(PxVec4)); + + Simd4f q2v0 = loadAligned(prevParticles, vpIt[8] * sizeof(PxVec4)); + Simd4f q2v1 = loadAligned(prevParticles, vpIt[9] * sizeof(PxVec4)); + Simd4f q2v2 = loadAligned(prevParticles, vpIt[10] * sizeof(PxVec4)); + + Simd4f q3v0 = loadAligned(prevParticles, vpIt[12] * sizeof(PxVec4)); + Simd4f q3v1 = loadAligned(prevParticles, vpIt[13] * sizeof(PxVec4)); + Simd4f q3v2 = loadAligned(prevParticles, vpIt[14] * sizeof(PxVec4)); + + // calculate previous interpolated positions + Simd4f qx = q0v0 * splat<0>(w0) + q0v1 * splat<1>(w0) + q0v2 * splat<2>(w0); + Simd4f qy = q1v0 * splat<0>(w1) + q1v1 * splat<1>(w1) + q1v2 * splat<2>(w1); + Simd4f qz = q2v0 * splat<0>(w2) + q2v1 * splat<1>(w2) + q2v2 * splat<2>(w2); + Simd4f qw = q3v0 * splat<0>(w3) + q3v1 * splat<1>(w3) + q3v2 * splat<2>(w3); + transpose(qx, qy, qz, qw); + + Simd4f prevPos[3] = { qx, qy, qz }; + Simd4f frictionImpulse[4]; + frictionImpulse[3] = gSimd4fZero; + + calculateFrictionImpulse(accum.mDeltaX, accum.mDeltaY, accum.mDeltaZ, accum.mVelX, accum.mVelY, accum.mVelZ, + curPos, prevPos, invNumCollisions, frictionScale, mask, frictionImpulse); + + transpose(frictionImpulse[0], frictionImpulse[1], frictionImpulse[2], frictionImpulse[3]); + + q0v0 = q0v0 - (splat<0>(rw0) * frictionImpulse[0]); + q0v1 = q0v1 - (splat<1>(rw0) * frictionImpulse[0]); + q0v2 = q0v2 - (splat<2>(rw0) * frictionImpulse[0]); + + q1v0 = q1v0 - (splat<0>(rw1) * frictionImpulse[1]); + q1v1 = q1v1 - 
(splat<1>(rw1) * frictionImpulse[1]); + q1v2 = q1v2 - (splat<2>(rw1) * frictionImpulse[1]); + + q2v0 = q2v0 - (splat<0>(rw2) * frictionImpulse[2]); + q2v1 = q2v1 - (splat<1>(rw2) * frictionImpulse[2]); + q2v2 = q2v2 - (splat<2>(rw2) * frictionImpulse[2]); + + q3v0 = q3v0 - (splat<0>(rw3) * frictionImpulse[3]); + q3v1 = q3v1 - (splat<1>(rw3) * frictionImpulse[3]); + q3v2 = q3v2 - (splat<2>(rw3) * frictionImpulse[3]); + + // write back prev particles + storeAligned(prevParticles, vpIt[0] * sizeof(PxVec4), q0v0); + storeAligned(prevParticles, vpIt[1] * sizeof(PxVec4), q0v1); + storeAligned(prevParticles, vpIt[2] * sizeof(PxVec4), q0v2); + + storeAligned(prevParticles, vpIt[4] * sizeof(PxVec4), q1v0); + storeAligned(prevParticles, vpIt[5] * sizeof(PxVec4), q1v1); + storeAligned(prevParticles, vpIt[6] * sizeof(PxVec4), q1v2); + + storeAligned(prevParticles, vpIt[8] * sizeof(PxVec4), q2v0); + storeAligned(prevParticles, vpIt[9] * sizeof(PxVec4), q2v1); + storeAligned(prevParticles, vpIt[10] * sizeof(PxVec4), q2v2); + + storeAligned(prevParticles, vpIt[12] * sizeof(PxVec4), q3v0); + storeAligned(prevParticles, vpIt[13] * sizeof(PxVec4), q3v1); + storeAligned(prevParticles, vpIt[14] * sizeof(PxVec4), q3v2); + } + + if(massScalingEnabled) + { + // calculate the inverse mass scale based on the collision impulse + Simd4f dSq = invNumCollisions * invNumCollisions * + (accum.mDeltaX * accum.mDeltaX + accum.mDeltaY * accum.mDeltaY + accum.mDeltaZ * accum.mDeltaZ); + + Simd4f weightScale = recip(gSimd4fOne + massScale * dSq); + + weightScale = weightScale - gSimd4fOne; + Simd4f s0 = gSimd4fOne + splat<0>(weightScale) * (w0 & splat<0>(mask)); + Simd4f s1 = gSimd4fOne + splat<1>(weightScale) * (w1 & splat<1>(mask)); + Simd4f s2 = gSimd4fOne + splat<2>(weightScale) * (w2 & splat<2>(mask)); + Simd4f s3 = gSimd4fOne + splat<3>(weightScale) * (w3 & splat<3>(mask)); + + p0v0 = p0v0 * (gSimd4fOneXYZ | (splat<0>(s0) & sMaskW)); + p0v1 = p0v1 * (gSimd4fOneXYZ | (splat<1>(s0) & sMaskW)); + 
p0v2 = p0v2 * (gSimd4fOneXYZ | (splat<2>(s0) & sMaskW)); + + p1v0 = p1v0 * (gSimd4fOneXYZ | (splat<0>(s1) & sMaskW)); + p1v1 = p1v1 * (gSimd4fOneXYZ | (splat<1>(s1) & sMaskW)); + p1v2 = p1v2 * (gSimd4fOneXYZ | (splat<2>(s1) & sMaskW)); + + p2v0 = p2v0 * (gSimd4fOneXYZ | (splat<0>(s2) & sMaskW)); + p2v1 = p2v1 * (gSimd4fOneXYZ | (splat<1>(s2) & sMaskW)); + p2v2 = p2v2 * (gSimd4fOneXYZ | (splat<2>(s2) & sMaskW)); + + p3v0 = p3v0 * (gSimd4fOneXYZ | (splat<0>(s3) & sMaskW)); + p3v1 = p3v1 * (gSimd4fOneXYZ | (splat<1>(s3) & sMaskW)); + p3v2 = p3v2 * (gSimd4fOneXYZ | (splat<2>(s3) & sMaskW)); + } + + p0v0 = p0v0 + (splat<0>(rw0) * d0); + p0v1 = p0v1 + (splat<1>(rw0) * d0); + p0v2 = p0v2 + (splat<2>(rw0) * d0); + + p1v0 = p1v0 + (splat<0>(rw1) * d1); + p1v1 = p1v1 + (splat<1>(rw1) * d1); + p1v2 = p1v2 + (splat<2>(rw1) * d1); + + p2v0 = p2v0 + (splat<0>(rw2) * d2); + p2v1 = p2v1 + (splat<1>(rw2) * d2); + p2v2 = p2v2 + (splat<2>(rw2) * d2); + + p3v0 = p3v0 + (splat<0>(rw3) * d3); + p3v1 = p3v1 + (splat<1>(rw3) * d3); + p3v2 = p3v2 + (splat<2>(rw3) * d3); + + // write back particles + storeAligned(particles, vpIt[0] * sizeof(PxVec4), p0v0); + storeAligned(particles, vpIt[1] * sizeof(PxVec4), p0v1); + storeAligned(particles, vpIt[2] * sizeof(PxVec4), p0v2); + + storeAligned(particles, vpIt[4] * sizeof(PxVec4), p1v0); + storeAligned(particles, vpIt[5] * sizeof(PxVec4), p1v1); + storeAligned(particles, vpIt[6] * sizeof(PxVec4), p1v2); + + storeAligned(particles, vpIt[8] * sizeof(PxVec4), p2v0); + storeAligned(particles, vpIt[9] * sizeof(PxVec4), p2v1); + storeAligned(particles, vpIt[10] * sizeof(PxVec4), p2v2); + + storeAligned(particles, vpIt[12] * sizeof(PxVec4), p3v0); + storeAligned(particles, vpIt[13] * sizeof(PxVec4), p3v1); + storeAligned(particles, vpIt[14] * sizeof(PxVec4), p3v2); + +#if PX_PROFILE || PX_DEBUG + mNumCollisions += horizontalSum(accum.mNumCollisions); +#endif + } +} + +template <typename Simd4f> +void 
cloth::SwCollision<Simd4f>::collideContinuousParticles() +{ + Simd4f curPos[4]; + Simd4f prevPos[4]; + + const bool massScalingEnabled = mClothData.mCollisionMassScale > 0.0f; + const Simd4f massScale = simd4f(mClothData.mCollisionMassScale); + + const bool frictionEnabled = mClothData.mFrictionScale > 0.0f; + const Simd4f frictionScale = simd4f(mClothData.mFrictionScale); + + float* __restrict prevIt = mClothData.mPrevParticles; + float* __restrict curIt = mClothData.mCurParticles; + float* __restrict curEnd = curIt + mClothData.mNumParticles * 4; + + for(; curIt < curEnd; curIt += 16, prevIt += 16) + { + prevPos[0] = loadAligned(prevIt, 0); + prevPos[1] = loadAligned(prevIt, 16); + prevPos[2] = loadAligned(prevIt, 32); + prevPos[3] = loadAligned(prevIt, 48); + transpose(prevPos[0], prevPos[1], prevPos[2], prevPos[3]); + + curPos[0] = loadAligned(curIt, 0); + curPos[1] = loadAligned(curIt, 16); + curPos[2] = loadAligned(curIt, 32); + curPos[3] = loadAligned(curIt, 48); + transpose(curPos[0], curPos[1], curPos[2], curPos[3]); + + ImpulseAccumulator accum; + Simd4i sphereMask = collideCones(prevPos, curPos, accum); + collideSpheres(sphereMask, prevPos, curPos, accum); + + Simd4f mask; + if(!anyGreater(accum.mNumCollisions, gSimd4fEpsilon, mask)) + continue; + + Simd4f invNumCollisions = recip(accum.mNumCollisions); + + if(frictionEnabled) + { + Simd4f frictionImpulse[3]; + calculateFrictionImpulse(accum.mDeltaX, accum.mDeltaY, accum.mDeltaZ, accum.mVelX, accum.mVelY, accum.mVelZ, + curPos, prevPos, invNumCollisions, frictionScale, mask, frictionImpulse); + + prevPos[0] = prevPos[0] - frictionImpulse[0]; + prevPos[1] = prevPos[1] - frictionImpulse[1]; + prevPos[2] = prevPos[2] - frictionImpulse[2]; + + transpose(prevPos[0], prevPos[1], prevPos[2], prevPos[3]); + storeAligned(prevIt, 0, prevPos[0]); + storeAligned(prevIt, 16, prevPos[1]); + storeAligned(prevIt, 32, prevPos[2]); + storeAligned(prevIt, 48, prevPos[3]); + } + + if(massScalingEnabled) + { + // calculate 
the inverse mass scale based on the collision impulse magnitude + Simd4f dSq = invNumCollisions * invNumCollisions * + (accum.mDeltaX * accum.mDeltaX + accum.mDeltaY * accum.mDeltaY + accum.mDeltaZ * accum.mDeltaZ); + + Simd4f weightScale = recip(gSimd4fOne + massScale * dSq); + + // scale invmass + curPos[3] = select(mask, curPos[3] * weightScale, curPos[3]); + } + + curPos[0] = curPos[0] + accum.mDeltaX * invNumCollisions; + curPos[1] = curPos[1] + accum.mDeltaY * invNumCollisions; + curPos[2] = curPos[2] + accum.mDeltaZ * invNumCollisions; + + transpose(curPos[0], curPos[1], curPos[2], curPos[3]); + storeAligned(curIt, 0, curPos[0]); + storeAligned(curIt, 16, curPos[1]); + storeAligned(curIt, 32, curPos[2]); + storeAligned(curIt, 48, curPos[3]); + +#if PX_PROFILE || PX_DEBUG + mNumCollisions += horizontalSum(accum.mNumCollisions); +#endif + } +} + +template <typename Simd4f> +void cloth::SwCollision<Simd4f>::collideConvexes(const IterationState<Simd4f>& state) +{ + if(!mClothData.mNumConvexes) + return; + + // times 2 for plane equation result buffer + Simd4f* planes = static_cast<Simd4f*>(mAllocator.allocate(sizeof(Simd4f) * mClothData.mNumPlanes * 2)); + + const Simd4f* targetPlanes = reinterpret_cast<const Simd4f*>(mClothData.mTargetCollisionPlanes); + + // generate plane collision data + if(state.mRemainingIterations != 1) + { + // interpolate planes + LerpIterator<Simd4f, const Simd4f*> planeIter(reinterpret_cast<const Simd4f*>(mClothData.mStartCollisionPlanes), + targetPlanes, state.getCurrentAlpha()); + + // todo: normalize plane equations + generatePlanes(planes, planeIter, mClothData.mNumPlanes); + } + else + { + // otherwise use the target planes directly + generatePlanes(planes, targetPlanes, mClothData.mNumPlanes); + } + + Simd4f curPos[4], prevPos[4]; + + const bool frictionEnabled = mClothData.mFrictionScale > 0.0f; + const Simd4f frictionScale = simd4f(mClothData.mFrictionScale); + + float* __restrict curIt = mClothData.mCurParticles; + float* 
__restrict curEnd = curIt + mClothData.mNumParticles * 4; + float* __restrict prevIt = mClothData.mPrevParticles; + for(; curIt < curEnd; curIt += 16, prevIt += 16) + { + curPos[0] = loadAligned(curIt, 0); + curPos[1] = loadAligned(curIt, 16); + curPos[2] = loadAligned(curIt, 32); + curPos[3] = loadAligned(curIt, 48); + transpose(curPos[0], curPos[1], curPos[2], curPos[3]); + + ImpulseAccumulator accum; + collideConvexes(planes, curPos, accum); + + Simd4f mask; + if(!anyGreater(accum.mNumCollisions, gSimd4fEpsilon, mask)) + continue; + + Simd4f invNumCollisions = recip(accum.mNumCollisions); + + if(frictionEnabled) + { + prevPos[0] = loadAligned(prevIt, 0); + prevPos[1] = loadAligned(prevIt, 16); + prevPos[2] = loadAligned(prevIt, 32); + prevPos[3] = loadAligned(prevIt, 48); + transpose(prevPos[0], prevPos[1], prevPos[2], prevPos[3]); + + Simd4f frictionImpulse[3]; + calculateFrictionImpulse(accum.mDeltaX, accum.mDeltaY, accum.mDeltaZ, accum.mVelX, accum.mVelY, accum.mVelZ, + curPos, prevPos, invNumCollisions, frictionScale, mask, frictionImpulse); + + prevPos[0] = prevPos[0] - frictionImpulse[0]; + prevPos[1] = prevPos[1] - frictionImpulse[1]; + prevPos[2] = prevPos[2] - frictionImpulse[2]; + + transpose(prevPos[0], prevPos[1], prevPos[2], prevPos[3]); + storeAligned(prevIt, 0, prevPos[0]); + storeAligned(prevIt, 16, prevPos[1]); + storeAligned(prevIt, 32, prevPos[2]); + storeAligned(prevIt, 48, prevPos[3]); + } + + curPos[0] = curPos[0] + accum.mDeltaX * invNumCollisions; + curPos[1] = curPos[1] + accum.mDeltaY * invNumCollisions; + curPos[2] = curPos[2] + accum.mDeltaZ * invNumCollisions; + + transpose(curPos[0], curPos[1], curPos[2], curPos[3]); + storeAligned(curIt, 0, curPos[0]); + storeAligned(curIt, 16, curPos[1]); + storeAligned(curIt, 32, curPos[2]); + storeAligned(curIt, 48, curPos[3]); + +#if PX_PROFILE || PX_DEBUG + mNumCollisions += horizontalSum(accum.mNumCollisions); +#endif + } + + mAllocator.deallocate(planes); +} + +template <typename Simd4f> 
+void cloth::SwCollision<Simd4f>::collideConvexes(const Simd4f* __restrict planes, Simd4f* __restrict curPos, + ImpulseAccumulator& accum) +{ + Simd4i result = gSimd4iZero; + Simd4i mask4 = gSimd4iOne; + + const Simd4f* __restrict pIt, *pEnd = planes + mClothData.mNumPlanes; + Simd4f* __restrict dIt = const_cast<Simd4f*>(pEnd); + for(pIt = planes; pIt != pEnd; ++pIt, ++dIt) + { + *dIt = splat<3>(*pIt) + curPos[2] * splat<2>(*pIt) + curPos[1] * splat<1>(*pIt) + curPos[0] * splat<0>(*pIt); + result = result | (mask4 & simd4i(*dIt < gSimd4fZero)); + mask4 = mask4 << 1; // todo: shift by Simd4i on consoles + } + + if(allEqual(result, gSimd4iZero)) + return; + + const uint32_t* __restrict cIt = mClothData.mConvexMasks; + const uint32_t* __restrict cEnd = cIt + mClothData.mNumConvexes; + for(; cIt != cEnd; ++cIt) + { + uint32_t mask = *cIt; + mask4 = simd4i(int(mask)); + if(!anyEqual(mask4 & result, mask4, mask4)) + continue; + + uint32_t test = mask - 1; + uint32_t planeIndex = findBitSet(mask & ~test); + Simd4f plane = planes[planeIndex]; + Simd4f planeX = splat<0>(plane); + Simd4f planeY = splat<1>(plane); + Simd4f planeZ = splat<2>(plane); + Simd4f planeD = pEnd[planeIndex]; + while(mask &= test) + { + test = mask - 1; + planeIndex = findBitSet(mask & ~test); + plane = planes[planeIndex]; + Simd4f dist = pEnd[planeIndex]; + Simd4f closer = dist > planeD; + planeX = select(closer, splat<0>(plane), planeX); + planeY = select(closer, splat<1>(plane), planeY); + planeZ = select(closer, splat<2>(plane), planeZ); + planeD = max(dist, planeD); + } + + accum.subtract(planeX, planeY, planeZ, planeD, simd4f(mask4)); + } +} + +template <typename Simd4f> +void cloth::SwCollision<Simd4f>::collideTriangles(const IterationState<Simd4f>& state) +{ + if(!mClothData.mNumCollisionTriangles) + return; + + TriangleData* triangles = + static_cast<TriangleData*>(mAllocator.allocate(sizeof(TriangleData) * mClothData.mNumCollisionTriangles)); + + UnalignedIterator<Simd4f, 3> 
targetTriangles(mClothData.mTargetCollisionTriangles); + + // generate triangle collision data + if(state.mRemainingIterations != 1) + { + // interpolate triangles + LerpIterator<Simd4f, UnalignedIterator<Simd4f, 3> > triangleIter(mClothData.mStartCollisionTriangles, + targetTriangles, state.getCurrentAlpha()); + + generateTriangles<Simd4f>(triangles, triangleIter, mClothData.mNumCollisionTriangles); + } + else + { + // otherwise use the target triangles directly + generateTriangles<Simd4f>(triangles, targetTriangles, mClothData.mNumCollisionTriangles); + } + + Simd4f positions[4]; + + float* __restrict pIt = mClothData.mCurParticles; + float* __restrict pEnd = pIt + mClothData.mNumParticles * 4; + for(; pIt < pEnd; pIt += 16) + { + positions[0] = loadAligned(pIt, 0); + positions[1] = loadAligned(pIt, 16); + positions[2] = loadAligned(pIt, 32); + positions[3] = loadAligned(pIt, 48); + transpose(positions[0], positions[1], positions[2], positions[3]); + + ImpulseAccumulator accum; + collideTriangles(triangles, positions, accum); + + Simd4f mask; + if(!anyGreater(accum.mNumCollisions, gSimd4fEpsilon, mask)) + continue; + + Simd4f invNumCollisions = recip(accum.mNumCollisions); + + positions[0] = positions[0] + accum.mDeltaX * invNumCollisions; + positions[1] = positions[1] + accum.mDeltaY * invNumCollisions; + positions[2] = positions[2] + accum.mDeltaZ * invNumCollisions; + + transpose(positions[0], positions[1], positions[2], positions[3]); + storeAligned(pIt, 0, positions[0]); + storeAligned(pIt, 16, positions[1]); + storeAligned(pIt, 32, positions[2]); + storeAligned(pIt, 48, positions[3]); + +#if PX_PROFILE || PX_DEBUG + mNumCollisions += horizontalSum(accum.mNumCollisions); +#endif + } + + mAllocator.deallocate(triangles); +} + +template <typename Simd4f> +void cloth::SwCollision<Simd4f>::collideTriangles(const TriangleData* __restrict triangles, Simd4f* __restrict curPos, + ImpulseAccumulator& accum) +{ + Simd4f normalX, normalY, normalZ, normalD; + normalX = 
normalY = normalZ = normalD = gSimd4fZero; + Simd4f minSqrLength = gSimd4fFloatMax; + + const TriangleData* __restrict tIt, *tEnd = triangles + mClothData.mNumCollisionTriangles; + for(tIt = triangles; tIt != tEnd; ++tIt) + { + Simd4f base = loadAligned(&tIt->base.x); + Simd4f edge0 = loadAligned(&tIt->edge0.x); + Simd4f edge1 = loadAligned(&tIt->edge1.x); + Simd4f normal = loadAligned(&tIt->normal.x); + Simd4f aux = loadAligned(&tIt->det); + + Simd4f dx = curPos[0] - splat<0>(base); + Simd4f dy = curPos[1] - splat<1>(base); + Simd4f dz = curPos[2] - splat<2>(base); + + Simd4f e0x = splat<0>(edge0); + Simd4f e0y = splat<1>(edge0); + Simd4f e0z = splat<2>(edge0); + + Simd4f e1x = splat<0>(edge1); + Simd4f e1y = splat<1>(edge1); + Simd4f e1z = splat<2>(edge1); + + Simd4f nx = splat<0>(normal); + Simd4f ny = splat<1>(normal); + Simd4f nz = splat<2>(normal); + + Simd4f deltaDotEdge0 = dx * e0x + dy * e0y + dz * e0z; + Simd4f deltaDotEdge1 = dx * e1x + dy * e1y + dz * e1z; + Simd4f deltaDotNormal = dx * nx + dy * ny + dz * nz; + + Simd4f edge0DotEdge1 = splat<3>(base); + Simd4f edge0SqrLength = splat<3>(edge0); + Simd4f edge1SqrLength = splat<3>(edge1); + + Simd4f s = edge1SqrLength * deltaDotEdge0 - edge0DotEdge1 * deltaDotEdge1; + Simd4f t = edge0SqrLength * deltaDotEdge1 - edge0DotEdge1 * deltaDotEdge0; + + Simd4f sPositive = s > gSimd4fZero; + Simd4f tPositive = t > gSimd4fZero; + + Simd4f det = splat<0>(aux); + + s = select(tPositive, s * det, deltaDotEdge0 * splat<2>(aux)); + t = select(sPositive, t * det, deltaDotEdge1 * splat<3>(aux)); + + Simd4f clamp = gSimd4fOne < s + t; + Simd4f numerator = edge1SqrLength - edge0DotEdge1 + deltaDotEdge0 - deltaDotEdge1; + + s = select(clamp, numerator * splat<1>(aux), s); + + s = max(gSimd4fZero, min(gSimd4fOne, s)); + t = max(gSimd4fZero, min(gSimd4fOne - s, t)); + + dx = dx - e0x * s - e1x * t; + dy = dy - e0y * s - e1y * t; + dz = dz - e0z * s - e1z * t; + + Simd4f sqrLength = dx * dx + dy * dy + dz * dz; + + // slightly 
increase distance for colliding triangles + Simd4f slack = (gSimd4fZero > deltaDotNormal) & simd4f(1e-4f); + sqrLength = sqrLength + sqrLength * slack; + + Simd4f mask = sqrLength < minSqrLength; + + normalX = select(mask, nx, normalX); + normalY = select(mask, ny, normalY); + normalZ = select(mask, nz, normalZ); + normalD = select(mask, deltaDotNormal, normalD); + + minSqrLength = min(sqrLength, minSqrLength); + } + + Simd4f mask; + if(!anyGreater(gSimd4fZero, normalD, mask)) + return; + + accum.subtract(normalX, normalY, normalZ, normalD, mask); +} + +// explicit template instantiation +#if NV_SIMD_SIMD +template class cloth::SwCollision<Simd4f>; +#endif +#if NV_SIMD_SCALAR +template class cloth::SwCollision<Scalar4f>; +#endif + +/* +namespace +{ + using namespace cloth; + + int test() + { + Simd4f vertices[] = { + simd4f(0.0f, 0.0f, 0.0f, 0.0f), + simd4f(0.1f, 0.0f, 0.0f, 0.0f), + simd4f(0.0f, 0.1f, 0.0f, 0.0f) + }; + TriangleData triangle; + generateTriangles<Simd4f>(&triangle, &*vertices, 1); + + char buffer[1000]; + SwKernelAllocator alloc(buffer, 1000); + + SwClothData* cloth = static_cast<SwClothData*>(malloc(sizeof(SwClothData))); + memset(cloth, 0, sizeof(SwClothData)); + cloth->mNumTriangles = 1; + + SwCollision<Simd4f> collision(*cloth, alloc); + SwCollision<Simd4f>::ImpulseAccumulator accum; + + Simd4f particles[4] = {}; + for(float y=-0.1f; y < 0.0f; y += 0.2f) + { + for(float x=-0.1f; x < 0.0f; x += 0.2f) + { + particles[0] = simd4f(x); + particles[1] = simd4f(y); + particles[2] = simd4f(-1.0f); + + collision.collideTriangles(&triangle, particles, accum); + } + } + + return 0; + } + + static int blah = test(); +} +*/ diff --git a/PhysX_3.4/Source/LowLevelCloth/src/SwCollision.h b/PhysX_3.4/Source/LowLevelCloth/src/SwCollision.h new file mode 100644 index 00000000..bda3a57b --- /dev/null +++ b/PhysX_3.4/Source/LowLevelCloth/src/SwCollision.h @@ -0,0 +1,138 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a 
form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 
+ +#pragma once + +#include "Types.h" +#include "StackAllocator.h" +#include "Simd.h" + +namespace physx +{ +namespace cloth +{ + +class SwCloth; +struct SwClothData; +template <typename> +struct IterationState; +struct IndexPair; +struct SphereData; +struct ConeData; +struct TriangleData; + +typedef StackAllocator<16> SwKernelAllocator; + +/** + Collision handler for SwSolver. + */ +template <typename Simd4f> +class SwCollision +{ + typedef typename Simd4fToSimd4i<Simd4f>::Type Simd4i; + + public: + struct ShapeMask + { + Simd4i mCones; + Simd4i mSpheres; + + ShapeMask& operator=(const ShapeMask&); + ShapeMask& operator&=(const ShapeMask&); + }; + + struct CollisionData + { + CollisionData(); + SphereData* mSpheres; + ConeData* mCones; + }; + + struct ImpulseAccumulator; + + public: + SwCollision(SwClothData& clothData, SwKernelAllocator& alloc); + ~SwCollision(); + + void operator()(const IterationState<Simd4f>& state); + + static size_t estimateTemporaryMemory(const SwCloth& cloth); + static size_t estimatePersistentMemory(const SwCloth& cloth); + + private: + SwCollision& operator=(const SwCollision&); // not implemented + void allocate(CollisionData&); + void deallocate(const CollisionData&); + + void computeBounds(); + + void buildSphereAcceleration(const SphereData*); + void buildConeAcceleration(); + static void mergeAcceleration(uint32_t*); + bool buildAcceleration(); + + static ShapeMask getShapeMask(const Simd4f&, const Simd4i*, const Simd4i*); + ShapeMask getShapeMask(const Simd4f*) const; + ShapeMask getShapeMask(const Simd4f*, const Simd4f*) const; + + void collideSpheres(const Simd4i&, const Simd4f*, ImpulseAccumulator&) const; + Simd4i collideCones(const Simd4f*, ImpulseAccumulator&) const; + + void collideSpheres(const Simd4i&, const Simd4f*, Simd4f*, ImpulseAccumulator&) const; + Simd4i collideCones(const Simd4f*, Simd4f*, ImpulseAccumulator&) const; + + void collideParticles(); + void collideVirtualParticles(); + void 
collideContinuousParticles(); + + void collideConvexes(const IterationState<Simd4f>&); + void collideConvexes(const Simd4f*, Simd4f*, ImpulseAccumulator&); + + void collideTriangles(const IterationState<Simd4f>&); + void collideTriangles(const TriangleData*, Simd4f*, ImpulseAccumulator&); + + public: + // acceleration structure + static const uint32_t sGridSize = 8; + Simd4i mSphereGrid[6 * sGridSize / 4]; + Simd4i mConeGrid[6 * sGridSize / 4]; + Simd4f mGridScale, mGridBias; + + CollisionData mPrevData; + CollisionData mCurData; + + SwClothData& mClothData; + SwKernelAllocator& mAllocator; + + uint32_t mNumCollisions; + + static const Simd4f sSkeletonWidth; +}; +} +} diff --git a/PhysX_3.4/Source/LowLevelCloth/src/SwCollisionHelpers.h b/PhysX_3.4/Source/LowLevelCloth/src/SwCollisionHelpers.h new file mode 100644 index 00000000..230685bb --- /dev/null +++ b/PhysX_3.4/Source/LowLevelCloth/src/SwCollisionHelpers.h @@ -0,0 +1,84 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. 
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#pragma once + +#include "Simd.h" + +// platform specific helpers + +namespace physx +{ +namespace cloth +{ + +inline uint32_t findBitSet(uint32_t mask); + +// intFloor(-1.0f) returns -2 on SSE and NEON! 
+inline Simd4i intFloor(const Simd4f& v); + +inline Simd4i horizontalOr(const Simd4i& mask); + +template <typename> +struct Gather; + +#if NV_SIMD_SIMD +template <> +struct Gather<Simd4i> +{ + inline Gather(const Simd4i& index); + inline Simd4i operator()(const Simd4i*) const; + +#if NV_SIMD_SSE2 + Simd4i mSelectQ, mSelectD, mSelectW; + static const Simd4i sIntSignBit; + static const Simd4i sSignedMask; +#elif NV_SIMD_NEON + Simd4i mPermute; + static const Simd4i sPack; + static const Simd4i sOffset; + static const Simd4i sShift; + static const Simd4i sMask; +#endif + Simd4i mOutOfRange; +}; +#endif + +} // namespace cloth +} // namespace physx + +#if NV_SIMD_SSE2 +#include "sse2/SwCollisionHelpers.h" +#elif NV_SIMD_NEON +#include "neon/SwCollisionHelpers.h" +#endif + +#if NV_SIMD_SCALAR +#include "scalar/SwCollisionHelpers.h" +#endif diff --git a/PhysX_3.4/Source/LowLevelCloth/src/SwFabric.cpp b/PhysX_3.4/Source/LowLevelCloth/src/SwFabric.cpp new file mode 100644 index 00000000..aa7f8356 --- /dev/null +++ b/PhysX_3.4/Source/LowLevelCloth/src/SwFabric.cpp @@ -0,0 +1,177 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. 
+// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#include "foundation/PxAssert.h" +#include "SwFabric.h" +#include "SwFactory.h" +#include "PsSort.h" +#include "limits.h" // for USHRT_MAX +#include "PsUtilities.h" + +using namespace physx; +using namespace shdfnd; + +cloth::SwTether::SwTether(uint16_t anchor, float length) : mAnchor(anchor), mLength(length) +{ +} + +cloth::SwFabric::SwFabric(SwFactory& factory, uint32_t numParticles, Range<const uint32_t> phases, + Range<const uint32_t> sets, Range<const float> restvalues, Range<const uint32_t> indices, + Range<const uint32_t> anchors, Range<const float> tetherLengths, + Range<const uint32_t> triangles, uint32_t id) +: mFactory(factory), mNumParticles(numParticles), mTetherLengthScale(1.0f), mId(id) +{ + // should no longer be prefixed with 0 + PX_ASSERT(sets.front() != 0); + +#if PX_WINDOWS + const uint32_t kSimdWidth = 8; // avx +#else + const uint32_t kSimdWidth = 4; +#endif + + // consistency check + PX_ASSERT(sets.back() == restvalues.size()); + PX_ASSERT(restvalues.size() * 2 == indices.size()); + PX_ASSERT(mNumParticles > *maxElement(indices.begin(), 
indices.end())); + PX_ASSERT(mNumParticles + kSimdWidth - 1 <= USHRT_MAX); + + mPhases.assign(phases.begin(), phases.end()); + mSets.reserve(sets.size() + 1); + mSets.pushBack(0); // prefix with 0 + + mOriginalNumRestvalues = uint32_t(restvalues.size()); + + // padd indices for SIMD + const uint32_t* iBegin = indices.begin(), *iIt = iBegin; + const float* rBegin = restvalues.begin(), *rIt = rBegin; + const uint32_t* sIt, *sEnd = sets.end(); + for(sIt = sets.begin(); sIt != sEnd; ++sIt) + { + const float* rEnd = rBegin + *sIt; + const uint32_t* iEnd = iBegin + *sIt * 2; + uint32_t numConstraints = uint32_t(rEnd - rIt); + + for(; rIt != rEnd; ++rIt) + mRestvalues.pushBack(*rIt); + + for(; iIt != iEnd; ++iIt) + mIndices.pushBack(uint16_t(*iIt)); + + // add dummy indices to make multiple of 4 + for(; numConstraints &= kSimdWidth - 1; ++numConstraints) + { + mRestvalues.pushBack(-FLT_MAX); + uint32_t index = mNumParticles + numConstraints - 1; + mIndices.pushBack(uint16_t(index)); + mIndices.pushBack(uint16_t(index)); + } + + mSets.pushBack(uint32_t(mRestvalues.size())); + } + + // trim overallocations + RestvalueContainer(mRestvalues.begin(), mRestvalues.end()).swap(mRestvalues); + Vector<uint16_t>::Type(mIndices.begin(), mIndices.end()).swap(mIndices); + + // tethers + PX_ASSERT(anchors.size() == tetherLengths.size()); + + // pad to allow for direct 16 byte (unaligned) loads + mTethers.reserve(anchors.size() + 2); + for(; !anchors.empty(); anchors.popFront(), tetherLengths.popFront()) + mTethers.pushBack(SwTether(uint16_t(anchors.front()), tetherLengths.front())); + + // triangles + mTriangles.reserve(triangles.size()); + const uint32_t* iEnd = triangles.end(); + for(iIt = triangles.begin(); iIt != iEnd; ++iIt) + mTriangles.pushBack(uint16_t(*iIt)); + + mFactory.mFabrics.pushBack(this); +} + +cloth::SwFabric::~SwFabric() +{ + Vector<SwFabric*>::Type::Iterator fIt = mFactory.mFabrics.find(this); + PX_ASSERT(fIt != mFactory.mFabrics.end()); + 
mFactory.mFabrics.replaceWithLast(fIt); +} + +cloth::Factory& physx::cloth::SwFabric::getFactory() const +{ + return mFactory; +} + +uint32_t cloth::SwFabric::getNumPhases() const +{ + return uint32_t(mPhases.size()); +} + +uint32_t cloth::SwFabric::getNumRestvalues() const +{ + return mOriginalNumRestvalues; +} + +uint32_t cloth::SwFabric::getNumSets() const +{ + return uint32_t(mSets.size() - 1); +} + +uint32_t cloth::SwFabric::getNumIndices() const +{ + return 2 * mOriginalNumRestvalues; +} + +uint32_t cloth::SwFabric::getNumParticles() const +{ + return mNumParticles; +} + +uint32_t physx::cloth::SwFabric::getNumTethers() const +{ + return uint32_t(mTethers.size()); +} + +uint32_t physx::cloth::SwFabric::getNumTriangles() const +{ + return uint32_t(mTriangles.size()) / 3; +} + +void physx::cloth::SwFabric::scaleRestvalues(float scale) +{ + RestvalueContainer::Iterator rIt, rEnd = mRestvalues.end(); + for(rIt = mRestvalues.begin(); rIt != rEnd; ++rIt) + *rIt *= scale; +} + +void physx::cloth::SwFabric::scaleTetherLengths(float scale) +{ + mTetherLengthScale *= scale; +} diff --git a/PhysX_3.4/Source/LowLevelCloth/src/SwFabric.h b/PhysX_3.4/Source/LowLevelCloth/src/SwFabric.h new file mode 100644 index 00000000..b762bcb0 --- /dev/null +++ b/PhysX_3.4/Source/LowLevelCloth/src/SwFabric.h @@ -0,0 +1,109 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". 
NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 
+ +#pragma once + +#include "foundation/PxVec4.h" +#include "Allocator.h" +#include "Fabric.h" +#include "Types.h" +#include "Range.h" + +namespace physx +{ + +namespace cloth +{ + +class SwFactory; + +struct SwTether +{ + SwTether(uint16_t, float); + uint16_t mAnchor; + float mLength; +}; + +class SwFabric : public UserAllocated, public Fabric +{ + public: +#if PX_WINDOWS + typedef AlignedVector<float, 32>::Type RestvalueContainer; // avx +#else + typedef AlignedVector<float, 16>::Type RestvalueContainer; +#endif + + SwFabric(SwFactory& factory, uint32_t numParticles, Range<const uint32_t> phases, Range<const uint32_t> sets, + Range<const float> restvalues, Range<const uint32_t> indices, Range<const uint32_t> anchors, + Range<const float> tetherLengths, Range<const uint32_t> triangles, uint32_t id); + + SwFabric& operator=(const SwFabric&); + + virtual ~SwFabric(); + + virtual Factory& getFactory() const; + + virtual uint32_t getNumPhases() const; + virtual uint32_t getNumRestvalues() const; + + virtual uint32_t getNumSets() const; + virtual uint32_t getNumIndices() const; + + virtual uint32_t getNumParticles() const; + + virtual uint32_t getNumTethers() const; + + virtual uint32_t getNumTriangles() const; + + virtual void scaleRestvalues(float); + virtual void scaleTetherLengths(float); + + public: + SwFactory& mFactory; + + uint32_t mNumParticles; + + Vector<uint32_t>::Type mPhases; // index of set to use + Vector<uint32_t>::Type mSets; // offset of first restvalue, with 0 prefix + + RestvalueContainer mRestvalues; // rest values (edge length) + Vector<uint16_t>::Type mIndices; // particle index pairs + + Vector<SwTether>::Type mTethers; + float mTetherLengthScale; + + Vector<uint16_t>::Type mTriangles; + + uint32_t mId; + + uint32_t mOriginalNumRestvalues; + +} PX_ALIGN_SUFFIX(16); +} +} diff --git a/PhysX_3.4/Source/LowLevelCloth/src/SwFactory.cpp b/PhysX_3.4/Source/LowLevelCloth/src/SwFactory.cpp new file mode 100644 index 00000000..92f17c98 --- /dev/null +++ 
b/PhysX_3.4/Source/LowLevelCloth/src/SwFactory.cpp @@ -0,0 +1,297 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 
+ +#include "foundation/PxMemory.h" +#include "SwFactory.h" +#include "SwFabric.h" +#include "SwCloth.h" +#include "SwSolver.h" +#include "ClothImpl.h" +#include <string.h> // for memcpy + +using namespace physx; + +namespace physx +{ +namespace cloth +{ +// defined in Factory.cpp +uint32_t getNextFabricId(); +} +} + +cloth::SwFactory::SwFactory() : Factory(CPU) +{ +} + +cloth::SwFactory::~SwFactory() +{ +} + +cloth::Fabric* cloth::SwFactory::createFabric(uint32_t numParticles, Range<const uint32_t> phases, + Range<const uint32_t> sets, Range<const float> restvalues, + Range<const uint32_t> indices, Range<const uint32_t> anchors, + Range<const float> tetherLengths, Range<const uint32_t> triangles) +{ + return new SwFabric(*this, numParticles, phases, sets, restvalues, indices, anchors, tetherLengths, triangles, + getNextFabricId()); +} + +cloth::Cloth* cloth::SwFactory::createCloth(Range<const PxVec4> particles, Fabric& fabric) +{ + return new SwClothImpl(*this, fabric, particles); +} + +cloth::Solver* cloth::SwFactory::createSolver(physx::PxTaskManager* taskMgr) +{ +#ifdef PX_PHYSX_GPU_EXPORTS + // SwSolver not defined in PhysXGpu project + PX_UNUSED(taskMgr); + return 0; +#else + return new SwSolver(taskMgr); +#endif +} + +cloth::Cloth* cloth::SwFactory::clone(const Cloth& cloth) +{ + if(cloth.getFactory().getPlatform() != Factory::CPU) + return cloth.clone(*this); // forward to CuCloth + + // copy construct + return new SwClothImpl(*this, static_cast<const SwClothImpl&>(cloth)); +} + +void cloth::SwFactory::extractFabricData(const Fabric& fabric, Range<uint32_t> phases, Range<uint32_t> sets, + Range<float> restvalues, Range<uint32_t> indices, Range<uint32_t> anchors, + Range<float> tetherLengths, Range<uint32_t> triangles) const +{ + const SwFabric& swFabric = static_cast<const SwFabric&>(fabric); + + PX_ASSERT(phases.empty() || phases.size() == swFabric.getNumPhases()); + PX_ASSERT(restvalues.empty() || restvalues.size() == swFabric.getNumRestvalues()); + 
PX_ASSERT(sets.empty() || sets.size() == swFabric.getNumSets()); + PX_ASSERT(indices.empty() || indices.size() == swFabric.getNumIndices()); + PX_ASSERT(anchors.empty() || anchors.size() == swFabric.getNumTethers()); + PX_ASSERT(tetherLengths.empty() || tetherLengths.size() == swFabric.getNumTethers()); + + for(uint32_t i = 0; !phases.empty(); ++i, phases.popFront()) + phases.front() = swFabric.mPhases[i]; + + const uint32_t* sEnd = swFabric.mSets.end(), *sIt; + const float* rBegin = swFabric.mRestvalues.begin(), *rIt = rBegin; + const uint16_t* iIt = swFabric.mIndices.begin(); + + uint32_t* sDst = sets.begin(); + float* rDst = restvalues.begin(); + uint32_t* iDst = indices.begin(); + + uint32_t numConstraints = 0; + for(sIt = swFabric.mSets.begin(); ++sIt != sEnd;) + { + const float* rEnd = rBegin + *sIt; + for(; rIt != rEnd; ++rIt) + { + uint16_t i0 = *iIt++; + uint16_t i1 = *iIt++; + + if(PxMax(i0, i1) >= swFabric.mNumParticles) + continue; + + if(!restvalues.empty()) + *rDst++ = *rIt; + + if(!indices.empty()) + { + *iDst++ = i0; + *iDst++ = i1; + } + + ++numConstraints; + } + + if(!sets.empty()) + *sDst++ = numConstraints; + } + + for(uint32_t i = 0; !anchors.empty(); ++i, anchors.popFront()) + anchors.front() = swFabric.mTethers[i].mAnchor; + + for(uint32_t i = 0; !tetherLengths.empty(); ++i, tetherLengths.popFront()) + tetherLengths.front() = swFabric.mTethers[i].mLength * swFabric.mTetherLengthScale; + + for(uint32_t i = 0; !triangles.empty(); ++i, triangles.popFront()) + triangles.front() = swFabric.mTriangles[i]; +} + +void cloth::SwFactory::extractCollisionData(const Cloth& cloth, Range<PxVec4> spheres, Range<uint32_t> capsules, + Range<PxVec4> planes, Range<uint32_t> convexes, Range<PxVec3> triangles) const +{ + PX_ASSERT(&cloth.getFactory() == this); + + const SwCloth& swCloth = static_cast<const SwClothImpl&>(cloth).mCloth; + + PX_ASSERT(spheres.empty() || spheres.size() == swCloth.mStartCollisionSpheres.size()); + PX_ASSERT(capsules.empty() || 
capsules.size() == swCloth.mCapsuleIndices.size() * 2); + PX_ASSERT(planes.empty() || planes.size() == swCloth.mStartCollisionPlanes.size()); + PX_ASSERT(convexes.empty() || convexes.size() == swCloth.mConvexMasks.size()); + PX_ASSERT(triangles.empty() || triangles.size() == swCloth.mStartCollisionTriangles.size()); + + if(!swCloth.mStartCollisionSpheres.empty() && !spheres.empty()) + memcpy(spheres.begin(), &swCloth.mStartCollisionSpheres.front(), + swCloth.mStartCollisionSpheres.size() * sizeof(PxVec4)); + + if(!swCloth.mCapsuleIndices.empty() && !capsules.empty()) + memcpy(capsules.begin(), &swCloth.mCapsuleIndices.front(), swCloth.mCapsuleIndices.size() * sizeof(IndexPair)); + + if(!swCloth.mStartCollisionPlanes.empty() && !planes.empty()) + memcpy(planes.begin(), &swCloth.mStartCollisionPlanes.front(), + swCloth.mStartCollisionPlanes.size() * sizeof(PxVec4)); + + if(!swCloth.mConvexMasks.empty() && !convexes.empty()) + memcpy(convexes.begin(), &swCloth.mConvexMasks.front(), swCloth.mConvexMasks.size() * sizeof(uint32_t)); + + if(!swCloth.mStartCollisionTriangles.empty() && !triangles.empty()) + memcpy(triangles.begin(), &swCloth.mStartCollisionTriangles.front(), + swCloth.mStartCollisionTriangles.size() * sizeof(PxVec3)); +} + +void cloth::SwFactory::extractMotionConstraints(const Cloth& cloth, Range<PxVec4> destConstraints) const +{ + PX_ASSERT(&cloth.getFactory() == this); + + const SwCloth& swCloth = static_cast<const SwClothImpl&>(cloth).mCloth; + + Vec4fAlignedVector const& srcConstraints = !swCloth.mMotionConstraints.mTarget.empty() + ? 
swCloth.mMotionConstraints.mTarget + : swCloth.mMotionConstraints.mStart; + + if(!srcConstraints.empty()) + { + // make sure dest array is big enough + PX_ASSERT(destConstraints.size() == srcConstraints.size()); + + memcpy(destConstraints.begin(), &srcConstraints.front(), srcConstraints.size() * sizeof(PxVec4)); + } +} + +void cloth::SwFactory::extractSeparationConstraints(const Cloth& cloth, Range<PxVec4> destConstraints) const +{ + PX_ASSERT(&cloth.getFactory() == this); + + const SwCloth& swCloth = static_cast<const SwClothImpl&>(cloth).mCloth; + + Vec4fAlignedVector const& srcConstraints = !swCloth.mSeparationConstraints.mTarget.empty() + ? swCloth.mSeparationConstraints.mTarget + : swCloth.mSeparationConstraints.mStart; + + if(!srcConstraints.empty()) + { + // make sure dest array is big enough + PX_ASSERT(destConstraints.size() == srcConstraints.size()); + + memcpy(destConstraints.begin(), &srcConstraints.front(), srcConstraints.size() * sizeof(PxVec4)); + } +} + +void cloth::SwFactory::extractParticleAccelerations(const Cloth& cloth, Range<PxVec4> destAccelerations) const +{ + PX_ASSERT(&cloth.getFactory() == this); + + const SwCloth& swCloth = static_cast<const SwClothImpl&>(cloth).mCloth; + + if(!swCloth.mParticleAccelerations.empty()) + { + // make sure dest array is big enough + PX_ASSERT(destAccelerations.size() == swCloth.mParticleAccelerations.size()); + + memcpy(destAccelerations.begin(), &swCloth.mParticleAccelerations.front(), + swCloth.mParticleAccelerations.size() * sizeof(PxVec4)); + } +} + +void cloth::SwFactory::extractVirtualParticles(const Cloth& cloth, Range<uint32_t[4]> indices, Range<PxVec3> weights) const +{ + PX_ASSERT(this == &cloth.getFactory()); + + const SwCloth& swCloth = static_cast<const SwClothImpl&>(cloth).mCloth; + + uint32_t numIndices = cloth.getNumVirtualParticles(); + uint32_t numWeights = cloth.getNumVirtualParticleWeights(); + + PX_ASSERT(indices.size() == numIndices || indices.empty()); + PX_ASSERT(weights.size() == 
numWeights || weights.empty()); + + if(weights.size() == numWeights) + { + PxVec3* wDestIt = reinterpret_cast<PxVec3*>(weights.begin()); + + // convert weights from vec4 to vec3 + cloth::Vec4fAlignedVector::ConstIterator wIt = swCloth.mVirtualParticleWeights.begin(); + cloth::Vec4fAlignedVector::ConstIterator wEnd = wIt + numWeights; + + for(; wIt != wEnd; ++wIt, ++wDestIt) + *wDestIt = PxVec3(wIt->x, wIt->y, wIt->z); + + PX_ASSERT(wDestIt == weights.end()); + } + if(indices.size() == numIndices) + { + // convert indices + Vec4u* iDestIt = reinterpret_cast<Vec4u*>(indices.begin()); + Vector<Vec4us>::Type::ConstIterator iIt = swCloth.mVirtualParticleIndices.begin(); + Vector<Vec4us>::Type::ConstIterator iEnd = swCloth.mVirtualParticleIndices.end(); + + uint32_t numParticles = uint32_t(swCloth.mCurParticles.size()); + + for(; iIt != iEnd; ++iIt) + { + // skip dummy indices + if(iIt->x < numParticles) + // byte offset to element index + *iDestIt++ = Vec4u(*iIt); + } + + PX_ASSERT(&array(*iDestIt) == indices.end()); + } +} + +void cloth::SwFactory::extractSelfCollisionIndices(const Cloth& cloth, Range<uint32_t> destIndices) const +{ + const SwCloth& swCloth = static_cast<const SwClothImpl&>(cloth).mCloth; + PX_ASSERT(destIndices.size() == swCloth.mSelfCollisionIndices.size()); + PxMemCopy(destIndices.begin(), swCloth.mSelfCollisionIndices.begin(), destIndices.size() * sizeof(uint32_t)); +} + +void cloth::SwFactory::extractRestPositions(const Cloth& cloth, Range<PxVec4> destRestPositions) const +{ + const SwCloth& swCloth = static_cast<const SwClothImpl&>(cloth).mCloth; + PX_ASSERT(destRestPositions.size() == swCloth.mRestPositions.size()); + PxMemCopy(destRestPositions.begin(), swCloth.mRestPositions.begin(), destRestPositions.size() * sizeof(PxVec4)); +} diff --git a/PhysX_3.4/Source/LowLevelCloth/src/SwFactory.h b/PhysX_3.4/Source/LowLevelCloth/src/SwFactory.h new file mode 100644 index 00000000..154fb965 --- /dev/null +++ 
b/PhysX_3.4/Source/LowLevelCloth/src/SwFactory.h @@ -0,0 +1,90 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 
+ +#pragma once + +#include "Factory.h" +#include "Allocator.h" + +namespace physx +{ + +namespace cloth +{ + +class SwFabric; +class SwCloth; +template <typename> +class ClothImpl; + +class SwFactory : public UserAllocated, public Factory +{ + public: + typedef SwFabric FabricType; + typedef ClothImpl<SwCloth> ImplType; + + SwFactory(); + virtual ~SwFactory(); + + virtual Fabric* createFabric(uint32_t numParticles, Range<const uint32_t> phases, Range<const uint32_t> sets, + Range<const float> restvalues, Range<const uint32_t> indices, + Range<const uint32_t> anchors, Range<const float> tetherLengths, + Range<const uint32_t> triangles); + + virtual Cloth* createCloth(Range<const PxVec4> particles, Fabric& fabric); + + virtual Solver* createSolver(physx::PxTaskManager*); + + virtual Cloth* clone(const Cloth& cloth); + + virtual void extractFabricData(const Fabric& fabric, Range<uint32_t> phases, Range<uint32_t> sets, + Range<float> restvalues, Range<uint32_t> indices, Range<uint32_t> anchors, + Range<float> tetherLengths, Range<uint32_t> triangles) const; + + virtual void extractCollisionData(const Cloth& cloth, Range<PxVec4> spheres, Range<uint32_t> capsules, + Range<PxVec4> planes, Range<uint32_t> convexes, Range<PxVec3> triangles) const; + + virtual void extractMotionConstraints(const Cloth& cloth, Range<PxVec4> destConstraints) const; + + virtual void extractSeparationConstraints(const Cloth& cloth, Range<PxVec4> destConstraints) const; + + virtual void extractParticleAccelerations(const Cloth& cloth, Range<PxVec4> destAccelerations) const; + + virtual void extractVirtualParticles(const Cloth& cloth, Range<uint32_t[4]> destIndices, + Range<PxVec3> destWeights) const; + + virtual void extractSelfCollisionIndices(const Cloth& cloth, Range<uint32_t> destIndices) const; + + virtual void extractRestPositions(const Cloth& cloth, Range<PxVec4> destRestPositions) const; + + public: + Vector<SwFabric*>::Type mFabrics; +}; +} +} diff --git 
a/PhysX_3.4/Source/LowLevelCloth/src/SwInterCollision.cpp b/PhysX_3.4/Source/LowLevelCloth/src/SwInterCollision.cpp new file mode 100644 index 00000000..d0c8691a --- /dev/null +++ b/PhysX_3.4/Source/LowLevelCloth/src/SwInterCollision.cpp @@ -0,0 +1,714 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. 
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#include "foundation/PxProfiler.h" +#include "foundation/PxMemory.h" +#include "SwInterCollision.h" +#include "SwCollisionHelpers.h" +#include "BoundingBox.h" +#include "PsIntrinsics.h" +#include "PsSort.h" + +using namespace physx; + +namespace +{ + +const Simd4fTupleFactory sMaskXYZ = simd4f(simd4i(~0, ~0, ~0, 0)); +const Simd4fTupleFactory sMaskW = simd4f(simd4i(0, 0, 0, ~0)); +const Simd4fScalarFactory sEpsilon = simd4f(FLT_EPSILON); +const Simd4fTupleFactory sZeroW = simd4f(-FLT_MAX, -FLT_MAX, -FLT_MAX, 0.0f); + +// returns sorted indices, output needs to be at least 2*(last-first)+1024 +void radixSort(const uint32_t* first, const uint32_t* last, uint32_t* out) +{ + uint32_t n = uint32_t(last - first); + + uint32_t* buffer = out + 2 * n; + uint32_t* __restrict histograms[] = { buffer, buffer + 256, buffer + 512, buffer + 768 }; + + PxMemZero(buffer, 1024 * sizeof(uint32_t)); + + // build 3 histograms in one pass + for(const uint32_t* __restrict it = first; it != last; ++it) + { + uint32_t key = *it; + ++histograms[0][0xff & key]; + ++histograms[1][0xff & (key >> 8)]; + ++histograms[2][0xff & (key >> 16)]; + ++histograms[3][key >> 24]; + } + + // convert histograms to offset tables in-place + uint32_t sums[4] = {}; + for(uint32_t i = 0; i < 256; ++i) + { + uint32_t temp0 = histograms[0][i] + sums[0]; + histograms[0][i] = sums[0]; + sums[0] = temp0; + + uint32_t temp1 = histograms[1][i] + sums[1]; + histograms[1][i] = sums[1]; + sums[1] = temp1; + + uint32_t temp2 = histograms[2][i] + sums[2]; + histograms[2][i] = sums[2]; + sums[2] = temp2; + + uint32_t temp3 = histograms[3][i] + sums[3]; + histograms[3][i] = sums[3]; + sums[3] = temp3; + } + + PX_ASSERT(sums[0] == n && sums[1] == n && sums[2] == n && sums[3] == n); + +#if PX_DEBUG + memset(out, 0xff, 2 * n * sizeof(uint32_t)); +#endif + + // sort 8 bits per pass + + uint32_t* __restrict indices[] = { out, out + n }; + + for(uint32_t i = 0; i != n; 
++i) + indices[1][histograms[0][0xff & first[i]]++] = i; + + for(uint32_t i = 0, index; i != n; ++i) + { + index = indices[1][i]; + indices[0][histograms[1][0xff & (first[index] >> 8)]++] = index; + } + + for(uint32_t i = 0, index; i != n; ++i) + { + index = indices[0][i]; + indices[1][histograms[2][0xff & (first[index] >> 16)]++] = index; + } + + for(uint32_t i = 0, index; i != n; ++i) + { + index = indices[1][i]; + indices[0][histograms[3][first[index] >> 24]++] = index; + } +} + +template <typename Simd4f> +uint32_t longestAxis(const Simd4f& edgeLength) +{ + const float* e = array(edgeLength); + + if(e[0] > e[1]) + return uint32_t(e[0] > e[2] ? 0 : 2); + else + return uint32_t(e[1] > e[2] ? 1 : 2); +} +} + +template <typename Simd4f> +cloth::SwInterCollision<Simd4f>::SwInterCollision(const cloth::SwInterCollisionData* instances, uint32_t n, + float colDist, float stiffness, uint32_t iterations, + InterCollisionFilter filter, cloth::SwKernelAllocator& alloc) +: mInstances(instances) +, mNumInstances(n) +, mClothIndices(NULL) +, mParticleIndices(NULL) +, mNumParticles(0) +, mTotalParticles(0) +, mFilter(filter) +, mAllocator(alloc) +{ + PX_ASSERT(mFilter); + + mCollisionDistance = simd4f(colDist, colDist, colDist, 0.0f); + mCollisionSquareDistance = mCollisionDistance * mCollisionDistance; + mStiffness = simd4f(stiffness); + mNumIterations = iterations; + + // calculate particle size + for(uint32_t i = 0; i < n; ++i) + mTotalParticles += instances[i].mNumParticles; +} + +template <typename Simd4f> +cloth::SwInterCollision<Simd4f>::~SwInterCollision() +{ +} + +namespace +{ +// multiple x by m leaving w component of x intact +template <typename Simd4f> +PX_INLINE Simd4f transform(const Simd4f m[4], const Simd4f& x) +{ + const Simd4f a = m[3] + splat<0>(x) * m[0] + splat<1>(x) * m[1] + splat<2>(x) * m[2]; + return select(sMaskXYZ, a, x); +} + +// rotate x by m leaving w component intact +template <typename Simd4f> +PX_INLINE Simd4f rotate(const Simd4f m[4], const 
Simd4f& x) +{ + const Simd4f a = splat<0>(x) * m[0] + splat<1>(x) * m[1] + splat<2>(x) * m[2]; + return select(sMaskXYZ, a, x); +} + +template <typename Simd4f> +struct ClothSorter +{ + typedef cloth::BoundingBox<Simd4f> BoundingBox; + + ClothSorter(BoundingBox* bounds, uint32_t n, uint32_t axis) : mBounds(bounds), mNumBounds(n), mAxis(axis) + { + } + + bool operator()(uint32_t i, uint32_t j) const + { + PX_ASSERT(i < mNumBounds); + PX_ASSERT(j < mNumBounds); + + return array(mBounds[i].mLower)[mAxis] < array(mBounds[j].mLower)[mAxis]; + } + + BoundingBox* mBounds; + uint32_t mNumBounds; + uint32_t mAxis; +}; + +// for the given cloth array this function calculates the set of particles +// which potentially interact, the potential colliders are returned with their +// cloth index and particle index in clothIndices and particleIndices, the +// function returns the number of potential colliders +template <typename Simd4f> +uint32_t calculatePotentialColliders(const cloth::SwInterCollisionData* cBegin, const cloth::SwInterCollisionData* cEnd, + const Simd4f& colDist, uint16_t* clothIndices, uint32_t* particleIndices, + cloth::BoundingBox<Simd4f>& bounds, uint32_t* overlapMasks, + cloth::InterCollisionFilter filter, cloth::SwKernelAllocator& allocator) +{ + using namespace cloth; + + typedef BoundingBox<Simd4f> BoundingBox; + + uint32_t numParticles = 0; + const uint32_t numCloths = uint32_t(cEnd - cBegin); + + // bounds of each cloth objects in world space + BoundingBox* const clothBounds = static_cast<BoundingBox*>(allocator.allocate(numCloths * sizeof(BoundingBox))); + BoundingBox* const overlapBounds = static_cast<BoundingBox*>(allocator.allocate(numCloths * sizeof(BoundingBox))); + + // union of all cloth world bounds + BoundingBox totalClothBounds = emptyBounds<Simd4f>(); + + uint32_t* sortedIndices = static_cast<uint32_t*>(allocator.allocate(numCloths * sizeof(uint32_t))); + + for(uint32_t i = 0; i < numCloths; ++i) + { + const SwInterCollisionData& c = 
cBegin[i]; + + // transform bounds from b local space to local space of a + PxBounds3 lcBounds = PxBounds3::centerExtents(c.mBoundsCenter, c.mBoundsHalfExtent + PxVec3(array(colDist)[0])); + PX_ASSERT(!lcBounds.isEmpty()); + PxBounds3 cWorld = PxBounds3::transformFast(c.mGlobalPose, lcBounds); + + BoundingBox cBounds = { simd4f(cWorld.minimum.x, cWorld.minimum.y, cWorld.minimum.z, 0.0f), + simd4f(cWorld.maximum.x, cWorld.maximum.y, cWorld.maximum.z, 0.0f) }; + + sortedIndices[i] = i; + clothBounds[i] = cBounds; + + totalClothBounds = expandBounds(totalClothBounds, cBounds); + } + + // sort indices by their minimum extent on the longest axis + const uint32_t sweepAxis = longestAxis(totalClothBounds.mUpper - totalClothBounds.mLower); + + ClothSorter<Simd4f> predicate(clothBounds, numCloths, sweepAxis); + shdfnd::sort(sortedIndices, numCloths, predicate); + + for(uint32_t i = 0; i < numCloths; ++i) + { + PX_ASSERT(sortedIndices[i] < numCloths); + + const SwInterCollisionData& a = cBegin[sortedIndices[i]]; + + // local bounds + const Simd4f aCenter = load(reinterpret_cast<const float*>(&a.mBoundsCenter)); + const Simd4f aHalfExtent = load(reinterpret_cast<const float*>(&a.mBoundsHalfExtent)) + colDist; + const BoundingBox aBounds = { aCenter - aHalfExtent, aCenter + aHalfExtent }; + + const PxMat44 aToWorld(a.mGlobalPose); + const PxTransform aToLocal(a.mGlobalPose.getInverse()); + + const float axisMin = array(clothBounds[sortedIndices[i]].mLower)[sweepAxis]; + const float axisMax = array(clothBounds[sortedIndices[i]].mUpper)[sweepAxis]; + + uint32_t overlapMask = 0; + uint32_t numOverlaps = 0; + + // scan back to find first intersecting bounding box + uint32_t startIndex = i; + while(startIndex > 0 && array(clothBounds[sortedIndices[startIndex]].mUpper)[sweepAxis] > axisMin) + --startIndex; + + // compute all overlapping bounds + for(uint32_t j = startIndex; j < numCloths; ++j) + { + // ignore self-collision + if(i == j) + continue; + + // early out if no more cloths 
along axis intersect us + if(array(clothBounds[sortedIndices[j]].mLower)[sweepAxis] > axisMax) + break; + + const SwInterCollisionData& b = cBegin[sortedIndices[j]]; + + // check if collision between these shapes is filtered + if(!filter(a.mUserData, b.mUserData)) + continue; + + // set mask bit for this cloth + overlapMask |= 1 << sortedIndices[j]; + + // transform bounds from b local space to local space of a + PxBounds3 lcBounds = + PxBounds3::centerExtents(b.mBoundsCenter, b.mBoundsHalfExtent + PxVec3(array(colDist)[0])); + PX_ASSERT(!lcBounds.isEmpty()); + PxBounds3 bLocal = PxBounds3::transformFast(aToLocal * b.mGlobalPose, lcBounds); + + BoundingBox bBounds = { simd4f(bLocal.minimum.x, bLocal.minimum.y, bLocal.minimum.z, 0.0f), + simd4f(bLocal.maximum.x, bLocal.maximum.y, bLocal.maximum.z, 0.0f) }; + + BoundingBox iBounds = intersectBounds(aBounds, bBounds); + + // setup bounding box w to make point containment test cheaper + Simd4f floatMax = gSimd4fFloatMax & static_cast<Simd4f>(sMaskW); + iBounds.mLower = (iBounds.mLower & sMaskXYZ) | -floatMax; + iBounds.mUpper = (iBounds.mUpper & sMaskXYZ) | floatMax; + + if(!isEmptyBounds(iBounds)) + overlapBounds[numOverlaps++] = iBounds; + } + + //---------------------------------------------------------------- + // cull all particles to overlapping bounds and transform particles to world space + + const uint32_t clothIndex = sortedIndices[i]; + overlapMasks[clothIndex] = overlapMask; + + Simd4f* pBegin = reinterpret_cast<Simd4f*>(a.mParticles); + Simd4f* qBegin = reinterpret_cast<Simd4f*>(a.mPrevParticles); + + const Simd4f xform[4] = { load(reinterpret_cast<const float*>(&aToWorld.column0)), + load(reinterpret_cast<const float*>(&aToWorld.column1)), + load(reinterpret_cast<const float*>(&aToWorld.column2)), + load(reinterpret_cast<const float*>(&aToWorld.column3)) }; + + Simd4f impulseInvScale = recip(Simd4f(simd4f(cBegin[clothIndex].mImpulseScale))); + + for(uint32_t k = 0; k < a.mNumParticles; ++k) + { + Simd4f* 
pIt = a.mIndices ? pBegin + a.mIndices[k] : pBegin + k; + Simd4f* qIt = a.mIndices ? qBegin + a.mIndices[k] : qBegin + k; + + const Simd4f p = *pIt; + + for(const BoundingBox* oIt = overlapBounds, *oEnd = overlapBounds + numOverlaps; oIt != oEnd; ++oIt) + { + // point in box test + if(anyGreater(oIt->mLower, p) != 0) + continue; + if(anyGreater(p, oIt->mUpper) != 0) + continue; + + // transform particle to world space in-place + // (will be transformed back after collision) + *pIt = transform(xform, p); + + Simd4f impulse = (p - *qIt) * impulseInvScale; + *qIt = rotate(xform, impulse); + + // update world bounds + bounds = expandBounds(bounds, pIt, pIt + 1); + + // add particle to output arrays + clothIndices[numParticles] = uint16_t(clothIndex); + particleIndices[numParticles] = uint32_t(pIt - pBegin); + + // output each particle only once + ++numParticles; + break; + } + } + } + + allocator.deallocate(sortedIndices); + allocator.deallocate(overlapBounds); + allocator.deallocate(clothBounds); + + return numParticles; +} +} + +template <typename Simd4f> +PX_INLINE Simd4f& cloth::SwInterCollision<Simd4f>::getParticle(uint32_t index) +{ + PX_ASSERT(index < mNumParticles); + + uint16_t clothIndex = mClothIndices[index]; + uint32_t particleIndex = mParticleIndices[index]; + + PX_ASSERT(clothIndex < mNumInstances); + + return reinterpret_cast<Simd4f&>(mInstances[clothIndex].mParticles[particleIndex]); +} + +template <typename Simd4f> +void cloth::SwInterCollision<Simd4f>::operator()() +{ + mNumTests = mNumCollisions = 0; + + mClothIndices = static_cast<uint16_t*>(mAllocator.allocate(sizeof(uint16_t) * mTotalParticles)); + mParticleIndices = static_cast<uint32_t*>(mAllocator.allocate(sizeof(uint32_t) * mTotalParticles)); + mOverlapMasks = static_cast<uint32_t*>(mAllocator.allocate(sizeof(uint32_t*) * mNumInstances)); + + for(uint32_t k = 0; k < mNumIterations; ++k) + { + // world bounds of particles + BoundingBox<Simd4f> bounds = emptyBounds<Simd4f>(); + + // calculate 
potentially colliding set + { + PX_PROFILE_ZONE("cloth::SwInterCollision::BroadPhase", 0); + + mNumParticles = + calculatePotentialColliders(mInstances, mInstances + mNumInstances, mCollisionDistance, mClothIndices, + mParticleIndices, bounds, mOverlapMasks, mFilter, mAllocator); + } + + // collide + if(mNumParticles) + { + PX_PROFILE_ZONE("cloth::SwInterCollision::Collide", 0); + + Simd4f lowerBound = bounds.mLower; + Simd4f edgeLength = max(bounds.mUpper - lowerBound, sEpsilon); + + // sweep along longest axis + uint32_t sweepAxis = longestAxis(edgeLength); + uint32_t hashAxis0 = (sweepAxis + 1) % 3; + uint32_t hashAxis1 = (sweepAxis + 2) % 3; + + // reserve 0, 127, and 65535 for sentinel + Simd4f cellSize = max(mCollisionDistance, simd4f(1.0f / 253) * edgeLength); + array(cellSize)[sweepAxis] = array(edgeLength)[sweepAxis] / 65533; + + Simd4f one = gSimd4fOne; + Simd4f gridSize = simd4f(254.0f); + array(gridSize)[sweepAxis] = 65534.0f; + + Simd4f gridScale = recip<1>(cellSize); + Simd4f gridBias = -lowerBound * gridScale + one; + + void* buffer = mAllocator.allocate(getBufferSize(mNumParticles)); + + uint32_t* __restrict sortedIndices = reinterpret_cast<uint32_t*>(buffer); + uint32_t* __restrict sortedKeys = sortedIndices + mNumParticles; + uint32_t* __restrict keys = PxMax(sortedKeys + mNumParticles, sortedIndices + 2 * mNumParticles + 1024); + + typedef typename Simd4fToSimd4i<Simd4f>::Type Simd4i; + + // create keys + for(uint32_t i = 0; i < mNumParticles; ++i) + { + // grid coordinate + Simd4f indexf = getParticle(i) * gridScale + gridBias; + + // need to clamp index because shape collision potentially + // pushes particles outside of their original bounds + Simd4i indexi = intFloor(max(one, min(indexf, gridSize))); + + const int32_t* ptr = array(indexi); + keys[i] = uint32_t(ptr[sweepAxis] | (ptr[hashAxis0] << 16) | (ptr[hashAxis1] << 24)); + } + + // compute sorted keys indices + radixSort(keys, keys + mNumParticles, sortedIndices); + + // snoop histogram: 
offset of first index with 8 msb > 1 (0 is sentinel) + uint32_t firstColumnSize = sortedIndices[2 * mNumParticles + 769]; + + // sort keys + for(uint32_t i = 0; i < mNumParticles; ++i) + sortedKeys[i] = keys[sortedIndices[i]]; + sortedKeys[mNumParticles] = uint32_t(-1); // sentinel + + // calculate the number of buckets we need to search forward + const Simd4i data = intFloor(gridScale * mCollisionDistance); + uint32_t collisionDistance = uint32_t(2 + array(data)[sweepAxis]); + + // collide particles + collideParticles(sortedKeys, firstColumnSize, sortedIndices, mNumParticles, collisionDistance); + + mAllocator.deallocate(buffer); + } + + /* + // verify against brute force (disable collision response when testing) + uint32_t numCollisions = mNumCollisions; + mNumCollisions = 0; + + for(uint32_t i = 0; i < mNumParticles; ++i) + for(uint32_t j = i+1; j < mNumParticles; ++j) + if (mOverlapMasks[mClothIndices[i]] & (1 << mClothIndices[j])) + collideParticles(getParticle(i), getParticle(j)); + + static uint32_t iter = 0; ++iter; + if(numCollisions != mNumCollisions) + printf("%u: %u != %u\n", iter, numCollisions, mNumCollisions); + */ + + // transform back to local space + { + PX_PROFILE_ZONE("cloth::SwInterCollision::PostTransform", 0); + + Simd4f toLocal[4], impulseScale; + uint16_t lastCloth = uint16_t(0xffff); + + for(uint32_t i = 0; i < mNumParticles; ++i) + { + uint16_t clothIndex = mClothIndices[i]; + const SwInterCollisionData* instance = mInstances + clothIndex; + + // todo: could pre-compute these inverses + if(clothIndex != lastCloth) + { + const PxMat44 xform(instance->mGlobalPose.getInverse()); + + toLocal[0] = load(reinterpret_cast<const float*>(&xform.column0)); + toLocal[1] = load(reinterpret_cast<const float*>(&xform.column1)); + toLocal[2] = load(reinterpret_cast<const float*>(&xform.column2)); + toLocal[3] = load(reinterpret_cast<const float*>(&xform.column3)); + + impulseScale = simd4f(instance->mImpulseScale); + + lastCloth = mClothIndices[i]; + } + 
+ uint32_t particleIndex = mParticleIndices[i]; + Simd4f& particle = reinterpret_cast<Simd4f&>(instance->mParticles[particleIndex]); + Simd4f& impulse = reinterpret_cast<Simd4f&>(instance->mPrevParticles[particleIndex]); + + particle = transform(toLocal, particle); + // avoid w becoming negative due to numerical inaccuracies + impulse = max(sZeroW, particle - rotate(toLocal, Simd4f(impulse * impulseScale))); + } + } + } + + mAllocator.deallocate(mOverlapMasks); + mAllocator.deallocate(mParticleIndices); + mAllocator.deallocate(mClothIndices); +} + +template <typename Simd4f> +size_t cloth::SwInterCollision<Simd4f>::estimateTemporaryMemory(SwInterCollisionData* cloths, uint32_t n) +{ + // count total particles + uint32_t numParticles = 0; + for(uint32_t i = 0; i < n; ++i) + numParticles += cloths[i].mNumParticles; + + uint32_t boundsSize = 2 * n * sizeof(BoundingBox<Simd4f>) + n * sizeof(uint32_t); + uint32_t clothIndicesSize = numParticles * sizeof(uint16_t); + uint32_t particleIndicesSize = numParticles * sizeof(uint32_t); + uint32_t masksSize = n * sizeof(uint32_t); + + return boundsSize + clothIndicesSize + particleIndicesSize + masksSize + getBufferSize(numParticles); +} + +template <typename Simd4f> +size_t physx::cloth::SwInterCollision<Simd4f>::getBufferSize(uint32_t numParticles) +{ + uint32_t keysSize = numParticles * sizeof(uint32_t); + uint32_t indicesSize = numParticles * sizeof(uint32_t); + uint32_t histogramSize = 1024 * sizeof(uint32_t); + + return keysSize + indicesSize + PxMax(indicesSize + histogramSize, keysSize); +} + +template <typename Simd4f> +void cloth::SwInterCollision<Simd4f>::collideParticle(uint32_t index) +{ + uint16_t clothIndex = mClothIndices[index]; + + if((1 << clothIndex) & ~mClothMask) + return; + + const SwInterCollisionData* instance = mInstances + clothIndex; + + uint32_t particleIndex = mParticleIndices[index]; + Simd4f& particle = reinterpret_cast<Simd4f&>(instance->mParticles[particleIndex]); + + Simd4f diff = particle - 
mParticle; + Simd4f distSqr = dot3(diff, diff); + +#if PX_DEBUG + ++mNumTests; +#endif + + if(allGreater(distSqr, mCollisionSquareDistance)) + return; + + Simd4f w0 = splat<3>(mParticle); + Simd4f w1 = splat<3>(particle); + + Simd4f ratio = mCollisionDistance * rsqrt<1>(distSqr); + Simd4f scale = mStiffness * recip<1>(sEpsilon + w0 + w1); + Simd4f delta = (scale * (diff - diff * ratio)) & sMaskXYZ; + + mParticle = mParticle + delta * w0; + particle = particle - delta * w1; + + Simd4f& impulse = reinterpret_cast<Simd4f&>(instance->mPrevParticles[particleIndex]); + + mImpulse = mImpulse + delta * w0; + impulse = impulse - delta * w1; + +#if PX_DEBUG || PX_PROFILE + ++mNumCollisions; +#endif +} + +template <typename Simd4f> +void cloth::SwInterCollision<Simd4f>::collideParticles(const uint32_t* keys, uint32_t firstColumnSize, + const uint32_t* indices, uint32_t numParticles, + uint32_t collisionDistance) +{ + const uint32_t bucketMask = uint16_t(-1); + + const uint32_t keyOffsets[] = { 0, 0x00010000, 0x00ff0000, 0x01000000, 0x01010000 }; + + const uint32_t* __restrict kFirst[5]; + const uint32_t* __restrict kLast[5]; + + { + // optimization: scan forward iterator starting points once instead of 9 times + const uint32_t* __restrict kIt = keys; + + uint32_t key = *kIt; + uint32_t firstKey = key - PxMin(collisionDistance, key & bucketMask); + uint32_t lastKey = PxMin(key + collisionDistance, key | bucketMask); + + kFirst[0] = kIt; + while(*kIt < lastKey) + ++kIt; + kLast[0] = kIt; + + for(uint32_t k = 1; k < 5; ++k) + { + for(uint32_t n = firstKey + keyOffsets[k]; *kIt < n;) + ++kIt; + kFirst[k] = kIt; + + for(uint32_t n = lastKey + keyOffsets[k]; *kIt < n;) + ++kIt; + kLast[k] = kIt; + + // jump forward once to second column + kIt = keys + firstColumnSize; + firstColumnSize = 0; + } + } + + const uint32_t* __restrict iIt = indices; + const uint32_t* __restrict iEnd = indices + numParticles; + + const uint32_t* __restrict jIt; + const uint32_t* __restrict jEnd; + + for(; 
iIt != iEnd; ++iIt, ++kFirst[0]) + { + // load current particle once outside of inner loop + uint32_t index = *iIt; + PX_ASSERT(index < mNumParticles); + mClothIndex = mClothIndices[index]; + PX_ASSERT(mClothIndex < mNumInstances); + mClothMask = mOverlapMasks[mClothIndex]; + + const SwInterCollisionData* instance = mInstances + mClothIndex; + + mParticleIndex = mParticleIndices[index]; + mParticle = reinterpret_cast<const Simd4f&>(instance->mParticles[mParticleIndex]); + mImpulse = reinterpret_cast<const Simd4f&>(instance->mPrevParticles[mParticleIndex]); + + uint32_t key = *kFirst[0]; + + // range of keys we need to check against for this particle + uint32_t firstKey = key - PxMin(collisionDistance, key & bucketMask); + uint32_t lastKey = PxMin(key + collisionDistance, key | bucketMask); + + // scan forward end point + while(*kLast[0] < lastKey) + ++kLast[0]; + + // process potential colliders of same cell + jEnd = indices + (kLast[0] - keys); + for(jIt = iIt + 1; jIt != jEnd; ++jIt) + collideParticle(*jIt); + + // process neighbor cells + for(uint32_t k = 1; k < 5; ++k) + { + // scan forward start point + for(uint32_t n = firstKey + keyOffsets[k]; *kFirst[k] < n;) + ++kFirst[k]; + + // scan forward end point + for(uint32_t n = lastKey + keyOffsets[k]; *kLast[k] < n;) + ++kLast[k]; + + // process potential colliders + jEnd = indices + (kLast[k] - keys); + for(jIt = indices + (kFirst[k] - keys); jIt != jEnd; ++jIt) + collideParticle(*jIt); + } + + // write back particle and impulse + reinterpret_cast<Simd4f&>(instance->mParticles[mParticleIndex]) = mParticle; + reinterpret_cast<Simd4f&>(instance->mPrevParticles[mParticleIndex]) = mImpulse; + } +} + +// explicit template instantiation +#if NV_SIMD_SIMD +template class cloth::SwInterCollision<Simd4f>; +#endif +#if NV_SIMD_SCALAR +template class cloth::SwInterCollision<Scalar4f>; +#endif diff --git a/PhysX_3.4/Source/LowLevelCloth/src/SwInterCollision.h b/PhysX_3.4/Source/LowLevelCloth/src/SwInterCollision.h new file 
mode 100644 index 00000000..7488f62c --- /dev/null +++ b/PhysX_3.4/Source/LowLevelCloth/src/SwInterCollision.h @@ -0,0 +1,144 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 
+ +#pragma once + +#include "Types.h" +#include "StackAllocator.h" +#include "Simd.h" + +#include "foundation/PxMat44.h" +#include "foundation/PxTransform.h" +#include "foundation/PxBounds3.h" + +namespace physx +{ +namespace cloth +{ + +class SwCloth; +struct SwClothData; + +typedef StackAllocator<16> SwKernelAllocator; + +typedef bool (*InterCollisionFilter)(void* cloth0, void* cloth1); + +struct SwInterCollisionData +{ + SwInterCollisionData() + { + } + SwInterCollisionData(PxVec4* particles, PxVec4* prevParticles, uint32_t numParticles, uint32_t* indices, + const PxTransform& globalPose, const PxVec3& boundsCenter, const PxVec3& boundsHalfExtents, + float impulseScale, void* userData) + : mParticles(particles) + , mPrevParticles(prevParticles) + , mNumParticles(numParticles) + , mIndices(indices) + , mGlobalPose(globalPose) + , mBoundsCenter(boundsCenter) + , mBoundsHalfExtent(boundsHalfExtents) + , mImpulseScale(impulseScale) + , mUserData(userData) + { + } + + PxVec4* mParticles; + PxVec4* mPrevParticles; + uint32_t mNumParticles; + uint32_t* mIndices; + PxTransform mGlobalPose; + PxVec3 mBoundsCenter; + PxVec3 mBoundsHalfExtent; + float mImpulseScale; + void* mUserData; +}; + +template <typename Simd4f> +class SwInterCollision +{ + + public: + SwInterCollision(const SwInterCollisionData* cloths, uint32_t n, float colDist, float stiffness, + uint32_t iterations, InterCollisionFilter filter, cloth::SwKernelAllocator& alloc); + + ~SwInterCollision(); + + void operator()(); + + static size_t estimateTemporaryMemory(SwInterCollisionData* cloths, uint32_t n); + + private: + SwInterCollision& operator=(const SwInterCollision&); // not implemented + + static size_t getBufferSize(uint32_t); + + void collideParticles(const uint32_t* keys, uint32_t firstColumnSize, const uint32_t* sortedIndices, + uint32_t numParticles, uint32_t collisionDistance); + + Simd4f& getParticle(uint32_t index); + + // better wrap these in a struct + void collideParticle(uint32_t index); + + 
Simd4f mParticle; + Simd4f mImpulse; + + Simd4f mCollisionDistance; + Simd4f mCollisionSquareDistance; + Simd4f mStiffness; + + uint16_t mClothIndex; + uint32_t mClothMask; + uint32_t mParticleIndex; + + uint32_t mNumIterations; + + const SwInterCollisionData* mInstances; + uint32_t mNumInstances; + + uint16_t* mClothIndices; + uint32_t* mParticleIndices; + uint32_t mNumParticles; + uint32_t* mOverlapMasks; + + uint32_t mTotalParticles; + + InterCollisionFilter mFilter; + + SwKernelAllocator& mAllocator; + + public: + mutable uint32_t mNumTests; + mutable uint32_t mNumCollisions; +}; + +} // namespace cloth + +} // namespace physx diff --git a/PhysX_3.4/Source/LowLevelCloth/src/SwSelfCollision.cpp b/PhysX_3.4/Source/LowLevelCloth/src/SwSelfCollision.cpp new file mode 100644 index 00000000..122de902 --- /dev/null +++ b/PhysX_3.4/Source/LowLevelCloth/src/SwSelfCollision.cpp @@ -0,0 +1,426 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. 
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#include "foundation/PxMemory.h" +#include "SwSelfCollision.h" +#include "SwCloth.h" +#include "SwClothData.h" +#include "SwCollisionHelpers.h" + +using namespace physx; + +namespace +{ + +const Simd4fTupleFactory sMaskXYZ = simd4f(simd4i(~0, ~0, ~0, 0)); + +// returns sorted indices, output needs to be at least 2*(last-first)+1024 +void radixSort(const uint32_t* first, const uint32_t* last, uint16_t* out) +{ + uint16_t n = uint16_t(last - first); + + uint16_t* buffer = out + 2 * n; + uint16_t* __restrict histograms[] = { buffer, buffer + 256, buffer + 512, buffer + 768 }; + + PxMemZero(buffer, 1024 * sizeof(uint16_t)); + + // build 3 histograms in one pass + for(const uint32_t* __restrict it = first; it != last; ++it) + { + uint32_t key = *it; + ++histograms[0][0xff & key]; + ++histograms[1][0xff & (key >> 8)]; + ++histograms[2][0xff & (key >> 16)]; + ++histograms[3][key >> 24]; + } + + // convert histograms to offset tables in-place + uint16_t sums[4] = {}; + for(uint32_t i = 0; i < 256; ++i) + { + uint16_t temp0 = uint16_t(histograms[0][i] + sums[0]); + histograms[0][i] = sums[0]; + sums[0] = temp0; + + uint16_t temp1 = 
uint16_t(histograms[1][i] + sums[1]); + histograms[1][i] = sums[1]; + sums[1] = temp1; + + uint16_t temp2 = uint16_t(histograms[2][i] + sums[2]); + histograms[2][i] = sums[2]; + sums[2] = temp2; + + uint16_t temp3 = uint16_t(histograms[3][i] + sums[3]); + histograms[3][i] = sums[3]; + sums[3] = temp3; + } + + PX_ASSERT(sums[0] == n && sums[1] == n && sums[2] == n && sums[3] == n); + +#if PX_DEBUG + memset(out, 0xff, 2 * n * sizeof(uint16_t)); +#endif + + // sort 8 bits per pass + + uint16_t* __restrict indices[] = { out, out + n }; + + for(uint16_t i = 0; i != n; ++i) + indices[1][histograms[0][0xff & first[i]]++] = i; + + for(uint16_t i = 0, index; i != n; ++i) + { + index = indices[1][i]; + indices[0][histograms[1][0xff & (first[index] >> 8)]++] = index; + } + + for(uint16_t i = 0, index; i != n; ++i) + { + index = indices[0][i]; + indices[1][histograms[2][0xff & (first[index] >> 16)]++] = index; + } + + for(uint16_t i = 0, index; i != n; ++i) + { + index = indices[1][i]; + indices[0][histograms[3][first[index] >> 24]++] = index; + } +} + +template <typename Simd4f> +uint32_t longestAxis(const Simd4f& edgeLength) +{ + const float* e = array(edgeLength); + + if(e[0] > e[1]) + return uint32_t(e[0] > e[2] ? 0 : 2); + else + return uint32_t(e[1] > e[2] ? 
1 : 2); +} + +bool isSelfCollisionEnabled(const cloth::SwClothData& cloth) +{ + return PxMin(cloth.mSelfCollisionDistance, cloth.mSelfCollisionStiffness) > 0.0f; +} + +bool isSelfCollisionEnabled(const cloth::SwCloth& cloth) +{ + return PxMin(cloth.mSelfCollisionDistance, -cloth.mSelfCollisionLogStiffness) > 0.0f; +} + +inline uint32_t align2(uint32_t x) +{ + return (x + 1) & ~1; +} + +} // anonymous namespace + +template <typename Simd4f> +cloth::SwSelfCollision<Simd4f>::SwSelfCollision(cloth::SwClothData& clothData, cloth::SwKernelAllocator& alloc) +: mClothData(clothData), mAllocator(alloc) +{ + mCollisionDistance = simd4f(mClothData.mSelfCollisionDistance); + mCollisionSquareDistance = mCollisionDistance * mCollisionDistance; + mStiffness = sMaskXYZ & static_cast<Simd4f>(simd4f(mClothData.mSelfCollisionStiffness)); +} + +template <typename Simd4f> +cloth::SwSelfCollision<Simd4f>::~SwSelfCollision() +{ +} + +template <typename Simd4f> +void cloth::SwSelfCollision<Simd4f>::operator()() +{ + mNumTests = mNumCollisions = 0; + + if(!isSelfCollisionEnabled(mClothData)) + return; + + Simd4f lowerBound = load(mClothData.mCurBounds); + Simd4f edgeLength = max(load(mClothData.mCurBounds + 3) - lowerBound, gSimd4fEpsilon); + + // sweep along longest axis + uint32_t sweepAxis = longestAxis(edgeLength); + uint32_t hashAxis0 = (sweepAxis + 1) % 3; + uint32_t hashAxis1 = (sweepAxis + 2) % 3; + + // reserve 0, 127, and 65535 for sentinel + Simd4f cellSize = max(mCollisionDistance, simd4f(1.0f / 253) * edgeLength); + array(cellSize)[sweepAxis] = array(edgeLength)[sweepAxis] / 65533; + + Simd4f one = gSimd4fOne; + Simd4f gridSize = simd4f(254.0f); + array(gridSize)[sweepAxis] = 65534.0f; + + Simd4f gridScale = recip<1>(cellSize); + Simd4f gridBias = -lowerBound * gridScale + one; + + uint32_t numIndices = mClothData.mNumSelfCollisionIndices; + void* buffer = mAllocator.allocate(getBufferSize(numIndices)); + + const uint32_t* __restrict indices = mClothData.mSelfCollisionIndices; 
+ uint32_t* __restrict keys = reinterpret_cast<uint32_t*>(buffer); + uint16_t* __restrict sortedIndices = reinterpret_cast<uint16_t*>(keys + numIndices); + uint32_t* __restrict sortedKeys = reinterpret_cast<uint32_t*>(sortedIndices + align2(numIndices)); + + const Simd4f* particles = reinterpret_cast<const Simd4f*>(mClothData.mCurParticles); + + // create keys + for(uint32_t i = 0; i < numIndices; ++i) + { + uint32_t index = indices ? indices[i] : i; + + // grid coordinate + Simd4f keyf = particles[index] * gridScale + gridBias; + + // need to clamp index because shape collision potentially + // pushes particles outside of their original bounds + Simd4i keyi = intFloor(max(one, min(keyf, gridSize))); + + const int32_t* ptr = array(keyi); + keys[i] = uint32_t(ptr[sweepAxis] | (ptr[hashAxis0] << 16) | (ptr[hashAxis1] << 24)); + } + + // compute sorted keys indices + radixSort(keys, keys + numIndices, sortedIndices); + + // snoop histogram: offset of first index with 8 msb > 1 (0 is sentinel) + uint16_t firstColumnSize = sortedIndices[2 * numIndices + 769]; + + // sort keys + for(uint32_t i = 0; i < numIndices; ++i) + sortedKeys[i] = keys[sortedIndices[i]]; + sortedKeys[numIndices] = uint32_t(-1); // sentinel + + if(indices) + { + // sort indices (into no-longer-needed keys array) + const uint16_t* __restrict permutation = sortedIndices; + sortedIndices = reinterpret_cast<uint16_t*>(keys); + for(uint32_t i = 0; i < numIndices; ++i) + sortedIndices[i] = uint16_t(indices[permutation[i]]); + } + + // calculate the number of buckets we need to search forward + const Simd4i data = intFloor(gridScale * mCollisionDistance); + uint32_t collisionDistance = 2 + static_cast<uint32_t>(array(data)[sweepAxis]); + + // collide particles + if(mClothData.mRestPositions) + collideParticles<true>(sortedKeys, firstColumnSize, sortedIndices, collisionDistance); + else + collideParticles<false>(sortedKeys, firstColumnSize, sortedIndices, collisionDistance); + + 
mAllocator.deallocate(buffer); + + // verify against brute force (disable collision response when testing) + /* + uint32_t numCollisions = mNumCollisions; + mNumCollisions = 0; + + Simd4f* qarticles = reinterpret_cast< + Simd4f*>(mClothData.mCurParticles); + for(uint32_t i = 0; i < numIndices; ++i) + { + uint32_t indexI = indices ? indices[i] : i; + for(uint32_t j = i+1; j < numIndices; ++j) + { + uint32_t indexJ = indices ? indices[j] : j; + collideParticles(qarticles[indexI], qarticles[indexJ]); + } + } + + static uint32_t iter = 0; ++iter; + if(numCollisions != mNumCollisions) + printf("%u: %u != %u\n", iter, numCollisions, mNumCollisions); + */ +} + +template <typename Simd4f> +size_t cloth::SwSelfCollision<Simd4f>::estimateTemporaryMemory(const SwCloth& cloth) +{ + uint32_t numIndices = + cloth.mSelfCollisionIndices.empty() ? cloth.mCurParticles.size() : cloth.mSelfCollisionIndices.size(); + return isSelfCollisionEnabled(cloth) ? getBufferSize(numIndices) : 0; +} + +template <typename Simd4f> +size_t physx::cloth::SwSelfCollision<Simd4f>::getBufferSize(uint32_t numIndices) +{ + uint32_t keysSize = numIndices * sizeof(uint32_t); + uint32_t indicesSize = align2(numIndices) * sizeof(uint16_t); + uint32_t radixSize = (numIndices + 1024) * sizeof(uint16_t); + return keysSize + indicesSize + PxMax(radixSize, keysSize + uint32_t(sizeof(uint32_t))); +} + +template <typename Simd4f> +template <bool useRestParticles> +void cloth::SwSelfCollision<Simd4f>::collideParticles(Simd4f& pos0, Simd4f& pos1, const Simd4f& pos0rest, + const Simd4f& pos1rest) +{ + Simd4f diff = pos1 - pos0; + Simd4f distSqr = dot3(diff, diff); + +#if PX_DEBUG + ++mNumTests; +#endif + + if(allGreater(distSqr, mCollisionSquareDistance)) + return; + + if(useRestParticles) + { + // calculate distance in rest configuration, if less than collision + // distance then ignore collision between particles in deformed config + Simd4f restDiff = pos1rest - pos0rest; + Simd4f restDistSqr = dot3(restDiff, 
restDiff); + + if(allGreater(mCollisionSquareDistance, restDistSqr)) + return; + } + + Simd4f w0 = splat<3>(pos0); + Simd4f w1 = splat<3>(pos1); + + Simd4f ratio = mCollisionDistance * rsqrt(distSqr); + Simd4f scale = mStiffness * recip(gSimd4fEpsilon + w0 + w1); + Simd4f delta = (scale * (diff - diff * ratio)) & sMaskXYZ; + + pos0 = pos0 + delta * w0; + pos1 = pos1 - delta * w1; + +#if PX_DEBUG || PX_PROFILE + ++mNumCollisions; +#endif +} + +template <typename Simd4f> +template <bool useRestParticles> +void cloth::SwSelfCollision<Simd4f>::collideParticles(const uint32_t* keys, uint16_t firstColumnSize, + const uint16_t* indices, uint32_t collisionDistance) +{ + Simd4f* __restrict particles = reinterpret_cast<Simd4f*>(mClothData.mCurParticles); + Simd4f* __restrict restParticles = + useRestParticles ? reinterpret_cast<Simd4f*>(mClothData.mRestPositions) : particles; + + const uint32_t bucketMask = uint16_t(-1); + + const uint32_t keyOffsets[] = { 0, 0x00010000, 0x00ff0000, 0x01000000, 0x01010000 }; + + const uint32_t* __restrict kFirst[5]; + const uint32_t* __restrict kLast[5]; + + { + // optimization: scan forward iterator starting points once instead of 9 times + const uint32_t* __restrict kIt = keys; + + uint32_t key = *kIt; + uint32_t firstKey = key - PxMin(collisionDistance, key & bucketMask); + uint32_t lastKey = PxMin(key + collisionDistance, key | bucketMask); + + kFirst[0] = kIt; + while(*kIt < lastKey) + ++kIt; + kLast[0] = kIt; + + for(uint32_t k = 1; k < 5; ++k) + { + for(uint32_t n = firstKey + keyOffsets[k]; *kIt < n;) + ++kIt; + kFirst[k] = kIt; + + for(uint32_t n = lastKey + keyOffsets[k]; *kIt < n;) + ++kIt; + kLast[k] = kIt; + + // jump forward once to second column + kIt = keys + firstColumnSize; + firstColumnSize = 0; + } + } + + const uint16_t* __restrict iIt = indices; + const uint16_t* __restrict iEnd = indices + mClothData.mNumSelfCollisionIndices; + + const uint16_t* __restrict jIt; + const uint16_t* __restrict jEnd; + + for(; iIt != iEnd; 
++iIt, ++kFirst[0]) + { + PX_ASSERT(*iIt < mClothData.mNumParticles); + + // load current particle once outside of inner loop + Simd4f particle = particles[*iIt]; + Simd4f restParticle = restParticles[*iIt]; + + uint32_t key = *kFirst[0]; + + // range of keys we need to check against for this particle + uint32_t firstKey = key - PxMin(collisionDistance, key & bucketMask); + uint32_t lastKey = PxMin(key + collisionDistance, key | bucketMask); + + // scan forward end point + while(*kLast[0] < lastKey) + ++kLast[0]; + + // process potential colliders of same cell + jEnd = indices + (kLast[0] - keys); + for(jIt = iIt + 1; jIt != jEnd; ++jIt) + collideParticles<useRestParticles>(particle, particles[*jIt], restParticle, restParticles[*jIt]); + + // process neighbor cells + for(uint32_t k = 1; k < 5; ++k) + { + // scan forward start point + for(uint32_t n = firstKey + keyOffsets[k]; *kFirst[k] < n;) + ++kFirst[k]; + + // scan forward end point + for(uint32_t n = lastKey + keyOffsets[k]; *kLast[k] < n;) + ++kLast[k]; + + // process potential colliders + jEnd = indices + (kLast[k] - keys); + for(jIt = indices + (kFirst[k] - keys); jIt != jEnd; ++jIt) + collideParticles<useRestParticles>(particle, particles[*jIt], restParticle, restParticles[*jIt]); + } + + // store current particle + particles[*iIt] = particle; + } +} + +// explicit template instantiation +#if NV_SIMD_SIMD +template class cloth::SwSelfCollision<Simd4f>; +#endif +#if NV_SIMD_SCALAR +template class cloth::SwSelfCollision<Scalar4f>; +#endif diff --git a/PhysX_3.4/Source/LowLevelCloth/src/SwSelfCollision.h b/PhysX_3.4/Source/LowLevelCloth/src/SwSelfCollision.h new file mode 100644 index 00000000..eabeb1ee --- /dev/null +++ b/PhysX_3.4/Source/LowLevelCloth/src/SwSelfCollision.h @@ -0,0 +1,83 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. 
+// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 
+ +#pragma once + +#include "Types.h" +#include "StackAllocator.h" +#include "Simd.h" + +namespace physx +{ +namespace cloth +{ + +class SwCloth; +struct SwClothData; + +typedef StackAllocator<16> SwKernelAllocator; + +template <typename Simd4f> +class SwSelfCollision +{ + typedef typename Simd4fToSimd4i<Simd4f>::Type Simd4i; + + public: + SwSelfCollision(SwClothData& clothData, SwKernelAllocator& alloc); + ~SwSelfCollision(); + + void operator()(); + + static size_t estimateTemporaryMemory(const SwCloth&); + + private: + SwSelfCollision& operator=(const SwSelfCollision&); // not implemented + static size_t getBufferSize(uint32_t); + + template <bool useRestParticles> + void collideParticles(Simd4f&, Simd4f&, const Simd4f&, const Simd4f&); + + template <bool useRestParticles> + void collideParticles(const uint32_t*, uint16_t, const uint16_t*, uint32_t); + + Simd4f mCollisionDistance; + Simd4f mCollisionSquareDistance; + Simd4f mStiffness; + + SwClothData& mClothData; + SwKernelAllocator& mAllocator; + + public: + mutable uint32_t mNumTests; + mutable uint32_t mNumCollisions; +}; + +} // namespace cloth + +} // namespace physx diff --git a/PhysX_3.4/Source/LowLevelCloth/src/SwSolver.cpp b/PhysX_3.4/Source/LowLevelCloth/src/SwSolver.cpp new file mode 100644 index 00000000..65a4b6c6 --- /dev/null +++ b/PhysX_3.4/Source/LowLevelCloth/src/SwSolver.cpp @@ -0,0 +1,294 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. 
+// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 
+ +#include "foundation/PxProfiler.h" +#include "SwSolver.h" +#include "SwCloth.h" +#include "ClothImpl.h" +#include "SwFabric.h" +#include "SwFactory.h" +#include "SwClothData.h" +#include "SwSolverKernel.h" +#include "SwInterCollision.h" +#include "PsFPU.h" +#include "PsFoundation.h" +#include "PsSort.h" + +namespace physx +{ +namespace cloth +{ +bool neonSolverKernel(SwCloth const&, SwClothData&, SwKernelAllocator&, IterationStateFactory&); +} +} + +using namespace physx; + +#if NV_SIMD_SIMD +typedef Simd4f Simd4fType; +#else +typedef Scalar4f Simd4fType; +#endif + +cloth::SwSolver::SwSolver(physx::PxTaskManager* taskMgr) +: mInterCollisionDistance(0.0f) +, mInterCollisionStiffness(1.0f) +, mInterCollisionIterations(1) +, mInterCollisionScratchMem(NULL) +, mInterCollisionScratchMemSize(0) +{ + mStartSimulationTask.mSolver = this; + mEndSimulationTask.mSolver = this; + + PX_UNUSED(taskMgr); +} + +cloth::SwSolver::~SwSolver() +{ + if(mInterCollisionScratchMem) + PX_FREE(mInterCollisionScratchMem); + + PX_ASSERT(mCpuClothSimulationTasks.empty()); +} + +namespace +{ +template <typename T> +bool clothSizeGreater(const T& t0, const T& t1) +{ + return t0.mCloth->mCurParticles.size() > t1.mCloth->mCurParticles.size(); +} + +template <typename T> +void sortTasks(shdfnd::Array<T, physx::shdfnd::NonTrackingAllocator>& tasks) +{ + shdfnd::sort(tasks.begin(), tasks.size(), &clothSizeGreater<T>); +} +} + +void cloth::SwSolver::addCloth(Cloth* cloth) +{ + SwCloth& swCloth = static_cast<SwClothImpl&>(*cloth).mCloth; + + mCpuClothSimulationTasks.pushBack(CpuClothSimulationTask(swCloth, mEndSimulationTask)); + + sortTasks(mCpuClothSimulationTasks); +} + +void cloth::SwSolver::removeCloth(Cloth* cloth) +{ + SwCloth& swCloth = static_cast<SwClothImpl&>(*cloth).mCloth; + + CpuClothSimulationTaskVector::Iterator tIt = mCpuClothSimulationTasks.begin(); + CpuClothSimulationTaskVector::Iterator tEnd = mCpuClothSimulationTasks.end(); + while(tIt != tEnd && tIt->mCloth != &swCloth) + 
++tIt; + + if(tIt != tEnd) + { + deallocate(tIt->mScratchMemory); + mCpuClothSimulationTasks.replaceWithLast(tIt); + sortTasks(mCpuClothSimulationTasks); + } +} + +physx::PxBaseTask& cloth::SwSolver::simulate(float dt, physx::PxBaseTask& continuation) +{ + if(mCpuClothSimulationTasks.empty()) + { + continuation.addReference(); + return continuation; + } + + mEndSimulationTask.setContinuation(&continuation); + mEndSimulationTask.mDt = dt; + + mStartSimulationTask.setContinuation(&mEndSimulationTask); + + mEndSimulationTask.removeReference(); + + return mStartSimulationTask; +} + +void cloth::SwSolver::interCollision() +{ + if(!mInterCollisionIterations || mInterCollisionDistance == 0.0f) + return; + + float elasticity = 1.0f; + + // rebuild cloth instance array + mInterCollisionInstances.resize(0); + for(uint32_t i = 0; i < mCpuClothSimulationTasks.size(); ++i) + { + SwCloth* c = mCpuClothSimulationTasks[i].mCloth; + float invNumIterations = mCpuClothSimulationTasks[i].mInvNumIterations; + + mInterCollisionInstances.pushBack(SwInterCollisionData( + c->mCurParticles.begin(), c->mPrevParticles.begin(), + c->mSelfCollisionIndices.empty() ? c->mCurParticles.size() : c->mSelfCollisionIndices.size(), + c->mSelfCollisionIndices.empty() ? 
NULL : &c->mSelfCollisionIndices[0], c->mTargetMotion, + c->mParticleBoundsCenter, c->mParticleBoundsHalfExtent, elasticity * invNumIterations, c->mUserData)); + } + + const uint32_t requiredTempMemorySize = uint32_t(SwInterCollision<Simd4fType>::estimateTemporaryMemory( + &mInterCollisionInstances[0], mInterCollisionInstances.size())); + + // realloc temp memory if necessary + if(mInterCollisionScratchMemSize < requiredTempMemorySize) + { + if(mInterCollisionScratchMem) + PX_FREE(mInterCollisionScratchMem); + + mInterCollisionScratchMem = PX_ALLOC(requiredTempMemorySize, "cloth::SwSolver::mInterCollisionScratchMem"); + mInterCollisionScratchMemSize = requiredTempMemorySize; + } + + SwKernelAllocator allocator(mInterCollisionScratchMem, mInterCollisionScratchMemSize); + + // run inter-collision + SwInterCollision<Simd4fType> collider(mInterCollisionInstances.begin(), mInterCollisionInstances.size(), + mInterCollisionDistance, mInterCollisionStiffness, mInterCollisionIterations, + mInterCollisionFilter, allocator); + + collider(); +} + +void cloth::SwSolver::beginFrame() const +{ + PX_PROFILE_START_CROSSTHREAD("cloth::SwSolver::simulate", 0); +} + +void cloth::SwSolver::endFrame() const +{ + PX_PROFILE_STOP_CROSSTHREAD("cloth::SwSolver::simulate", 0); +} + +void cloth::SwSolver::StartSimulationTask::runInternal() +{ + mSolver->beginFrame(); + + CpuClothSimulationTaskVector::Iterator tIt = mSolver->mCpuClothSimulationTasks.begin(); + CpuClothSimulationTaskVector::Iterator tEnd = mSolver->mCpuClothSimulationTasks.end(); + + for(; tIt != tEnd; ++tIt) + { + if(!tIt->mCloth->isSleeping()) + { + tIt->setContinuation(mCont); + tIt->removeReference(); + } + } +} + +const char* cloth::SwSolver::StartSimulationTask::getName() const +{ + return "cloth.SwSolver.startSimulation"; +} + +void cloth::SwSolver::EndSimulationTask::runInternal() +{ + mSolver->interCollision(); + mSolver->endFrame(); +} + +const char* cloth::SwSolver::EndSimulationTask::getName() const +{ + return 
"cloth.SwSolver.endSimulation"; +} + +cloth::SwSolver::CpuClothSimulationTask::CpuClothSimulationTask(SwCloth& cloth, EndSimulationTask& continuation) +: mCloth(&cloth), mContinuation(&continuation), mScratchMemorySize(0), mScratchMemory(0), mInvNumIterations(0.0f) +{ +} + +void cloth::SwSolver::CpuClothSimulationTask::runInternal() +{ + // check if we need to reallocate the temp memory buffer + // (number of shapes may have changed) + uint32_t requiredTempMemorySize = uint32_t(SwSolverKernel<Simd4fType>::estimateTemporaryMemory(*mCloth)); + + if(mScratchMemorySize < requiredTempMemorySize) + { + deallocate(mScratchMemory); + + mScratchMemory = allocate(requiredTempMemorySize); + mScratchMemorySize = requiredTempMemorySize; + } + + if(mContinuation->mDt == 0.0f) + return; + + IterationStateFactory factory(*mCloth, mContinuation->mDt); + mInvNumIterations = factory.mInvNumIterations; + + shdfnd::SIMDGuard simdGuard; + + SwClothData data(*mCloth, mCloth->mFabric); + SwKernelAllocator allocator(mScratchMemory, uint32_t(mScratchMemorySize)); + +// construct kernel functor and execute +#if PX_ANDROID +// if(!neonSolverKernel(cloth, data, allocator, factory)) +#endif + SwSolverKernel<Simd4fType>(*mCloth, data, allocator, factory)(); + + data.reconcile(*mCloth); // update cloth +} + +const char* cloth::SwSolver::CpuClothSimulationTask::getName() const +{ + return "cloth.SwSolver.cpuClothSimulation"; +} + +void cloth::SwSolver::CpuClothSimulationTask::release() +{ + mCloth->mMotionConstraints.pop(); + mCloth->mSeparationConstraints.pop(); + + if(!mCloth->mTargetCollisionSpheres.empty()) + { + swap(mCloth->mStartCollisionSpheres, mCloth->mTargetCollisionSpheres); + mCloth->mTargetCollisionSpheres.resize(0); + } + + if(!mCloth->mTargetCollisionPlanes.empty()) + { + swap(mCloth->mStartCollisionPlanes, mCloth->mTargetCollisionPlanes); + mCloth->mTargetCollisionPlanes.resize(0); + } + + if(!mCloth->mTargetCollisionTriangles.empty()) + { + swap(mCloth->mStartCollisionTriangles, 
mCloth->mTargetCollisionTriangles); + mCloth->mTargetCollisionTriangles.resize(0); + } + + mContinuation->removeReference(); +} diff --git a/PhysX_3.4/Source/LowLevelCloth/src/SwSolver.h b/PhysX_3.4/Source/LowLevelCloth/src/SwSolver.h new file mode 100644 index 00000000..5e1fe975 --- /dev/null +++ b/PhysX_3.4/Source/LowLevelCloth/src/SwSolver.h @@ -0,0 +1,153 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. 
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#pragma once + +#include "Solver.h" +#include "Allocator.h" +#include "SwInterCollision.h" +#include "CmTask.h" + +namespace physx +{ + +namespace cloth +{ + +class SwCloth; +class SwFactory; + +/// CPU/SSE based cloth solver +class SwSolver : public UserAllocated, public Solver +{ + struct StartSimulationTask : public Cm::Task + { + using physx::PxLightCpuTask::mRefCount; + using physx::PxLightCpuTask::mTm; + + virtual void runInternal(); + virtual const char* getName() const; + SwSolver* mSolver; + }; + + struct EndSimulationTask : public Cm::Task + { + using physx::PxLightCpuTask::mRefCount; + + virtual void runInternal(); + virtual const char* getName() const; + SwSolver* mSolver; + float mDt; + }; + + struct CpuClothSimulationTask : public Cm::Task + { + CpuClothSimulationTask(SwCloth&, EndSimulationTask&); + virtual void runInternal(); + virtual const char* getName() const; + virtual void release(); + + SwCloth* mCloth; + EndSimulationTask* mContinuation; + uint32_t mScratchMemorySize; + void* mScratchMemory; + float mInvNumIterations; + }; + + public: + SwSolver(physx::PxTaskManager*); + virtual ~SwSolver(); + + virtual void addCloth(Cloth*); + virtual void removeCloth(Cloth*); + + virtual physx::PxBaseTask& simulate(float dt, physx::PxBaseTask&); + + virtual void setInterCollisionDistance(float distance) + { + mInterCollisionDistance = distance; + } + virtual float getInterCollisionDistance() const + { + return mInterCollisionDistance; + } + + virtual void setInterCollisionStiffness(float stiffness) + { + mInterCollisionStiffness = stiffness; + } + virtual float getInterCollisionStiffness() const + { + return mInterCollisionStiffness; + } + + virtual void setInterCollisionNbIterations(uint32_t nbIterations) + { + mInterCollisionIterations = nbIterations; + } + virtual uint32_t getInterCollisionNbIterations() const + { + 
return mInterCollisionIterations; + } + + virtual void setInterCollisionFilter(InterCollisionFilter filter) + { + mInterCollisionFilter = filter; + } + + virtual bool hasError() const + { + return false; + } + + private: + void beginFrame() const; + void endFrame() const; + + void interCollision(); + + private: + StartSimulationTask mStartSimulationTask; + + typedef Vector<CpuClothSimulationTask>::Type CpuClothSimulationTaskVector; + CpuClothSimulationTaskVector mCpuClothSimulationTasks; + + EndSimulationTask mEndSimulationTask; + + float mInterCollisionDistance; + float mInterCollisionStiffness; + uint32_t mInterCollisionIterations; + InterCollisionFilter mInterCollisionFilter; + + void* mInterCollisionScratchMem; + uint32_t mInterCollisionScratchMemSize; + shdfnd::Array<SwInterCollisionData> mInterCollisionInstances; +}; +} +} diff --git a/PhysX_3.4/Source/LowLevelCloth/src/SwSolverKernel.cpp b/PhysX_3.4/Source/LowLevelCloth/src/SwSolverKernel.cpp new file mode 100644 index 00000000..bf5d86a1 --- /dev/null +++ b/PhysX_3.4/Source/LowLevelCloth/src/SwSolverKernel.cpp @@ -0,0 +1,781 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. 
+// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#include "foundation/PxProfiler.h" +#include "SwSolverKernel.h" +#include "SwCloth.h" +#include "SwClothData.h" +#include "SwFabric.h" +#include "SwFactory.h" +#include "PointInterpolator.h" +#include "BoundingBox.h" + +#define PX_AVX (NV_SIMD_SIMD&&(PX_WIN32 || PX_WIN64) && PX_VC >= 10) + +#if PX_AVX +namespace avx +{ +// defined in SwSolveConstraints.cpp + +void initialize(); + +template <bool, uint32_t> +void solveConstraints(float* __restrict, const float* __restrict, const float* __restrict, const uint16_t* __restrict, + const __m128&); +} + +namespace +{ +uint32_t getAvxSupport() +{ +// Checking for AVX requires 3 things: +// 1) CPUID indicates that the OS uses XSAVE and XRSTORE +// 2) CPUID indicates support for AVX +// 3) XGETBV indicates registers are saved and restored on context switch + +#if _MSC_FULL_VER < 160040219 || !defined(_XCR_XFEATURE_ENABLED_MASK) + // need at least VC10 SP1 and compile on at least Win7 SP1 + return 0; +#else + int cpuInfo[4]; + __cpuid(cpuInfo, 1); + int avxFlags = 3 << 27; // checking 1) and 2) above + if((cpuInfo[2] & 
avxFlags) != avxFlags) + return 0; // xgetbv not enabled or no AVX support + + if((_xgetbv(_XCR_XFEATURE_ENABLED_MASK) & 0x6) != 0x6) + return 0; // OS does not save YMM registers + + avx::initialize(); + +#if _MSC_VER < 1700 + return 1; +#else + int fmaFlags = 1 << 12; + if((cpuInfo[2] & fmaFlags) != fmaFlags) + return 1; // no FMA3 support + + /* only using fma at the moment, don't lock out AMD's piledriver by requiring avx2 + __cpuid(cpuInfo, 7); + int avx2Flags = 1 << 5; + if((cpuInfo[1] & avx2Flags) != avx2Flags) + return 1; // no AVX2 support + */ + + return 2; +#endif // _MSC_VER +#endif // _MSC_FULL_VER +} + +const uint32_t sAvxSupport = getAvxSupport(); // 0: no AVX, 1: AVX, 2: AVX+FMA +} +#endif + +using namespace physx; + +namespace +{ +/* simd constants */ + +const Simd4fTupleFactory sMaskW = simd4f(simd4i(0, 0, 0, ~0)); +const Simd4fTupleFactory sMaskXY = simd4f(simd4i(~0, ~0, 0, 0)); +const Simd4fTupleFactory sMaskXYZ = simd4f(simd4i(~0, ~0, ~0, 0)); +const Simd4fTupleFactory sMaskYZW = simd4f(simd4i(0, ~0, ~0, ~0)); +const Simd4fTupleFactory sMinusOneXYZOneW = simd4f(-1.0f, -1.0f, -1.0f, 1.0f); +const Simd4fTupleFactory sFloatMaxW = simd4f(0.0f, 0.0f, 0.0f, FLT_MAX); +const Simd4fTupleFactory sMinusFloatMaxXYZ = simd4f(-FLT_MAX, -FLT_MAX, -FLT_MAX, 0.0f); + +/* static worker functions */ + +/** + This function performs explicit Euler integration based on position, where + x_next = x_cur + (x_cur - x_prev) * dt_cur/dt_prev * damping + g * dt * dt + The g * dt * dt term is folded into accelIt. 
+ */ + +template <typename Simd4f, typename AccelerationIterator> +void integrateParticles(Simd4f* __restrict curIt, Simd4f* __restrict curEnd, Simd4f* __restrict prevIt, + const Simd4f& scale, const AccelerationIterator& aIt, const Simd4f& prevBias) +{ + // local copy to avoid LHS + AccelerationIterator accelIt(aIt); + + for(; curIt != curEnd; ++curIt, ++prevIt, ++accelIt) + { + Simd4f current = *curIt; + Simd4f previous = *prevIt; + // if(current.w == 0) current.w = previous.w + current = select(current > sMinusFloatMaxXYZ, current, previous); + Simd4f finiteMass = splat<3>(previous) > sFloatMaxW; + Simd4f delta = (current - previous) * scale + *accelIt; + *curIt = current + (delta & finiteMass); + *prevIt = select(sMaskW, previous, current) + (prevBias & finiteMass); + } +} + +template <typename Simd4f, typename AccelerationIterator> +void integrateParticles(Simd4f* __restrict curIt, Simd4f* __restrict curEnd, Simd4f* __restrict prevIt, + const Simd4f (&prevMatrix)[3], const Simd4f (&curMatrix)[3], const AccelerationIterator& aIt, + const Simd4f& prevBias) +{ + // local copy to avoid LHS + AccelerationIterator accelIt(aIt); + + for(; curIt != curEnd; ++curIt, ++prevIt, ++accelIt) + { + Simd4f current = *curIt; + Simd4f previous = *prevIt; + // if(current.w == 0) current.w = previous.w + current = select(current > sMinusFloatMaxXYZ, current, previous); + Simd4f finiteMass = splat<3>(previous) > sFloatMaxW; + // curMatrix*current + prevMatrix*previous + accel + Simd4f delta = cloth::transform(curMatrix, cloth::transform(prevMatrix, *accelIt, previous), current); + *curIt = current + (delta & finiteMass); + *prevIt = select(sMaskW, previous, current) + (prevBias & finiteMass); + } +} + +template <typename Simd4f, typename ConstraintIterator> +void constrainMotion(Simd4f* __restrict curIt, const Simd4f* __restrict curEnd, const ConstraintIterator& spheres, + const Simd4f& scaleBiasStiffness) +{ + Simd4f scale = splat<0>(scaleBiasStiffness); + Simd4f bias = 
splat<1>(scaleBiasStiffness); + Simd4f stiffness = splat<3>(scaleBiasStiffness); + + // local copy of iterator to maintain alignment + ConstraintIterator sphIt = spheres; + + for(; curIt < curEnd; curIt += 4) + { + // todo: use msub where available + Simd4f curPos0 = curIt[0]; + Simd4f curPos1 = curIt[1]; + Simd4f curPos2 = curIt[2]; + Simd4f curPos3 = curIt[3]; + + Simd4f delta0 = *sphIt - (sMaskXYZ & curPos0); + ++sphIt; + Simd4f delta1 = *sphIt - (sMaskXYZ & curPos1); + ++sphIt; + Simd4f delta2 = *sphIt - (sMaskXYZ & curPos2); + ++sphIt; + Simd4f delta3 = *sphIt - (sMaskXYZ & curPos3); + ++sphIt; + + Simd4f deltaX = delta0, deltaY = delta1, deltaZ = delta2, deltaW = delta3; + transpose(deltaX, deltaY, deltaZ, deltaW); + + Simd4f sqrLength = gSimd4fEpsilon + deltaX * deltaX + deltaY * deltaY + deltaZ * deltaZ; + Simd4f radius = max(gSimd4fZero, deltaW * scale + bias); + + Simd4f slack = gSimd4fOne - radius * rsqrt(sqrLength); + + // if slack <= 0.0f then we don't want to affect particle + // and can skip if all particles are unaffected + Simd4f isPositive; + if(anyGreater(slack, gSimd4fZero, isPositive)) + { + // set invMass to zero if radius is zero + curPos0 = curPos0 & (splat<0>(radius) > sMinusFloatMaxXYZ); + curPos1 = curPos1 & (splat<1>(radius) > sMinusFloatMaxXYZ); + curPos2 = curPos2 & (splat<2>(radius) > sMinusFloatMaxXYZ); + curPos3 = curPos3 & ((radius) > sMinusFloatMaxXYZ); + + slack = slack * stiffness & isPositive; + + curIt[0] = curPos0 + (delta0 & sMaskXYZ) * splat<0>(slack); + curIt[1] = curPos1 + (delta1 & sMaskXYZ) * splat<1>(slack); + curIt[2] = curPos2 + (delta2 & sMaskXYZ) * splat<2>(slack); + curIt[3] = curPos3 + (delta3 & sMaskXYZ) * splat<3>(slack); + } + } +} + +template <typename Simd4f, typename ConstraintIterator> +void constrainSeparation(Simd4f* __restrict curIt, const Simd4f* __restrict curEnd, const ConstraintIterator& spheres) +{ + // local copy of iterator to maintain alignment + ConstraintIterator sphIt = spheres; + + for(; 
curIt < curEnd; curIt += 4) + { + // todo: use msub where available + Simd4f curPos0 = curIt[0]; + Simd4f curPos1 = curIt[1]; + Simd4f curPos2 = curIt[2]; + Simd4f curPos3 = curIt[3]; + + Simd4f delta0 = *sphIt - (sMaskXYZ & curPos0); + ++sphIt; + Simd4f delta1 = *sphIt - (sMaskXYZ & curPos1); + ++sphIt; + Simd4f delta2 = *sphIt - (sMaskXYZ & curPos2); + ++sphIt; + Simd4f delta3 = *sphIt - (sMaskXYZ & curPos3); + ++sphIt; + + Simd4f deltaX = delta0, deltaY = delta1, deltaZ = delta2, deltaW = delta3; + transpose(deltaX, deltaY, deltaZ, deltaW); + + Simd4f sqrLength = gSimd4fEpsilon + deltaX * deltaX + deltaY * deltaY + deltaZ * deltaZ; + + Simd4f slack = gSimd4fOne - deltaW * rsqrt<1>(sqrLength); + + // if slack >= 0.0f then we don't want to affect particle + // and can skip if all particles are unaffected + Simd4f isNegative; + if(anyGreater(gSimd4fZero, slack, isNegative)) + { + slack = slack & isNegative; + + curIt[0] = curPos0 + (delta0 & sMaskXYZ) * splat<0>(slack); + curIt[1] = curPos1 + (delta1 & sMaskXYZ) * splat<1>(slack); + curIt[2] = curPos2 + (delta2 & sMaskXYZ) * splat<2>(slack); + curIt[3] = curPos3 + (delta3 & sMaskXYZ) * splat<3>(slack); + } + } +} + +/** + traditional gauss-seidel internal constraint solver + */ +template <bool useMultiplier, typename Simd4f> +void solveConstraints(float* __restrict posIt, const float* __restrict rIt, const float* __restrict rEnd, + const uint16_t* __restrict iIt, const Simd4f& stiffnessEtc) +{ + Simd4f stretchLimit, compressionLimit, multiplier; + if(useMultiplier) + { + stretchLimit = splat<3>(stiffnessEtc); + compressionLimit = splat<2>(stiffnessEtc); + multiplier = splat<1>(stiffnessEtc); + } + Simd4f stiffness = splat<0>(stiffnessEtc); + + for(; rIt != rEnd; rIt += 4, iIt += 8) + { + uint32_t p0i = iIt[0] * sizeof(PxVec4); + uint32_t p0j = iIt[1] * sizeof(PxVec4); + uint32_t p1i = iIt[2] * sizeof(PxVec4); + uint32_t p1j = iIt[3] * sizeof(PxVec4); + uint32_t p2i = iIt[4] * sizeof(PxVec4); + uint32_t p2j = iIt[5] 
* sizeof(PxVec4); + uint32_t p3i = iIt[6] * sizeof(PxVec4); + uint32_t p3j = iIt[7] * sizeof(PxVec4); + + Simd4f v0i = loadAligned(posIt, p0i); + Simd4f v0j = loadAligned(posIt, p0j); + Simd4f v1i = loadAligned(posIt, p1i); + Simd4f v1j = loadAligned(posIt, p1j); + Simd4f v2i = loadAligned(posIt, p2i); + Simd4f v2j = loadAligned(posIt, p2j); + Simd4f v3i = loadAligned(posIt, p3i); + Simd4f v3j = loadAligned(posIt, p3j); + + Simd4f h0ij = v0j + v0i * sMinusOneXYZOneW; + Simd4f h1ij = v1j + v1i * sMinusOneXYZOneW; + Simd4f h2ij = v2j + v2i * sMinusOneXYZOneW; + Simd4f h3ij = v3j + v3i * sMinusOneXYZOneW; + + Simd4f hxij = h0ij, hyij = h1ij, hzij = h2ij, vwij = h3ij; + transpose(hxij, hyij, hzij, vwij); + + Simd4f rij = loadAligned(rIt); + Simd4f e2ij = gSimd4fEpsilon + hxij * hxij + hyij * hyij + hzij * hzij; + Simd4f erij = (gSimd4fOne - rij * rsqrt(e2ij)) & (rij > gSimd4fEpsilon); + + if(useMultiplier) + { + erij = erij - multiplier * max(compressionLimit, min(erij, stretchLimit)); + } + Simd4f exij = erij * stiffness * recip(gSimd4fEpsilon + vwij); + + h0ij = h0ij * splat<0>(exij) & sMaskXYZ; + h1ij = h1ij * splat<1>(exij) & sMaskXYZ; + h2ij = h2ij * splat<2>(exij) & sMaskXYZ; + h3ij = h3ij * splat<3>(exij) & sMaskXYZ; + + storeAligned(posIt, p0i, v0i + h0ij * splat<3>(v0i)); + storeAligned(posIt, p0j, v0j - h0ij * splat<3>(v0j)); + storeAligned(posIt, p1i, v1i + h1ij * splat<3>(v1i)); + storeAligned(posIt, p1j, v1j - h1ij * splat<3>(v1j)); + storeAligned(posIt, p2i, v2i + h2ij * splat<3>(v2i)); + storeAligned(posIt, p2j, v2j - h2ij * splat<3>(v2j)); + storeAligned(posIt, p3i, v3i + h3ij * splat<3>(v3i)); + storeAligned(posIt, p3j, v3j - h3ij * splat<3>(v3j)); + } +} + +#if PX_WINDOWS +#include "sse2/SwSolveConstraints.h" +#endif + +// calculates upper bound of all position deltas +template <typename Simd4f> +Simd4f calculateMaxDelta(const Simd4f* prevIt, const Simd4f* curIt, const Simd4f* curEnd) +{ + Simd4f maxDelta = gSimd4fZero; + for(; curIt < curEnd; 
++curIt, ++prevIt) + maxDelta = max(maxDelta, abs(*curIt - *prevIt)); + + return maxDelta & sMaskXYZ; +} + +template <bool IsTurning, typename Simd4f> +void applyWind(Simd4f* __restrict curIt, const Simd4f* __restrict prevIt, const uint16_t* __restrict tIt, + const uint16_t* __restrict tEnd, Simd4f dragCoefficient, Simd4f liftCoefficient, Simd4f wind, + const Simd4f (&rotation)[3]) +{ + const Simd4f oneThird = simd4f(1 / 3.0f); + + for(; tIt < tEnd; tIt += 3) + { + uint16_t i0 = tIt[0]; + uint16_t i1 = tIt[1]; + uint16_t i2 = tIt[2]; + + Simd4f c0 = curIt[i0]; + Simd4f c1 = curIt[i1]; + Simd4f c2 = curIt[i2]; + + Simd4f p0 = prevIt[i0]; + Simd4f p1 = prevIt[i1]; + Simd4f p2 = prevIt[i2]; + + // use particle weights instead? + Simd4f current = (c0 + c1 + c2) * oneThird; + Simd4f previous = (p0 + p1 + p2) * oneThird; + + Simd4f delta = current - previous + wind; + + if(IsTurning) + { + // add rotation of frame + delta = cloth::transform(rotation, delta - current, current); + } + + Simd4f normal = cross3(c2 - c0, c1 - c0); + + Simd4f invSqrScale = dot3(delta, delta) * dot3(normal, normal); + Simd4f isZero = invSqrScale < gSimd4fEpsilon; + Simd4f scale = rsqrt(invSqrScale); + + Simd4f cosTheta = dot3(normal, delta) * scale; + Simd4f sinTheta = sqrt(max(gSimd4fZero, gSimd4fOne - cosTheta * cosTheta)); + + // orthogonal to delta, in delta-normal plane, same length as delta + Simd4f liftDir = cross3(cross3(delta, normal), delta * scale); + + // sin(theta) * cos(theta) = 0.5 * sin(2 * theta) + Simd4f lift = liftCoefficient * cosTheta * sinTheta * liftDir; + Simd4f drag = dragCoefficient * abs(cosTheta) * delta; + + Simd4f impulse = (lift + drag) & ~isZero; + + curIt[i0] = c0 - impulse * splat<3>(c0); + curIt[i1] = c1 - impulse * splat<3>(c1); + curIt[i2] = c2 - impulse * splat<3>(c2); + } +} + +} // anonymous namespace + +template <typename Simd4f> +cloth::SwSolverKernel<Simd4f>::SwSolverKernel(SwCloth const& cloth, SwClothData& clothData, + SwKernelAllocator& allocator, 
IterationStateFactory& factory) +: mCloth(cloth) +, mClothData(clothData) +, mAllocator(allocator) +, mCollision(clothData, allocator) +, mSelfCollision(clothData, allocator) +, mState(factory.create<Simd4f>(cloth)) +{ + mClothData.verify(); +} + +template <typename Simd4f> +void cloth::SwSolverKernel<Simd4f>::operator()() +{ + simulateCloth(); +} + +template <typename Simd4f> +size_t cloth::SwSolverKernel<Simd4f>::estimateTemporaryMemory(const SwCloth& cloth) +{ + size_t collisionTempMemory = SwCollision<Simd4f>::estimateTemporaryMemory(cloth); + size_t selfCollisionTempMemory = SwSelfCollision<Simd4f>::estimateTemporaryMemory(cloth); + + size_t tempMemory = PxMax(collisionTempMemory, selfCollisionTempMemory); + size_t persistentMemory = SwCollision<Simd4f>::estimatePersistentMemory(cloth); + + // account for any allocator overhead (this could be exposed in the allocator) + size_t maxAllocs = 32; + size_t maxPerAllocationOverhead = 32; + size_t maxAllocatorOverhead = maxAllocs * maxPerAllocationOverhead; + + return maxAllocatorOverhead + persistentMemory + tempMemory; +} + +template <typename Simd4f> +template <typename AccelerationIterator> +void cloth::SwSolverKernel<Simd4f>::integrateParticles(AccelerationIterator& accelIt, const Simd4f& prevBias) +{ + Simd4f* curIt = reinterpret_cast<Simd4f*>(mClothData.mCurParticles); + Simd4f* curEnd = curIt + mClothData.mNumParticles; + Simd4f* prevIt = reinterpret_cast<Simd4f*>(mClothData.mPrevParticles); + + if(!mState.mIsTurning) + ::integrateParticles(curIt, curEnd, prevIt, mState.mPrevMatrix[0], accelIt, prevBias); + else + ::integrateParticles(curIt, curEnd, prevIt, mState.mPrevMatrix, mState.mCurMatrix, accelIt, prevBias); +} + +template <typename Simd4f> +void cloth::SwSolverKernel<Simd4f>::integrateParticles() +{ + PX_PROFILE_ZONE("cloth::SwSolverKernel::integrateParticles", 0); + + const Simd4f* startAccelIt = reinterpret_cast<const Simd4f*>(mClothData.mParticleAccelerations); + + // dt^2 (todo: should this be the 
smoothed dt used for gravity?) + const Simd4f sqrIterDt = simd4f(sqr(mState.mIterDt)) & static_cast<Simd4f>(sMaskXYZ); + + if(!startAccelIt) + { + // no per-particle accelerations, use a constant + ConstantIterator<Simd4f> accelIt(mState.mCurBias); + integrateParticles(accelIt, mState.mPrevBias); + } + else + { + // iterator implicitly scales by dt^2 and adds gravity + ScaleBiasIterator<Simd4f, const Simd4f*> accelIt(startAccelIt, sqrIterDt, mState.mCurBias); + integrateParticles(accelIt, mState.mPrevBias); + } +} + +template <typename Simd4f> +void cloth::SwSolverKernel<Simd4f>::constrainTether() +{ + if(0.0f == mClothData.mTetherConstraintStiffness || !mClothData.mNumTethers) + return; + + PX_PROFILE_ZONE("cloth::SwSolverKernel::solveTethers", 0); + + uint32_t numParticles = mClothData.mNumParticles; + uint32_t numTethers = mClothData.mNumTethers; + PX_ASSERT(0 == numTethers % numParticles); + + float* __restrict curIt = mClothData.mCurParticles; + const float* __restrict curFirst = curIt; + const float* __restrict curEnd = curIt + 4 * numParticles; + + typedef const SwTether* __restrict TetherIter; + TetherIter tFirst = mClothData.mTethers; + TetherIter tEnd = tFirst + numTethers; + + Simd4f stiffness = + static_cast<Simd4f>(sMaskXYZ) & simd4f(numParticles * mClothData.mTetherConstraintStiffness / numTethers); + Simd4f scale = simd4f(mClothData.mTetherConstraintScale); + + for(; curIt != curEnd; curIt += 4, ++tFirst) + { + Simd4f position = loadAligned(curIt); + Simd4f offset = gSimd4fZero; + + for(TetherIter tIt = tFirst; tIt < tEnd; tIt += numParticles) + { + PX_ASSERT(tIt->mAnchor < numParticles); + Simd4f anchor = loadAligned(curFirst, tIt->mAnchor * sizeof(PxVec4)); + Simd4f delta = anchor - position; + Simd4f sqrLength = gSimd4fEpsilon + dot3(delta, delta); + + Simd4f tetherLength = load(&tIt->mLength); + tetherLength = splat<0>(tetherLength); + + Simd4f radius = tetherLength * scale; + Simd4f slack = gSimd4fOne - radius * rsqrt(sqrLength); + + offset = 
offset + delta * max(slack, gSimd4fZero); + } + + storeAligned(curIt, position + offset * stiffness); + } +} + +template <typename Simd4f> +void cloth::SwSolverKernel<Simd4f>::solveFabric() +{ + PX_PROFILE_ZONE("cloth::SwSolverKernel::solveFabric", 0); + + float* pIt = mClothData.mCurParticles; + + const PhaseConfig* cIt = mClothData.mConfigBegin; + const PhaseConfig* cEnd = mClothData.mConfigEnd; + + const uint32_t* pBegin = mClothData.mPhases; + const float* rBegin = mClothData.mRestvalues; + + const uint32_t* sBegin = mClothData.mSets; + const uint16_t* iBegin = mClothData.mIndices; + + uint32_t totalConstraints = 0; + + Simd4f stiffnessExponent = simd4f(mCloth.mStiffnessFrequency * mState.mIterDt); + + for(; cIt != cEnd; ++cIt) + { + const uint32_t* sIt = sBegin + pBegin[cIt->mPhaseIndex]; + const float* rIt = rBegin + sIt[0]; + const float* rEnd = rBegin + sIt[1]; + const uint16_t* iIt = iBegin + sIt[0] * 2; + + totalConstraints += uint32_t(rEnd - rIt); + + // (stiffness, multiplier, compressionLimit, stretchLimit) + Simd4f config = load(&cIt->mStiffness); + // stiffness specified as fraction of constraint error per-millisecond + Simd4f scaledConfig = gSimd4fOne - exp2(config * stiffnessExponent); + Simd4f stiffness = select(sMaskXY, scaledConfig, config); + + int neutralMultiplier = allEqual(sMaskYZW & stiffness, gSimd4fZero); + +#if PX_AVX + switch(sAvxSupport) + { + case 2: +#if _MSC_VER >= 1700 + neutralMultiplier ? avx::solveConstraints<false, 2>(pIt, rIt, rEnd, iIt, stiffness) + : avx::solveConstraints<true, 2>(pIt, rIt, rEnd, iIt, stiffness); + break; +#endif + case 1: + neutralMultiplier ? avx::solveConstraints<false, 1>(pIt, rIt, rEnd, iIt, stiffness) + : avx::solveConstraints<true, 1>(pIt, rIt, rEnd, iIt, stiffness); + break; + default: +#endif + neutralMultiplier ? 
solveConstraints<false>(pIt, rIt, rEnd, iIt, stiffness) + : solveConstraints<true>(pIt, rIt, rEnd, iIt, stiffness); +#if PX_AVX + break; + } +#endif + } +} + +template <typename Simd4f> +void cloth::SwSolverKernel<Simd4f>::applyWind() +{ + if(mClothData.mDragCoefficient == 0.0f && mClothData.mLiftCoefficient == 0.0f) + return; + + PX_PROFILE_ZONE("cloth::SwSolverKernel::applyWind", 0); + + Simd4f* curIt = reinterpret_cast<Simd4f*>(mClothData.mCurParticles); + Simd4f* prevIt = reinterpret_cast<Simd4f*>(mClothData.mPrevParticles); + + const uint16_t* tIt = mClothData.mTriangles; + const uint16_t* tEnd = tIt + 3 * mClothData.mNumTriangles; + + Simd4f dragCoefficient = simd4f(mClothData.mDragCoefficient); + Simd4f liftCoefficient = simd4f(mClothData.mLiftCoefficient); + + if(mState.mIsTurning) + { + ::applyWind<true>(curIt, prevIt, tIt, tEnd, dragCoefficient, liftCoefficient, mState.mWind, + mState.mRotationMatrix); + } + else + { + ::applyWind<false>(curIt, prevIt, tIt, tEnd, dragCoefficient, liftCoefficient, mState.mWind, + mState.mRotationMatrix); + } +} + +template <typename Simd4f> +void cloth::SwSolverKernel<Simd4f>::constrainMotion() +{ + if(!mClothData.mStartMotionConstraints) + return; + + PX_PROFILE_ZONE("cloth::SwSolverKernel::constrainMotion", 0); + + Simd4f* curIt = reinterpret_cast<Simd4f*>(mClothData.mCurParticles); + Simd4f* curEnd = curIt + mClothData.mNumParticles; + + const Simd4f* startIt = reinterpret_cast<const Simd4f*>(mClothData.mStartMotionConstraints); + const Simd4f* targetIt = reinterpret_cast<const Simd4f*>(mClothData.mTargetMotionConstraints); + + Simd4f scaleBias = load(&mCloth.mMotionConstraintScale); + Simd4f stiffness = simd4f(mClothData.mMotionConstraintStiffness); + Simd4f scaleBiasStiffness = select(sMaskXYZ, scaleBias, stiffness); + + if(!mClothData.mTargetMotionConstraints) + // no interpolation, use the start positions + return ::constrainMotion(curIt, curEnd, startIt, scaleBiasStiffness); + + if(mState.mRemainingIterations == 1) 
+ // use the target positions on last iteration + return ::constrainMotion(curIt, curEnd, targetIt, scaleBiasStiffness); + + // otherwise use an interpolating iterator + LerpIterator<Simd4f, const Simd4f*> interpolator(startIt, targetIt, mState.getCurrentAlpha()); + ::constrainMotion(curIt, curEnd, interpolator, scaleBiasStiffness); +} + +template <typename Simd4f> +void cloth::SwSolverKernel<Simd4f>::constrainSeparation() +{ + if(!mClothData.mStartSeparationConstraints) + return; + + PX_PROFILE_ZONE("cloth::SwSolverKernel::constrainSeparation", 0); + + Simd4f* curIt = reinterpret_cast<Simd4f*>(mClothData.mCurParticles); + Simd4f* curEnd = curIt + mClothData.mNumParticles; + + const Simd4f* startIt = reinterpret_cast<const Simd4f*>(mClothData.mStartSeparationConstraints); + const Simd4f* targetIt = reinterpret_cast<const Simd4f*>(mClothData.mTargetSeparationConstraints); + + if(!mClothData.mTargetSeparationConstraints) + // no interpolation, use the start positions + return ::constrainSeparation(curIt, curEnd, startIt); + + if(mState.mRemainingIterations == 1) + // use the target positions on last iteration + return ::constrainSeparation(curIt, curEnd, targetIt); + + // otherwise use an interpolating iterator + LerpIterator<Simd4f, const Simd4f*> interpolator(startIt, targetIt, mState.getCurrentAlpha()); + ::constrainSeparation(curIt, curEnd, interpolator); +} + +template <typename Simd4f> +void cloth::SwSolverKernel<Simd4f>::collideParticles() +{ + PX_PROFILE_ZONE("cloth::SwSolverKernel::collideParticles", 0); + + mCollision(mState); +} + +template <typename Simd4f> +void cloth::SwSolverKernel<Simd4f>::selfCollideParticles() +{ + PX_PROFILE_ZONE("cloth::SwSolverKernel::selfCollideParticles", 0); + + mSelfCollision(); +} + +template <typename Simd4f> +void cloth::SwSolverKernel<Simd4f>::updateSleepState() +{ + PX_PROFILE_ZONE("cloth::SwSolverKernel::updateSleepState", 0); + + mClothData.mSleepTestCounter += PxMax(1u, uint32_t(mState.mIterDt * 1000)); + 
if(mClothData.mSleepTestCounter >= mCloth.mSleepTestInterval) + { + const Simd4f* prevIt = reinterpret_cast<Simd4f*>(mClothData.mPrevParticles); + const Simd4f* curIt = reinterpret_cast<Simd4f*>(mClothData.mCurParticles); + const Simd4f* curEnd = curIt + mClothData.mNumParticles; + + // calculate max particle delta since last iteration + Simd4f maxDelta = calculateMaxDelta(prevIt, curIt, curEnd); + + ++mClothData.mSleepPassCounter; + Simd4f threshold = simd4f(mCloth.mSleepThreshold * mState.mIterDt); + if(anyGreaterEqual(maxDelta, threshold)) + mClothData.mSleepPassCounter = 0; + + mClothData.mSleepTestCounter -= mCloth.mSleepTestInterval; + } +} + +template <typename Simd4f> +void cloth::SwSolverKernel<Simd4f>::iterateCloth() +{ + // note on invMass (stored in current/previous positions.w): + // integrateParticles() + // - if(current.w == 0) current.w = previous.w + // constraintMotion() + // - if(constraint.radius <= 0) current.w = 0 + // computeBounds() + // - if(current.w > 0) current.w = previous.w + // collideParticles() + // - if(collides) current.w *= 1/massScale + // after simulate() + // - previous.w: original invMass as set by user + // - current.w: zeroed by motion constraints and mass-scaled by collision + + // integrate positions + integrateParticles(); + + // apply drag and lift + applyWind(); + + // motion constraints + constrainMotion(); + + // solve tether constraints + constrainTether(); + + // solve edge constraints + solveFabric(); + + // separation constraints + constrainSeparation(); + + // perform character collision + collideParticles(); + + // perform self collision + selfCollideParticles(); + + // test wake / sleep conditions + updateSleepState(); +} + +template <typename Simd4f> +void cloth::SwSolverKernel<Simd4f>::simulateCloth() +{ + while(mState.mRemainingIterations) + { + iterateCloth(); + mState.update(); + } +} + +// explicit template instantiation +#if NV_SIMD_SIMD +template class cloth::SwSolverKernel<Simd4f>; +#endif +#if 
NV_SIMD_SCALAR +template class cloth::SwSolverKernel<Scalar4f>; +#endif diff --git a/PhysX_3.4/Source/LowLevelCloth/src/SwSolverKernel.h b/PhysX_3.4/Source/LowLevelCloth/src/SwSolverKernel.h new file mode 100644 index 00000000..9ad546c0 --- /dev/null +++ b/PhysX_3.4/Source/LowLevelCloth/src/SwSolverKernel.h @@ -0,0 +1,84 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. 
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#pragma once + +#include "IterationState.h" +#include "SwCollision.h" +#include "SwSelfCollision.h" + +namespace physx +{ +namespace cloth +{ + +class SwCloth; +struct SwClothData; + +template <typename Simd4f> +class SwSolverKernel +{ + public: + SwSolverKernel(SwCloth const&, SwClothData&, SwKernelAllocator&, IterationStateFactory&); + + void operator()(); + + // returns a conservative estimate of the + // total memory requirements during a solve + static size_t estimateTemporaryMemory(const SwCloth& c); + + private: + void integrateParticles(); + void constrainTether(); + void solveFabric(); + void applyWind(); + void constrainMotion(); + void constrainSeparation(); + void collideParticles(); + void selfCollideParticles(); + void updateSleepState(); + + void iterateCloth(); + void simulateCloth(); + + SwCloth const& mCloth; + SwClothData& mClothData; + SwKernelAllocator& mAllocator; + + SwCollision<Simd4f> mCollision; + SwSelfCollision<Simd4f> mSelfCollision; + IterationState<Simd4f> mState; + + private: + SwSolverKernel<Simd4f>& operator=(const SwSolverKernel<Simd4f>&); + template <typename AccelerationIterator> + void integrateParticles(AccelerationIterator& accelIt, const Simd4f&); +}; +} +} diff --git a/PhysX_3.4/Source/LowLevelCloth/src/TripletScheduler.cpp b/PhysX_3.4/Source/LowLevelCloth/src/TripletScheduler.cpp new file mode 100644 index 00000000..ea062136 --- /dev/null +++ b/PhysX_3.4/Source/LowLevelCloth/src/TripletScheduler.cpp @@ -0,0 +1,246 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. 
Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 
+ +#include "foundation/PxMath.h" +#include "TripletScheduler.h" +#include "PsUtilities.h" +#include "PsFPU.h" + +using namespace physx; +using namespace shdfnd; + +cloth::TripletScheduler::TripletScheduler(Range<const uint32_t[4]> triplets) +: mTriplets(reinterpret_cast<const Vec4u*>(triplets.begin()), reinterpret_cast<const Vec4u*>(triplets.end())) +{ +} + +// SSE version +void cloth::TripletScheduler::simd(uint32_t numParticles, uint32_t simdWidth) +{ + if(mTriplets.empty()) + return; + + Vector<uint32_t>::Type mark(numParticles, uint32_t(-1)); + + uint32_t setIndex = 0, setSize = 0; + for(TripletIter tIt = mTriplets.begin(), tEnd = mTriplets.end(); tIt != tEnd; ++setIndex) + { + TripletIter tLast = tIt + PxMin(simdWidth, uint32_t(tEnd - tIt)); + TripletIter tSwap = tEnd; + + for(; tIt != tLast && tIt != tSwap; ++tIt, ++setSize) + { + // swap from tail until independent triplet found + while((mark[tIt->x] == setIndex || mark[tIt->y] == setIndex || mark[tIt->z] == setIndex) && tIt != --tSwap) + swap(*tIt, *tSwap); + + if(tIt == tSwap) + break; // no independent triplet found + + // mark vertices to be used in simdIndex + mark[tIt->x] = setIndex; + mark[tIt->y] = setIndex; + mark[tIt->z] = setIndex; + } + + if(tIt == tSwap) // remaining triplets depend on current set + { + if(setSize > simdWidth) // trim set to multiple of simdWidth + { + uint32_t overflow = setSize % simdWidth; + setSize -= overflow; + tIt -= overflow; + } + mSetSizes.pushBack(setSize); + setSize = 0; + } + } +} + +namespace +{ +struct TripletSet +{ + TripletSet() : mMark(0xFFFFFFFF) + { + mNumReplays[0] = mNumReplays[1] = mNumReplays[2] = 1; + memset(mNumConflicts[0], 0, 32); + memset(mNumConflicts[1], 0, 32); + memset(mNumConflicts[2], 0, 32); + } + + uint32_t mMark; // triplet index + uint8_t mNumReplays[3]; + uint8_t mNumConflicts[3][32]; +}; + +/* +struct GreaterSum +{ + typedef cloth::Vector<uint32_t>::Type Container; + + GreaterSum(const Container& cont) + : mContainer(cont) + {} + + bool 
operator()(const cloth::Vec4u& a, const cloth::Vec4u& b) const + { + return mContainer[a.x] + mContainer[a.y] + mContainer[a.z] + > mContainer[b.x] + mContainer[b.y] + mContainer[b.z]; + } + + const Container& mContainer; +}; +*/ + +// calculate the inclusive prefix sum, equivalent of std::partial_sum +template <typename T> +void prefixSum(const T* first, const T* last, T* dest) +{ + if(first == last) + return; + else + { + *(dest++) = *(first++); + + for(; first != last; ++first, ++dest) + *dest = *(dest - 1) + *first; + } +} +} + +// CUDA version +void cloth::TripletScheduler::warp(uint32_t numParticles, uint32_t warpWidth) +{ + // PX_ASSERT(warpWidth == 32 || warpWidth == 16); + + if(mTriplets.empty()) + return; + + TripletIter tIt, tEnd = mTriplets.end(); + uint32_t tripletIndex; + + // count number of triplets per particle + Vector<uint32_t>::Type adjacentCount(numParticles + 1, uint32_t(0)); + for(tIt = mTriplets.begin(); tIt != tEnd; ++tIt) + for(int i = 0; i < 3; ++i) + ++adjacentCount[(*tIt)[i]]; + + /* neither of those were really improving number of batches: + // run simd version to pre-sort particles + simd(numParticles, blockWidth); mSetSizes.resize(0); + // sort according to triplet degree (estimated by sum of adjacentCount) + std::sort(mTriplets.begin(), tEnd, GreaterSum(adjacentCount)); + */ + + uint32_t maxTripletCount = *maxElement(adjacentCount.begin(), adjacentCount.end()); + + // compute in place prefix sum (inclusive) + prefixSum(adjacentCount.begin(), adjacentCount.end(), adjacentCount.begin()); + + // initialize adjacencies (for each particle, collect touching triplets) + // also converts partial sum in adjacentCount from inclusive to exclusive + Vector<uint32_t>::Type adjacencies(adjacentCount.back()); + for(tIt = mTriplets.begin(), tripletIndex = 0; tIt != tEnd; ++tIt, ++tripletIndex) + for(int i = 0; i < 3; ++i) + adjacencies[--adjacentCount[(*tIt)[i]]] = tripletIndex; + + uint32_t warpMask = warpWidth - 1; + + uint32_t numSets = 
maxTripletCount; // start with minimum number of sets + Vector<TripletSet>::Type sets(numSets); + Vector<uint32_t>::Type setIndices(mTriplets.size(), uint32_t(-1)); + mSetSizes.resize(numSets); + + // color triplets (assign to sets) + Vector<uint32_t>::Type::ConstIterator aBegin = adjacencies.begin(), aIt, aEnd; + for(tIt = mTriplets.begin(), tripletIndex = 0; tIt != tEnd; ++tIt, ++tripletIndex) + { + // mark sets of adjacent triplets + for(int i = 0; i < 3; ++i) + { + uint32_t particleIndex = (*tIt)[i]; + aIt = aBegin + adjacentCount[particleIndex]; + aEnd = aBegin + adjacentCount[particleIndex + 1]; + for(uint32_t setIndex; aIt != aEnd; ++aIt) + if(numSets > (setIndex = setIndices[*aIt])) + sets[setIndex].mMark = tripletIndex; + } + + // find valid set with smallest number of bank conflicts + uint32_t bestIndex = numSets; + uint32_t minReplays = 4; + for(uint32_t setIndex = 0; setIndex < numSets && minReplays; ++setIndex) + { + const TripletSet& set = sets[setIndex]; + + if(set.mMark == tripletIndex) + continue; // triplet collision + + uint32_t numReplays = 0; + for(uint32_t i = 0; i < 3; ++i) + numReplays += set.mNumReplays[i] == set.mNumConflicts[i][warpMask & (*tIt)[i]]; + + if(minReplays > numReplays) + { + minReplays = numReplays; + bestIndex = setIndex; + } + } + + // add new set if none found + if(bestIndex == numSets) + { + sets.pushBack(TripletSet()); + mSetSizes.pushBack(0); + ++numSets; + } + + // increment bank conflicts or reset if warp filled + TripletSet& set = sets[bestIndex]; + if(++mSetSizes[bestIndex] & warpMask) + for(uint32_t i = 0; i < 3; ++i) + set.mNumReplays[i] = PxMax(set.mNumReplays[i], ++set.mNumConflicts[i][warpMask & (*tIt)[i]]); + else + set = TripletSet(); + + setIndices[tripletIndex] = bestIndex; + } + + // reorder triplets + Vector<uint32_t>::Type setOffsets(mSetSizes.size()); + prefixSum(mSetSizes.begin(), mSetSizes.end(), setOffsets.begin()); + + Vector<Vec4u>::Type triplets(mTriplets.size()); + 
Vector<uint32_t>::Type::ConstIterator iIt = setIndices.begin(); + for(tIt = mTriplets.begin(), tripletIndex = 0; tIt != tEnd; ++tIt, ++iIt) + triplets[--setOffsets[*iIt]] = *tIt; + + mTriplets.swap(triplets); +} diff --git a/PhysX_3.4/Source/LowLevelCloth/src/TripletScheduler.h b/PhysX_3.4/Source/LowLevelCloth/src/TripletScheduler.h new file mode 100644 index 00000000..db8078ab --- /dev/null +++ b/PhysX_3.4/Source/LowLevelCloth/src/TripletScheduler.h @@ -0,0 +1,56 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. 
+// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#pragma once + +#include "Types.h" +#include "Range.h" +#include "Allocator.h" +#include "Vec4T.h" + +namespace physx +{ + +namespace cloth +{ + +struct TripletScheduler +{ + typedef Vector<Vec4u>::Type::ConstIterator ConstTripletIter; + typedef Vector<Vec4u>::Type::Iterator TripletIter; + + TripletScheduler(Range<const uint32_t[4]>); + void simd(uint32_t numParticles, uint32_t simdWidth); + void warp(uint32_t numParticles, uint32_t warpWidth); + + Vector<Vec4u>::Type mTriplets; + Vector<uint32_t>::Type mSetSizes; +}; +} +} diff --git a/PhysX_3.4/Source/LowLevelCloth/src/Vec4T.h b/PhysX_3.4/Source/LowLevelCloth/src/Vec4T.h new file mode 100644 index 00000000..50fadca3 --- /dev/null +++ b/PhysX_3.4/Source/LowLevelCloth/src/Vec4T.h @@ -0,0 +1,104 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". 
NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 
+ +#pragma once + +#include "Types.h" + +namespace physx +{ + +namespace cloth +{ + +template <typename T> +struct Vec4T +{ + Vec4T() + { + } + + Vec4T(T a, T b, T c, T d) : x(a), y(b), z(c), w(d) + { + } + + template <typename S> + Vec4T(const Vec4T<S>& other) + { + x = T(other.x); + y = T(other.y); + z = T(other.z); + w = T(other.w); + } + + template <typename Index> + T& operator[](Index i) + { + return reinterpret_cast<T*>(this)[i]; + } + + template <typename Index> + const T& operator[](Index i) const + { + return reinterpret_cast<const T*>(this)[i]; + } + + T x, y, z, w; +}; + +template <typename T> +Vec4T<T> operator*(const Vec4T<T>& vec, T scalar) +{ + return Vec4T<T>(vec.x * scalar, vec.y * scalar, vec.z * scalar, vec.w * scalar); +} + +template <typename T> +Vec4T<T> operator/(const Vec4T<T>& vec, T scalar) +{ + return Vec4T<T>(vec.x / scalar, vec.y / scalar, vec.z / scalar, vec.w / scalar); +} + +template <typename T> +T (&array(Vec4T<T>& vec))[4] +{ + return reinterpret_cast<T(&)[4]>(vec); +} + +template <typename T> +const T (&array(const Vec4T<T>& vec))[4] +{ + return reinterpret_cast<const T(&)[4]>(vec); +} + +typedef Vec4T<uint32_t> Vec4u; +typedef Vec4T<uint16_t> Vec4us; + +} // namespace cloth + +} // namespace physx diff --git a/PhysX_3.4/Source/LowLevelCloth/src/avx/SwSolveConstraints.cpp b/PhysX_3.4/Source/LowLevelCloth/src/avx/SwSolveConstraints.cpp new file mode 100644 index 00000000..b242aaba --- /dev/null +++ b/PhysX_3.4/Source/LowLevelCloth/src/avx/SwSolveConstraints.cpp @@ -0,0 +1,932 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. 
Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 
// MSVC/AVX-specific translation unit: compiled separately so the rest of the
// library can be built without AVX code generation.
#pragma warning(push)
#pragma warning(disable : 4668) //'symbol' is not defined as a preprocessor macro, replacing with '0' for 'directives'
#pragma warning(disable : 4987) // nonstandard extension used: 'throw (...)'
#include <intrin.h>
#pragma warning(pop)

#pragma warning(disable : 4127) // conditional expression is constant

// Local fixed-width typedefs built on MSVC intrinsic types (avoids <cstdint>).
typedef unsigned __int16 uint16_t;
typedef unsigned __int32 uint32_t;

namespace avx
{
// Solver constants; filled in by initialize() before any solveConstraints call.
__m128 sMaskYZW;
__m256 sOne, sEpsilon, sMinusOneXYZOneW, sMaskXY;

// Populates the constant vectors above. Must be called once before use.
void initialize()
{
	sMaskYZW = _mm_castsi128_ps(_mm_setr_epi32(0, ~0, ~0, ~0)); // clears lane x, keeps y/z/w
	sOne = _mm256_set1_ps(1.0f);
	sEpsilon = _mm256_set1_ps(1.192092896e-07f); // FLT_EPSILON; guards rsqrt/rcp against zero input
	// negates xyz (position delta) but passes w (inverse mass) through in each 128-bit lane
	sMinusOneXYZOneW = _mm256_setr_ps(-1.0f, -1.0f, -1.0f, 1.0f, -1.0f, -1.0f, -1.0f, 1.0f);
	// selects the low two floats of each 128-bit lane
	sMaskXY = _mm256_castsi256_ps(_mm256_setr_epi32(~0, ~0, 0, 0, ~0, ~0, 0, 0));
}

// Emulated fused multiply-add (a*b+c) for AVX-only parts; the non-type
// template parameter selects the ISA level (see <2> specializations below).
template <uint32_t>
__m256 fmadd_ps(__m256 a, __m256 b, __m256 c)
{
	return _mm256_add_ps(_mm256_mul_ps(a, b), c);
}
// Emulated fused negated multiply-add (c - a*b).
template <uint32_t>
__m256 fnmadd_ps(__m256 a, __m256 b, __m256 c)
{
	return _mm256_sub_ps(c, _mm256_mul_ps(a, b));
}
#if _MSC_VER >= 1700
// True FMA3 instructions, only available with VS2012+ toolchains (avx == 2 path).
template <>
__m256 fmadd_ps<2>(__m256 a, __m256 b, __m256 c)
{
	return _mm256_fmadd_ps(a, b, c);
}
template <>
__m256 fnmadd_ps<2>(__m256 a, __m256 b, __m256 c)
{
	return _mm256_fnmadd_ps(a, b, c);
}
#endif

// roughly same perf as SSE2 intrinsics, the asm version below is about 10% faster
//
// Solves 8 distance constraints per iteration (two 128-bit lanes of 4).
// posIt: particle array, 4 floats per particle (x, y, z, w = inverse mass).
// rIt/rEnd: rest lengths, consumed 8 per iteration.
// iIt: uint16 particle-index pairs, consumed 16 per iteration.
// stiffnessRef: x = stiffness; when useMultiplier, y = multiplier,
//               z = compression limit, w = stretch limit (per the permutes below).
template <bool useMultiplier, uint32_t avx>
void solveConstraints(float* __restrict posIt, const float* __restrict rIt, const float* __restrict rEnd,
                      const uint16_t* __restrict iIt, const __m128& stiffnessRef)
{
	__m256 stiffness, stretchLimit, compressionLimit, multiplier;

	if(useMultiplier)
	{
		// splat each component of stiffnessRef across all 8 lanes
		stiffness = _mm256_broadcast_ps(&stiffnessRef);
		stretchLimit = _mm256_permute_ps(stiffness, 0xff);
		compressionLimit = _mm256_permute_ps(stiffness, 0xaa);
		multiplier = _mm256_permute_ps(stiffness, 0x55);
		stiffness = _mm256_permute_ps(stiffness, 0x00);
	}
	else
	{
		stiffness = _mm256_broadcast_ss((const float*)&stiffnessRef);
	}

	for(; rIt < rEnd; rIt += 8, iIt += 16)
	{
		// gather the 8 (i, j) particle pairs; indices are interleaved so that
		// constraints 0..3 use iIt[0..7] and constraints 4..7 use iIt[8..15]
		float* p0i = posIt + iIt[0] * 4;
		float* p4i = posIt + iIt[8] * 4;
		float* p0j = posIt + iIt[1] * 4;
		float* p4j = posIt + iIt[9] * 4;
		float* p1i = posIt + iIt[2] * 4;
		float* p5i = posIt + iIt[10] * 4;
		float* p1j = posIt + iIt[3] * 4;
		float* p5j = posIt + iIt[11] * 4;

		__m128 v0i = _mm_load_ps(p0i);
		__m128 v4i = _mm_load_ps(p4i);
		__m128 v0j = _mm_load_ps(p0j);
		__m128 v4j = _mm_load_ps(p4j);
		__m128 v1i = _mm_load_ps(p1i);
		__m128 v5i = _mm_load_ps(p5i);
		__m128 v1j = _mm_load_ps(p1j);
		__m128 v5j = _mm_load_ps(p5j);

		// pack two constraints per ymm register (one per 128-bit lane)
		__m256 v04i = _mm256_insertf128_ps(_mm256_castps128_ps256(v0i), v4i, 1);
		__m256 v04j = _mm256_insertf128_ps(_mm256_castps128_ps256(v0j), v4j, 1);
		__m256 v15i = _mm256_insertf128_ps(_mm256_castps128_ps256(v1i), v5i, 1);
		__m256 v15j = _mm256_insertf128_ps(_mm256_castps128_ps256(v1j), v5j, 1);

		// h = (xyz_j - xyz_i, w_i + w_j): edge vector plus summed inverse mass
		__m256 h04ij = fmadd_ps<avx>(sMinusOneXYZOneW, v04i, v04j);
		__m256 h15ij = fmadd_ps<avx>(sMinusOneXYZOneW, v15i, v15j);

		float* p2i = posIt + iIt[4] * 4;
		float* p6i = posIt + iIt[12] * 4;
		float* p2j = posIt + iIt[5] * 4;
		float* p6j = posIt + iIt[13] * 4;
		float* p3i = posIt + iIt[6] * 4;
		float* p7i = posIt + iIt[14] * 4;
		float* p3j = posIt + iIt[7] * 4;
		float* p7j = posIt + iIt[15] * 4;

		__m128 v2i = _mm_load_ps(p2i);
		__m128 v6i = _mm_load_ps(p6i);
		__m128 v2j = _mm_load_ps(p2j);
		__m128 v6j = _mm_load_ps(p6j);
		__m128 v3i = _mm_load_ps(p3i);
		__m128 v7i = _mm_load_ps(p7i);
		__m128 v3j = _mm_load_ps(p3j);
		__m128 v7j = _mm_load_ps(p7j);

		__m256 v26i = _mm256_insertf128_ps(_mm256_castps128_ps256(v2i), v6i, 1);
		__m256 v26j = _mm256_insertf128_ps(_mm256_castps128_ps256(v2j), v6j, 1);
		__m256 v37i = _mm256_insertf128_ps(_mm256_castps128_ps256(v3i), v7i, 1);
		__m256 v37j = _mm256_insertf128_ps(_mm256_castps128_ps256(v3j), v7j, 1);

		__m256 h26ij = fmadd_ps<avx>(sMinusOneXYZOneW, v26i, v26j);
		__m256 h37ij = fmadd_ps<avx>(sMinusOneXYZOneW, v37i, v37j);

		// 4x4 transpose (per lane) from AoS h vectors to SoA x/y/z/w vectors
		__m256 a = _mm256_unpacklo_ps(h04ij, h26ij);
		__m256 b = _mm256_unpackhi_ps(h04ij, h26ij);
		__m256 c = _mm256_unpacklo_ps(h15ij, h37ij);
		__m256 d = _mm256_unpackhi_ps(h15ij, h37ij);

		__m256 hxij = _mm256_unpacklo_ps(a, c);
		__m256 hyij = _mm256_unpackhi_ps(a, c);
		__m256 hzij = _mm256_unpacklo_ps(b, d);
		__m256 vwij = _mm256_unpackhi_ps(b, d); // summed inverse masses

		// squared edge length, epsilon-biased so rsqrt below never sees zero
		__m256 e2ij = fmadd_ps<avx>(hxij, hxij, fmadd_ps<avx>(hyij, hyij, fmadd_ps<avx>(hzij, hzij, sEpsilon)));

		__m256 rij = _mm256_load_ps(rIt);
		// constraints with (near-)zero rest length are masked out entirely
		__m256 mask = _mm256_cmp_ps(rij, sEpsilon, _CMP_GT_OQ);
		// relative error: 1 - restLength/edgeLength (approx via rsqrt)
		__m256 erij = _mm256_and_ps(fnmadd_ps<avx>(rij, _mm256_rsqrt_ps(e2ij), sOne), mask);

		if(useMultiplier)
		{
			// soft limits: subtract the clamped error scaled by the multiplier
			erij = fnmadd_ps<avx>(multiplier, _mm256_max_ps(compressionLimit, _mm256_min_ps(erij, stretchLimit)), erij);
		}

		// per-constraint correction scale, normalized by summed inverse mass
		__m256 exij = _mm256_mul_ps(erij, _mm256_mul_ps(stiffness, _mm256_rcp_ps(_mm256_add_ps(sEpsilon, vwij))));

		// replace these two instructions with _mm_maskstore_ps below?
		__m256 exlo = _mm256_and_ps(sMaskXY, exij);
		__m256 exhi = _mm256_andnot_ps(sMaskXY, exij);

		// apply corrections: each particle moves along h weighted by its own
		// inverse mass (the 0xff permute splats the w component)
		__m256 f04ij = _mm256_mul_ps(h04ij, _mm256_permute_ps(exlo, 0xc0));
		__m256 u04i = fmadd_ps<avx>(f04ij, _mm256_permute_ps(v04i, 0xff), v04i);
		__m256 u04j = fnmadd_ps<avx>(f04ij, _mm256_permute_ps(v04j, 0xff), v04j);

		_mm_store_ps(p0i, _mm256_extractf128_ps(u04i, 0));
		_mm_store_ps(p0j, _mm256_extractf128_ps(u04j, 0));
		_mm_store_ps(p4i, _mm256_extractf128_ps(u04i, 1));
		_mm_store_ps(p4j, _mm256_extractf128_ps(u04j, 1));

		__m256 f15ij = _mm256_mul_ps(h15ij, _mm256_permute_ps(exlo, 0xd5));
		__m256 u15i = fmadd_ps<avx>(f15ij, _mm256_permute_ps(v15i, 0xff), v15i);
		__m256 u15j = fnmadd_ps<avx>(f15ij, _mm256_permute_ps(v15j, 0xff), v15j);

		_mm_store_ps(p1i, _mm256_extractf128_ps(u15i, 0));
		_mm_store_ps(p1j, _mm256_extractf128_ps(u15j, 0));
		_mm_store_ps(p5i, _mm256_extractf128_ps(u15i, 1));
		_mm_store_ps(p5j, _mm256_extractf128_ps(u15j, 1));

		__m256 f26ij = _mm256_mul_ps(h26ij, _mm256_permute_ps(exhi, 0x2a));
		__m256 u26i = fmadd_ps<avx>(f26ij, _mm256_permute_ps(v26i, 0xff), v26i);
		__m256 u26j = fnmadd_ps<avx>(f26ij, _mm256_permute_ps(v26j, 0xff), v26j);

		_mm_store_ps(p2i, _mm256_extractf128_ps(u26i, 0));
		_mm_store_ps(p2j, _mm256_extractf128_ps(u26j, 0));
		_mm_store_ps(p6i, _mm256_extractf128_ps(u26i, 1));
		_mm_store_ps(p6j, _mm256_extractf128_ps(u26j, 1));

		__m256 f37ij = _mm256_mul_ps(h37ij, _mm256_permute_ps(exhi, 0x3f));
		__m256 u37i = fmadd_ps<avx>(f37ij, _mm256_permute_ps(v37i, 0xff), v37i);
		__m256 u37j = fnmadd_ps<avx>(f37ij, _mm256_permute_ps(v37j, 0xff), v37j);

		_mm_store_ps(p3i, _mm256_extractf128_ps(u37i, 0));
		_mm_store_ps(p3j, _mm256_extractf128_ps(u37j, 0));
		_mm_store_ps(p7i, _mm256_extractf128_ps(u37i, 1));
		_mm_store_ps(p7j, _mm256_extractf128_ps(u37j, 1));
	}

	// leave AVX state clean to avoid SSE/AVX transition penalties in callers
	_mm256_zeroupper();
}

#ifdef _M_IX86

// clang-format:disable

/* full template specializations of above functions in assembler */

// AVX
// AVX without useMultiplier
//
// Hand-scheduled 32-bit MSVC inline-assembly version of the intrinsics
// template above (x86 only; ~10% faster per the comment there).
// Register plan: eax = index iterator, ecx = particle base, edx = rest-length
// iterator, esi = end, edi = scratch byte offset. The 16-bit indices are
// shifted left by 4 (16 bytes = one 4-float particle) and cached in ptmp so
// the scatter stores at the end can reuse them. vtmp/htmp spill the packed
// position and h vectors because only ymm0-ymm7 are available in 32-bit mode.
template <>
void solveConstraints<false, 1>(float* __restrict posIt, const float* __restrict rIt,
                                const float* __restrict rEnd, const uint16_t* __restrict iIt, const __m128& stiffnessRef)
{
	__m256 stiffness = _mm256_broadcast_ss((const float*)&stiffnessRef);

	__m256 vtmp[8], htmp[4];
	float* ptmp[16];

	__asm
	{
		mov edx, rIt
		mov esi, rEnd

		cmp edx, esi
		jae forEnd

		mov eax, iIt
		mov ecx, posIt

forBegin:
		// load 8 particles (constraints 0,4,1,5), remembering byte offsets in ptmp
		movzx edi, WORD PTR [eax   ] __asm shl edi, 4 __asm mov [ptmp   ], edi __asm vmovaps xmm0, XMMWORD PTR [edi + ecx] // v0i
		movzx edi, WORD PTR [eax+16] __asm shl edi, 4 __asm mov [ptmp+ 4], edi __asm vmovaps xmm1, XMMWORD PTR [edi + ecx] // v4i
		movzx edi, WORD PTR [eax+ 2] __asm shl edi, 4 __asm mov [ptmp+ 8], edi __asm vmovaps xmm2, XMMWORD PTR [edi + ecx] // v0j
		movzx edi, WORD PTR [eax+18] __asm shl edi, 4 __asm mov [ptmp+12], edi __asm vmovaps xmm3, XMMWORD PTR [edi + ecx] // v4j
		movzx edi, WORD PTR [eax+ 4] __asm shl edi, 4 __asm mov [ptmp+16], edi __asm vmovaps xmm4, XMMWORD PTR [edi + ecx] // v1i
		movzx edi, WORD PTR [eax+20] __asm shl edi, 4 __asm mov [ptmp+20], edi __asm vmovaps xmm5, XMMWORD PTR [edi + ecx] // v5i
		movzx edi, WORD PTR [eax+ 6] __asm shl edi, 4 __asm mov [ptmp+24], edi __asm vmovaps xmm6, XMMWORD PTR [edi + ecx] // v1j
		movzx edi, WORD PTR [eax+22] __asm shl edi, 4 __asm mov [ptmp+28], edi __asm vmovaps xmm7, XMMWORD PTR [edi + ecx] // v5j

		vinsertf128 ymm0, ymm0, xmm1, 1 __asm vmovaps YMMWORD PTR [vtmp    ], ymm0 // v04i
		vinsertf128 ymm2, ymm2, xmm3, 1 __asm vmovaps YMMWORD PTR [vtmp+ 32], ymm2 // v04j
		vinsertf128 ymm4, ymm4, xmm5, 1 __asm vmovaps YMMWORD PTR [vtmp+ 64], ymm4 // v15i
		vinsertf128 ymm6, ymm6, xmm7, 1 __asm vmovaps YMMWORD PTR [vtmp+ 96], ymm6 // v15j

		vmovaps ymm7, sMinusOneXYZOneW
		vmulps ymm2, ymm2, ymm7 __asm vaddps ymm0, ymm0, ymm2 __asm vmovaps YMMWORD PTR [htmp   ], ymm0 // h04ij
		vmulps ymm6, ymm6, ymm7 __asm vaddps ymm4, ymm4, ymm6 __asm vmovaps YMMWORD PTR [htmp+32], ymm4 // h15ij

		// load the other 8 particles (constraints 2,6,3,7)
		movzx edi, WORD PTR [eax+ 8] __asm shl edi, 4 __asm mov [ptmp+32], edi __asm vmovaps xmm0, XMMWORD PTR [edi + ecx] // v2i
		movzx edi, WORD PTR [eax+24] __asm shl edi, 4 __asm mov [ptmp+36], edi __asm vmovaps xmm1, XMMWORD PTR [edi + ecx] // v6i
		movzx edi, WORD PTR [eax+10] __asm shl edi, 4 __asm mov [ptmp+40], edi __asm vmovaps xmm2, XMMWORD PTR [edi + ecx] // v2j
		movzx edi, WORD PTR [eax+26] __asm shl edi, 4 __asm mov [ptmp+44], edi __asm vmovaps xmm3, XMMWORD PTR [edi + ecx] // v6j
		movzx edi, WORD PTR [eax+12] __asm shl edi, 4 __asm mov [ptmp+48], edi __asm vmovaps xmm4, XMMWORD PTR [edi + ecx] // v3i
		movzx edi, WORD PTR [eax+28] __asm shl edi, 4 __asm mov [ptmp+52], edi __asm vmovaps xmm5, XMMWORD PTR [edi + ecx] // v7i
		movzx edi, WORD PTR [eax+14] __asm shl edi, 4 __asm mov [ptmp+56], edi __asm vmovaps xmm6, XMMWORD PTR [edi + ecx] // v3j
		movzx edi, WORD PTR [eax+30] __asm shl edi, 4 __asm mov [ptmp+60], edi __asm vmovaps xmm7, XMMWORD PTR [edi + ecx] // v7j

		vinsertf128 ymm0, ymm0, xmm1, 1 __asm vmovaps YMMWORD PTR [vtmp+128], ymm0 // v26i
		vinsertf128 ymm2, ymm2, xmm3, 1 __asm vmovaps YMMWORD PTR [vtmp+160], ymm2 // v26j
		vinsertf128 ymm4, ymm4, xmm5, 1 __asm vmovaps YMMWORD PTR [vtmp+192], ymm4 // v37i
		vinsertf128 ymm6, ymm6, xmm7, 1 __asm vmovaps YMMWORD PTR [vtmp+224], ymm6 // v37j

		vmovaps ymm7, sMinusOneXYZOneW
		vmulps ymm2, ymm2, ymm7 __asm vaddps ymm2, ymm0, ymm2 __asm vmovaps YMMWORD PTR [htmp+64], ymm2 // h26ij
		vmulps ymm6, ymm6, ymm7 __asm vaddps ymm6, ymm4, ymm6 __asm vmovaps YMMWORD PTR [htmp+96], ymm6 // h37ij

		vmovaps ymm0, YMMWORD PTR [htmp   ] // h04ij
		vmovaps ymm4, YMMWORD PTR [htmp+32] // h15ij

		// transpose to SoA, mirroring the unpack sequence in the template
		vunpcklps ymm1, ymm0, ymm2 // a
		vunpckhps ymm3, ymm0, ymm2 // b
		vunpcklps ymm5, ymm4, ymm6 // c
		vunpckhps ymm7, ymm4, ymm6 // d

		vunpcklps ymm0, ymm1, ymm5 // hxij
		vunpckhps ymm2, ymm1, ymm5 // hyij
		vunpcklps ymm4, ymm3, ymm7 // hzij
		vunpckhps ymm6, ymm3, ymm7 // vwij

		vmovaps ymm7, sEpsilon
		vmovaps ymm5, sOne
		vmovaps ymm3, stiffness
		vmovaps ymm1, YMMWORD PTR [edx] // rij

		vmulps ymm0, ymm0, ymm0 __asm vaddps ymm0, ymm0, ymm7 // e2ij
		vmulps ymm2, ymm2, ymm2 __asm vaddps ymm0, ymm0, ymm2
		vmulps ymm4, ymm4, ymm4 __asm vaddps ymm0, ymm0, ymm4

		vcmpgt_oqps ymm2, ymm1, ymm7 // mask
		vrsqrtps ymm0, ymm0 __asm vmulps ymm0, ymm0, ymm1 // erij
		vsubps ymm5, ymm5, ymm0 __asm vandps ymm5, ymm5, ymm2
		vaddps ymm6, ymm6, ymm7 __asm vrcpps ymm6, ymm6

		vmulps ymm6, ymm6, ymm3 __asm vmulps ymm6, ymm6, ymm5 // exij

		// split exij: exlo keeps lanes x/y, exhi the rest (xor == andnot here)
		vmovaps ymm7, sMaskXY
		vandps ymm7, ymm7, ymm6 // exlo
		vxorps ymm6, ymm6, ymm7 // exhi

		vmovaps ymm4, YMMWORD PTR [htmp   ] // h04ij
		vmovaps ymm0, YMMWORD PTR [vtmp   ] // v04i
		vmovaps ymm1, YMMWORD PTR [vtmp+ 32] // v04j

		vpermilps ymm5, ymm7, 0xc0 __asm vmulps ymm4, ymm4, ymm5 // f04ij
		vpermilps ymm2, ymm0, 0xff __asm vmulps ymm2, ymm2, ymm4 __asm vsubps ymm0, ymm0, ymm2 // u04i
		vpermilps ymm3, ymm1, 0xff __asm vmulps ymm3, ymm3, ymm4 __asm vaddps ymm1, ymm1, ymm3 // u04j

		vextractf128 xmm2, ymm0, 1
		vextractf128 xmm3, ymm1, 1

		mov edi, [ptmp   ] __asm vmovaps XMMWORD PTR [edi + ecx], xmm0 // v0i
		mov edi, [ptmp+ 8] __asm vmovaps XMMWORD PTR [edi + ecx], xmm1 // v0j
		mov edi, [ptmp+ 4] __asm vmovaps XMMWORD PTR [edi + ecx], xmm2 // v4i
		mov edi, [ptmp+12] __asm vmovaps XMMWORD PTR [edi + ecx], xmm3 // v4j

		vmovaps ymm4, YMMWORD PTR [htmp+ 32] // h15ij
		vmovaps ymm0, YMMWORD PTR [vtmp+ 64] // v15i
		vmovaps ymm1, YMMWORD PTR [vtmp+ 96] // v15j

		vpermilps ymm5, ymm7, 0xd5 __asm vmulps ymm4, ymm4, ymm5 // f15ij
		vpermilps ymm2, ymm0, 0xff __asm vmulps ymm2, ymm2, ymm4 __asm vsubps ymm0, ymm0, ymm2 // u15i
		vpermilps ymm3, ymm1, 0xff __asm vmulps ymm3, ymm3, ymm4 __asm vaddps ymm1, ymm1, ymm3 // u15j

		vextractf128 xmm2, ymm0, 1
		vextractf128 xmm3, ymm1, 1

		mov edi, [ptmp+16] __asm vmovaps XMMWORD PTR [edi + ecx], xmm0 // v1i
		mov edi, [ptmp+24] __asm vmovaps XMMWORD PTR [edi + ecx], xmm1 // v1j
		mov edi, [ptmp+20] __asm vmovaps XMMWORD PTR [edi + ecx], xmm2 // v5i
		mov edi, [ptmp+28] __asm vmovaps XMMWORD PTR [edi + ecx], xmm3 // v5j

		vmovaps ymm4, YMMWORD PTR [htmp+ 64] // h26ij
		vmovaps ymm0, YMMWORD PTR [vtmp+128] // v26i
		vmovaps ymm1, YMMWORD PTR [vtmp+160] // v26j

		vpermilps ymm5, ymm6, 0x2a __asm vmulps ymm4, ymm4, ymm5 // f26ij
		vpermilps ymm2, ymm0, 0xff __asm vmulps ymm2, ymm2, ymm4 __asm vsubps ymm0, ymm0, ymm2 // u26i
		vpermilps ymm3, ymm1, 0xff __asm vmulps ymm3, ymm3, ymm4 __asm vaddps ymm1, ymm1, ymm3 // u26j

		vextractf128 xmm2, ymm0, 1
		vextractf128 xmm3, ymm1, 1

		mov edi, [ptmp+32] __asm vmovaps XMMWORD PTR [edi + ecx], xmm0 // v2i
		mov edi, [ptmp+40] __asm vmovaps XMMWORD PTR [edi + ecx], xmm1 // v2j
		mov edi, [ptmp+36] __asm vmovaps XMMWORD PTR [edi + ecx], xmm2 // v6i
		mov edi, [ptmp+44] __asm vmovaps XMMWORD PTR [edi + ecx], xmm3 // v6j

		vmovaps ymm4, YMMWORD PTR [htmp+ 96] // h37ij
		vmovaps ymm0, YMMWORD PTR [vtmp+192] // v37i
		vmovaps ymm1, YMMWORD PTR [vtmp+224] // v37j

		vpermilps ymm5, ymm6, 0x3f __asm vmulps ymm4, ymm4, ymm5 // f37ij
		vpermilps ymm2, ymm0, 0xff __asm vmulps ymm2, ymm2, ymm4 __asm vsubps ymm0, ymm0, ymm2 // u37i
		vpermilps ymm3, ymm1, 0xff __asm vmulps ymm3, ymm3, ymm4 __asm vaddps ymm1, ymm1, ymm3 // u37j

		vextractf128 xmm2, ymm0, 1
		vextractf128 xmm3, ymm1, 1

		mov edi, [ptmp+48] __asm vmovaps XMMWORD PTR [edi + ecx], xmm0 // v3i
		mov edi, [ptmp+56] __asm vmovaps XMMWORD PTR [edi + ecx], xmm1 // v3j
		mov edi, [ptmp+52] __asm vmovaps XMMWORD PTR [edi + ecx], xmm2 // v7i
		mov edi, [ptmp+60] __asm vmovaps XMMWORD PTR [edi + ecx], xmm3 // v7j

		// advance: 8 rest lengths (32 bytes) and 16 uint16 indices (32 bytes)
		add eax, 32
		add edx, 32

		cmp edx, esi
		jb forBegin
forEnd:
	}

	_mm256_zeroupper();
}

// AVX with useMultiplier
// Identical to the specialization above except for the constant setup and the
// "multiplier block" that clamps the error term before applying stiffness.
template <>
void solveConstraints<true, 1>(float* __restrict posIt, const float* __restrict rIt,
                               const float* __restrict rEnd, const uint16_t* __restrict iIt, const __m128& stiffnessRef)
{
	__m256 stiffness = _mm256_broadcast_ps(&stiffnessRef);
	__m256 stretchLimit = _mm256_permute_ps(stiffness, 0xff);
	__m256 compressionLimit = _mm256_permute_ps(stiffness, 0xaa);
	__m256 multiplier = _mm256_permute_ps(stiffness, 0x55);
	stiffness = _mm256_permute_ps(stiffness, 0x00);

	__m256 vtmp[8], htmp[4];
	float* ptmp[16];

	__asm
	{
		mov edx, rIt
		mov esi, rEnd

		cmp edx, esi
		jae forEnd

		mov eax, iIt
		mov ecx, posIt

forBegin:
		movzx edi, WORD PTR [eax   ] __asm shl edi, 4 __asm mov [ptmp   ], edi __asm vmovaps xmm0, XMMWORD PTR [edi + ecx] // v0i
		movzx edi, WORD PTR [eax+16] __asm shl edi, 4 __asm mov [ptmp+ 4], edi __asm vmovaps xmm1, XMMWORD PTR [edi + ecx] // v4i
		movzx edi, WORD PTR [eax+ 2] __asm shl edi, 4 __asm mov [ptmp+ 8], edi __asm vmovaps xmm2, XMMWORD PTR [edi + ecx] // v0j
		movzx edi, WORD PTR [eax+18] __asm shl edi, 4 __asm mov [ptmp+12], edi __asm vmovaps xmm3, XMMWORD PTR [edi + ecx] // v4j
		movzx edi, WORD PTR [eax+ 4] __asm shl edi, 4 __asm mov [ptmp+16], edi __asm vmovaps xmm4, XMMWORD PTR [edi + ecx] // v1i
		movzx edi, WORD PTR [eax+20] __asm shl edi, 4 __asm mov [ptmp+20], edi __asm vmovaps xmm5, XMMWORD PTR [edi + ecx] // v5i
		movzx edi, WORD PTR [eax+ 6] __asm shl edi, 4 __asm mov [ptmp+24], edi __asm vmovaps xmm6, XMMWORD PTR [edi + ecx] // v1j
		movzx edi, WORD PTR [eax+22] __asm shl edi, 4 __asm mov [ptmp+28], edi __asm vmovaps xmm7, XMMWORD PTR [edi + ecx] // v5j

		vinsertf128 ymm0, ymm0, xmm1, 1 __asm vmovaps YMMWORD PTR [vtmp    ], ymm0 // v04i
		vinsertf128 ymm2, ymm2, xmm3, 1 __asm vmovaps YMMWORD PTR [vtmp+ 32], ymm2 // v04j
		vinsertf128 ymm4, ymm4, xmm5, 1 __asm vmovaps YMMWORD PTR [vtmp+ 64], ymm4 // v15i
		vinsertf128 ymm6, ymm6, xmm7, 1 __asm vmovaps YMMWORD PTR [vtmp+ 96], ymm6 // v15j

		vmovaps ymm7, sMinusOneXYZOneW
		vmulps ymm2, ymm2, ymm7 __asm vaddps ymm0, ymm0, ymm2 __asm vmovaps YMMWORD PTR [htmp   ], ymm0 // h04ij
		vmulps ymm6, ymm6, ymm7 __asm vaddps ymm4, ymm4, ymm6 __asm vmovaps YMMWORD PTR [htmp+32], ymm4 // h15ij

		movzx edi, WORD PTR [eax+ 8] __asm shl edi, 4 __asm mov [ptmp+32], edi __asm vmovaps xmm0, XMMWORD PTR [edi + ecx] // v2i
		movzx edi, WORD PTR [eax+24] __asm shl edi, 4 __asm mov [ptmp+36], edi __asm vmovaps xmm1, XMMWORD PTR [edi + ecx] // v6i
		movzx edi, WORD PTR [eax+10] __asm shl edi, 4 __asm mov [ptmp+40], edi __asm vmovaps xmm2, XMMWORD PTR [edi + ecx] // v2j
		movzx edi, WORD PTR [eax+26] __asm shl edi, 4 __asm mov [ptmp+44], edi __asm vmovaps xmm3, XMMWORD PTR [edi + ecx] // v6j
		movzx edi, WORD PTR [eax+12] __asm shl edi, 4 __asm mov [ptmp+48], edi __asm vmovaps xmm4, XMMWORD PTR [edi + ecx] // v3i
		movzx edi, WORD PTR [eax+28] __asm shl edi, 4 __asm mov [ptmp+52], edi __asm vmovaps xmm5, XMMWORD PTR [edi + ecx] // v7i
		movzx edi, WORD PTR [eax+14] __asm shl edi, 4 __asm mov [ptmp+56], edi __asm vmovaps xmm6, XMMWORD PTR [edi + ecx] // v3j
		movzx edi, WORD PTR [eax+30] __asm shl edi, 4 __asm mov [ptmp+60], edi __asm vmovaps xmm7, XMMWORD PTR [edi + ecx] // v7j

		vinsertf128 ymm0, ymm0, xmm1, 1 __asm vmovaps YMMWORD PTR [vtmp+128], ymm0 // v26i
		vinsertf128 ymm2, ymm2, xmm3, 1 __asm vmovaps YMMWORD PTR [vtmp+160], ymm2 // v26j
		vinsertf128 ymm4, ymm4, xmm5, 1 __asm vmovaps YMMWORD PTR [vtmp+192], ymm4 // v37i
		vinsertf128 ymm6, ymm6, xmm7, 1 __asm vmovaps YMMWORD PTR [vtmp+224], ymm6 // v37j

		vmovaps ymm7, sMinusOneXYZOneW
		vmulps ymm2, ymm2, ymm7 __asm vaddps ymm2, ymm0, ymm2 __asm vmovaps YMMWORD PTR [htmp+64], ymm2 // h26ij
		vmulps ymm6, ymm6, ymm7 __asm vaddps ymm6, ymm4, ymm6 __asm vmovaps YMMWORD PTR [htmp+96], ymm6 // h37ij

		vmovaps ymm0, YMMWORD PTR [htmp   ] // h04ij
		vmovaps ymm4, YMMWORD PTR [htmp+32] // h15ij

		vunpcklps ymm1, ymm0, ymm2 // a
		vunpckhps ymm3, ymm0, ymm2 // b
		vunpcklps ymm5, ymm4, ymm6 // c
		vunpckhps ymm7, ymm4, ymm6 // d

		vunpcklps ymm0, ymm1, ymm5 // hxij
		vunpckhps ymm2, ymm1, ymm5 // hyij
		vunpcklps ymm4, ymm3, ymm7 // hzij
		vunpckhps ymm6, ymm3, ymm7 // vwij

		vmovaps ymm7, sEpsilon
		vmovaps ymm5, sOne
		vmovaps ymm3, stiffness
		vmovaps ymm1, YMMWORD PTR [edx] // rij

		vmulps ymm0, ymm0, ymm0 __asm vaddps ymm0, ymm0, ymm7 // e2ij
		vmulps ymm2, ymm2, ymm2 __asm vaddps ymm0, ymm0, ymm2
		vmulps ymm4, ymm4, ymm4 __asm vaddps ymm0, ymm0, ymm4

		vcmpgt_oqps ymm2, ymm1, ymm7 // mask
		vrsqrtps ymm0, ymm0 __asm vmulps ymm0, ymm0, ymm1 // erij
		vsubps ymm5, ymm5, ymm0 __asm vandps ymm5, ymm5, ymm2
		vaddps ymm6, ymm6, ymm7 __asm vrcpps ymm6, ymm6

		vmovaps ymm0, stretchLimit // multiplier block
		vmovaps ymm1, compressionLimit
		vmovaps ymm2, multiplier
		vminps ymm0, ymm0, ymm5
		vmaxps ymm1, ymm1, ymm0
		vmulps ymm2, ymm2, ymm1
		vsubps ymm5, ymm5, ymm2

		vmulps ymm6, ymm6, ymm3 __asm vmulps ymm6, ymm6, ymm5 // exij

		vmovaps ymm7, sMaskXY
		vandps ymm7, ymm7, ymm6 // exlo
		vxorps ymm6, ymm6, ymm7 // exhi

		vmovaps ymm4, YMMWORD PTR [htmp   ] // h04ij
		vmovaps ymm0, YMMWORD PTR [vtmp   ] // v04i
		vmovaps ymm1, YMMWORD PTR [vtmp+ 32] // v04j

		vpermilps ymm5, ymm7, 0xc0 __asm vmulps ymm4, ymm4, ymm5 // f04ij
		vpermilps ymm2, ymm0, 0xff __asm vmulps ymm2, ymm2, ymm4 __asm vsubps ymm0, ymm0, ymm2 // u04i
		vpermilps ymm3, ymm1, 0xff __asm vmulps ymm3, ymm3, ymm4 __asm vaddps ymm1, ymm1, ymm3 // u04j

		vextractf128 xmm2, ymm0, 1
		vextractf128 xmm3, ymm1, 1

		mov edi, [ptmp   ] __asm vmovaps XMMWORD PTR [edi + ecx], xmm0 // v0i
		mov edi, [ptmp+ 8] __asm vmovaps XMMWORD PTR [edi + ecx], xmm1 // v0j
		mov edi, [ptmp+ 4] __asm vmovaps XMMWORD PTR [edi + ecx], xmm2 // v4i
		mov edi, [ptmp+12] __asm vmovaps XMMWORD PTR [edi + ecx], xmm3 // v4j

		vmovaps ymm4, YMMWORD PTR [htmp+ 32] // h15ij
		vmovaps ymm0, YMMWORD PTR [vtmp+ 64] // v15i
		vmovaps ymm1, YMMWORD PTR [vtmp+ 96] // v15j

		vpermilps ymm5, ymm7, 0xd5 __asm vmulps ymm4, ymm4, ymm5 // f15ij
		vpermilps ymm2, ymm0, 0xff __asm vmulps ymm2, ymm2, ymm4 __asm vsubps ymm0, ymm0, ymm2 // u15i
		vpermilps ymm3, ymm1, 0xff __asm vmulps ymm3, ymm3, ymm4 __asm vaddps ymm1, ymm1, ymm3 // u15j

		vextractf128 xmm2, ymm0, 1
		vextractf128 xmm3, ymm1, 1

		mov edi, [ptmp+16] __asm vmovaps XMMWORD PTR [edi + ecx], xmm0 // v1i
		mov edi, [ptmp+24] __asm vmovaps XMMWORD PTR [edi + ecx], xmm1 // v1j
		mov edi, [ptmp+20] __asm vmovaps XMMWORD PTR [edi + ecx], xmm2 // v5i
		mov edi, [ptmp+28] __asm vmovaps XMMWORD PTR [edi + ecx], xmm3 // v5j

		vmovaps ymm4, YMMWORD PTR [htmp+ 64] // h26ij
		vmovaps ymm0, YMMWORD PTR [vtmp+128] // v26i
		vmovaps ymm1, YMMWORD PTR [vtmp+160] // v26j

		vpermilps ymm5, ymm6, 0x2a __asm vmulps ymm4, ymm4, ymm5 // f26ij
		vpermilps ymm2, ymm0, 0xff __asm vmulps ymm2, ymm2, ymm4 __asm vsubps ymm0, ymm0, ymm2 // u26i
		vpermilps ymm3, ymm1, 0xff __asm vmulps ymm3, ymm3, ymm4 __asm vaddps ymm1, ymm1, ymm3 // u26j

		vextractf128 xmm2, ymm0, 1
		vextractf128 xmm3, ymm1, 1

		mov edi, [ptmp+32] __asm vmovaps XMMWORD PTR [edi + ecx], xmm0 // v2i
		mov edi, [ptmp+40] __asm vmovaps XMMWORD PTR [edi + ecx], xmm1 // v2j
		mov edi, [ptmp+36] __asm vmovaps XMMWORD PTR [edi + ecx], xmm2 // v6i
		mov edi, [ptmp+44] __asm vmovaps XMMWORD PTR [edi + ecx], xmm3 // v6j

		vmovaps ymm4, YMMWORD PTR [htmp+ 96] // h37ij
		vmovaps ymm0, YMMWORD PTR [vtmp+192] // v37i
		vmovaps ymm1, YMMWORD PTR [vtmp+224] // v37j

		vpermilps ymm5, ymm6, 0x3f __asm vmulps ymm4, ymm4, ymm5 // f37ij
		vpermilps ymm2, ymm0, 0xff __asm vmulps ymm2, ymm2, ymm4 __asm vsubps ymm0, ymm0, ymm2 // u37i
		vpermilps ymm3, ymm1, 0xff __asm vmulps ymm3, ymm3, ymm4 __asm vaddps ymm1, ymm1, ymm3 // u37j

		vextractf128 xmm2, ymm0, 1
		vextractf128 xmm3, ymm1, 1

		mov edi, [ptmp+48] __asm vmovaps XMMWORD PTR [edi + ecx], xmm0 // v3i
		mov edi, [ptmp+56] __asm vmovaps XMMWORD PTR [edi + ecx], xmm1 // v3j
		mov edi, [ptmp+52] __asm vmovaps XMMWORD PTR [edi + ecx], xmm2 // v7i
		mov edi, [ptmp+60] __asm vmovaps XMMWORD PTR [edi + ecx], xmm3 // v7j

		add eax, 32
		add edx, 32

		cmp edx, esi
		jb forBegin
forEnd:
	}

	_mm256_zeroupper();
}

#if _MSC_VER >= 1700
// AVX2 without useMultiplier
// Same structure as the AVX version, but uses FMA3 (vfmadd213ps/vfnmadd231ps)
// to fuse the multiply/add pairs; requires a VS2012+ assembler.
template <>
void solveConstraints<false, 2>(float* __restrict posIt, const float* __restrict rIt,
                                const float* __restrict rEnd, const uint16_t* __restrict iIt, const __m128& stiffnessRef)
{
	__m256 stiffness = _mm256_broadcast_ss((const float*)&stiffnessRef);

	__m256 vtmp[8], htmp[4];
	float* ptmp[16];

	__asm
	{
		mov edx, rIt
		mov esi, rEnd

		cmp edx, esi
		jae forEnd

		mov eax, iIt
		mov ecx, posIt

forBegin:
		movzx edi, WORD PTR [eax   ] __asm shl edi, 4 __asm mov [ptmp   ], edi __asm vmovaps xmm0, XMMWORD PTR [edi + ecx] // v0i
		movzx edi, WORD PTR [eax+16] __asm shl edi, 4 __asm mov [ptmp+ 4], edi __asm vmovaps xmm1, XMMWORD PTR [edi + ecx] // v4i
		movzx edi, WORD PTR [eax+ 2] __asm shl edi, 4 __asm mov [ptmp+ 8], edi __asm vmovaps xmm2, XMMWORD PTR [edi + ecx] // v0j
		movzx edi, WORD PTR [eax+18] __asm shl edi, 4 __asm mov [ptmp+12], edi __asm vmovaps xmm3, XMMWORD PTR [edi + ecx] // v4j
		movzx edi, WORD PTR [eax+ 4] __asm shl edi, 4 __asm mov [ptmp+16], edi __asm vmovaps xmm4, XMMWORD PTR [edi + ecx] // v1i
		movzx edi, WORD PTR [eax+20] __asm shl edi, 4 __asm mov [ptmp+20], edi __asm vmovaps xmm5, XMMWORD PTR [edi + ecx] // v5i
		movzx edi, WORD PTR [eax+ 6] __asm shl edi, 4 __asm mov [ptmp+24], edi __asm vmovaps xmm6, XMMWORD PTR [edi + ecx] // v1j
		movzx edi, WORD PTR [eax+22] __asm shl edi, 4 __asm mov [ptmp+28], edi __asm vmovaps xmm7, XMMWORD PTR [edi + ecx] // v5j

		vinsertf128 ymm0, ymm0, xmm1, 1 __asm vmovaps YMMWORD PTR [vtmp    ], ymm0 // v04i
		vinsertf128 ymm2, ymm2, xmm3, 1 __asm vmovaps YMMWORD PTR [vtmp+ 32], ymm2 // v04j
		vinsertf128 ymm4, ymm4, xmm5, 1 __asm vmovaps YMMWORD PTR [vtmp+ 64], ymm4 // v15i
		vinsertf128 ymm6, ymm6, xmm7, 1 __asm vmovaps YMMWORD PTR [vtmp+ 96], ymm6 // v15j

		vmovaps ymm7, sMinusOneXYZOneW
		vfmadd213ps ymm2, ymm7, ymm0 __asm vmovaps YMMWORD PTR [htmp   ], ymm2 // h04ij
		vfmadd213ps ymm6, ymm7, ymm4 __asm vmovaps YMMWORD PTR [htmp+32], ymm6 // h15ij

		movzx edi, WORD PTR [eax+ 8] __asm shl edi, 4 __asm mov [ptmp+32], edi __asm vmovaps xmm0, XMMWORD PTR [edi + ecx] // v2i
		movzx edi, WORD PTR [eax+24] __asm shl edi, 4 __asm mov [ptmp+36], edi __asm vmovaps xmm1, XMMWORD PTR [edi + ecx] // v6i
		movzx edi, WORD PTR [eax+10] __asm shl edi, 4 __asm mov [ptmp+40], edi __asm vmovaps xmm2, XMMWORD PTR [edi + ecx] // v2j
		movzx edi, WORD PTR [eax+26] __asm shl edi, 4 __asm mov [ptmp+44], edi __asm vmovaps xmm3, XMMWORD PTR [edi + ecx] // v6j
		movzx edi, WORD PTR [eax+12] __asm shl edi, 4 __asm mov [ptmp+48], edi __asm vmovaps xmm4, XMMWORD PTR [edi + ecx] // v3i
		movzx edi, WORD PTR [eax+28] __asm shl edi, 4 __asm mov [ptmp+52], edi __asm vmovaps xmm5, XMMWORD PTR [edi + ecx] // v7i
		movzx edi, WORD PTR [eax+14] __asm shl edi, 4 __asm mov [ptmp+56], edi __asm vmovaps xmm6, XMMWORD PTR [edi + ecx] // v3j
		movzx edi, WORD PTR [eax+30] __asm shl edi, 4 __asm mov [ptmp+60], edi __asm vmovaps xmm7, XMMWORD PTR [edi + ecx] // v7j

		vinsertf128 ymm0, ymm0, xmm1, 1 __asm vmovaps YMMWORD PTR [vtmp+128], ymm0 // v26i
		vinsertf128 ymm2, ymm2, xmm3, 1 __asm vmovaps YMMWORD PTR [vtmp+160], ymm2 // v26j
		vinsertf128 ymm4, ymm4, xmm5, 1 __asm vmovaps YMMWORD PTR [vtmp+192], ymm4 // v37i
		vinsertf128 ymm6, ymm6, xmm7, 1 __asm vmovaps YMMWORD PTR [vtmp+224], ymm6 // v37j

		vmovaps ymm7, sMinusOneXYZOneW
		vfmadd213ps ymm2, ymm7, ymm0 __asm vmovaps YMMWORD PTR [htmp+64], ymm2 // h26ij
		vfmadd213ps ymm6, ymm7, ymm4 __asm vmovaps YMMWORD PTR [htmp+96], ymm6 // h37ij

		vmovaps ymm0, YMMWORD PTR [htmp   ] // h04ij
		vmovaps ymm4, YMMWORD PTR [htmp+32] // h15ij

		vunpcklps ymm1, ymm0, ymm2 // a
		vunpckhps ymm3, ymm0, ymm2 // b
		vunpcklps ymm5, ymm4, ymm6 // c
		vunpckhps ymm7, ymm4, ymm6 // d

		vunpcklps ymm0, ymm1, ymm5 // hxij
		vunpckhps ymm2, ymm1, ymm5 // hyij
		vunpcklps ymm4, ymm3, ymm7 // hzij
		vunpckhps ymm6, ymm3, ymm7 // vwij

		vmovaps ymm7, sEpsilon
		vmovaps ymm5, sOne
		vmovaps ymm3, stiffness
		vmovaps ymm1, YMMWORD PTR [edx] // rij

		vfmadd213ps ymm4, ymm4, ymm7 // e2ij
		vfmadd213ps ymm2, ymm2, ymm4
		vfmadd213ps ymm0, ymm0, ymm2

		vcmpgt_oqps ymm2, ymm1, ymm7 // mask
		vrsqrtps ymm0, ymm0 __asm vfnmadd231ps ymm5, ymm0, ymm1 // erij
		vandps ymm5, ymm5, ymm2
		vaddps ymm6, ymm6, ymm7 __asm vrcpps ymm6, ymm6

		vmulps ymm6, ymm6, ymm3 __asm vmulps ymm6, ymm6, ymm5 // exij

		vmovaps ymm7, sMaskXY
		vandps ymm7, ymm7, ymm6 // exlo
		vxorps ymm6, ymm6, ymm7 // exhi

		vmovaps ymm4, YMMWORD PTR [htmp   ] // h04ij
		vmovaps ymm0, YMMWORD PTR [vtmp   ] // v04i
		vmovaps ymm1, YMMWORD PTR [vtmp+ 32] // v04j

		vpermilps ymm5, ymm7, 0xc0 __asm vmulps ymm4, ymm4, ymm5 // f04ij
		vpermilps ymm2, ymm0, 0xff __asm vfnmadd231ps ymm0, ymm2, ymm4 // u04i
		vpermilps ymm3, ymm1, 0xff __asm vfmadd231ps ymm1, ymm3, ymm4 // u04j

		vextractf128 xmm2, ymm0, 1
		vextractf128 xmm3, ymm1, 1

		mov edi, [ptmp   ] __asm vmovaps XMMWORD PTR [edi + ecx], xmm0 // v0i
		mov edi, [ptmp+ 8] __asm vmovaps XMMWORD PTR [edi + ecx], xmm1 // v0j
		mov edi, [ptmp+ 4] __asm vmovaps XMMWORD PTR [edi + ecx], xmm2 // v4i
		mov edi, [ptmp+12] __asm vmovaps XMMWORD PTR [edi + ecx], xmm3 // v4j

		vmovaps ymm4, YMMWORD PTR [htmp+ 32] // h15ij
		vmovaps ymm0, YMMWORD PTR [vtmp+ 64] // v15i
		vmovaps ymm1, YMMWORD PTR [vtmp+ 96] // v15j

		vpermilps ymm5, ymm7, 0xd5 __asm vmulps ymm4, ymm4, ymm5 // f15ij
		vpermilps ymm2, ymm0, 0xff __asm vfnmadd231ps ymm0, ymm2, ymm4 // u15i
		vpermilps ymm3, ymm1, 0xff __asm vfmadd231ps ymm1, ymm3, ymm4 // u15j

		vextractf128 xmm2, ymm0, 1
		vextractf128 xmm3, ymm1, 1

		mov edi, [ptmp+16] __asm vmovaps XMMWORD PTR [edi + ecx], xmm0 // v1i
		mov edi, [ptmp+24] __asm vmovaps XMMWORD PTR [edi + ecx], xmm1 // v1j
		mov edi, [ptmp+20] __asm vmovaps XMMWORD PTR [edi + ecx], xmm2 // v5i
		mov edi, [ptmp+28] __asm vmovaps XMMWORD PTR [edi + ecx], xmm3 // v5j

		vmovaps ymm4, YMMWORD PTR [htmp+ 64] // h26ij
		vmovaps ymm0, YMMWORD PTR [vtmp+128] // v26i
		vmovaps ymm1, YMMWORD PTR [vtmp+160] // v26j

		vpermilps ymm5, ymm6, 0x2a __asm vmulps ymm4, ymm4, ymm5 // f26ij
		vpermilps ymm2, ymm0, 0xff __asm vfnmadd231ps ymm0, ymm2, ymm4 // u26i
		vpermilps ymm3, ymm1, 0xff __asm vfmadd231ps ymm1, ymm3, ymm4 // u26j

		vextractf128 xmm2, ymm0, 1
		vextractf128 xmm3, ymm1, 1

		mov edi, [ptmp+32] __asm vmovaps XMMWORD PTR [edi + ecx], xmm0 // v2i
		mov edi, [ptmp+40] __asm vmovaps XMMWORD PTR [edi + ecx], xmm1 // v2j
		mov edi, [ptmp+36] __asm vmovaps XMMWORD PTR [edi + ecx], xmm2 // v6i
		mov edi, [ptmp+44] __asm vmovaps XMMWORD PTR [edi + ecx], xmm3 // v6j

		vmovaps ymm4, YMMWORD PTR [htmp+ 96] // h37ij
		vmovaps ymm0, YMMWORD PTR [vtmp+192] // v37i
		vmovaps ymm1, YMMWORD PTR [vtmp+224] // v37j

		vpermilps ymm5, ymm6, 0x3f __asm vmulps ymm4, ymm4, ymm5 // f37ij
		vpermilps ymm2, ymm0, 0xff __asm vfnmadd231ps ymm0, ymm2, ymm4 // u37i
		vpermilps ymm3, ymm1, 0xff __asm vfmadd231ps ymm1, ymm3, ymm4 // u37j

		vextractf128 xmm2, ymm0, 1
		vextractf128 xmm3, ymm1, 1

		mov edi, [ptmp+48] __asm vmovaps XMMWORD PTR [edi + ecx], xmm0 // v3i
		mov edi, [ptmp+56] __asm vmovaps XMMWORD PTR [edi + ecx], xmm1 // v3j
		mov edi, [ptmp+52] __asm vmovaps XMMWORD PTR [edi + ecx], xmm2 // v7i
		mov edi, [ptmp+60] __asm vmovaps XMMWORD PTR [edi + ecx], xmm3 // v7j

		add eax, 32
		add edx, 32

		cmp edx, esi
		jb forBegin
forEnd:
	}

	_mm256_zeroupper();
}

// AVX2 with useMultiplier
// FMA3 version with limit clamping; the multiplier block uses vfnmadd231ps to
// fuse the multiply/subtract of the AVX version.
template <>
void solveConstraints<true, 2>(float* __restrict posIt, const float* __restrict rIt,
                               const float* __restrict rEnd, const uint16_t* __restrict iIt, const __m128& stiffnessRef)
{
	__m256 stiffness = _mm256_broadcast_ps(&stiffnessRef);
	__m256 stretchLimit = _mm256_permute_ps(stiffness, 0xff);
	__m256 compressionLimit = _mm256_permute_ps(stiffness, 0xaa);
	__m256 multiplier = _mm256_permute_ps(stiffness, 0x55);
	stiffness = _mm256_permute_ps(stiffness, 0x00);

	__m256 vtmp[8], htmp[4];
	float* ptmp[16];

	__asm
	{
		mov edx, rIt
		mov esi, rEnd

		cmp edx, esi
		jae forEnd

		mov eax, iIt
		mov ecx, posIt

forBegin:
		movzx edi, WORD PTR [eax   ] __asm shl edi, 4 __asm mov [ptmp   ], edi __asm vmovaps xmm0, XMMWORD PTR [edi + ecx] // v0i
		movzx edi, WORD PTR [eax+16] __asm shl edi, 4 __asm mov [ptmp+ 4], edi __asm vmovaps xmm1, XMMWORD PTR [edi + ecx] // v4i
		movzx edi, WORD PTR [eax+ 2] __asm shl edi, 4 __asm mov [ptmp+ 8], edi __asm vmovaps xmm2, XMMWORD PTR [edi + ecx] // v0j
		movzx edi, WORD PTR [eax+18] __asm shl edi, 4 __asm mov [ptmp+12], edi __asm vmovaps xmm3, XMMWORD PTR [edi + ecx] // v4j
		movzx edi, WORD PTR [eax+ 4] __asm shl edi, 4 __asm mov [ptmp+16], edi __asm vmovaps xmm4, XMMWORD PTR [edi + ecx] // v1i
		movzx edi, WORD PTR [eax+20] __asm shl edi, 4 __asm mov [ptmp+20], edi __asm vmovaps xmm5, XMMWORD PTR [edi + ecx] // v5i
		movzx edi, WORD PTR [eax+ 6] __asm shl edi, 4 __asm mov [ptmp+24], edi __asm vmovaps xmm6, XMMWORD PTR [edi + ecx] // v1j
		movzx edi, WORD PTR [eax+22] __asm shl edi, 4 __asm mov [ptmp+28], edi __asm vmovaps xmm7, XMMWORD PTR [edi + ecx] // v5j

		vinsertf128 ymm0, ymm0, xmm1, 1 __asm vmovaps YMMWORD PTR [vtmp    ], ymm0 // v04i
		vinsertf128 ymm2, ymm2, xmm3, 1 __asm vmovaps YMMWORD PTR [vtmp+ 32], ymm2 // v04j
		vinsertf128 ymm4, ymm4, xmm5, 1 __asm vmovaps YMMWORD PTR [vtmp+ 64], ymm4 // v15i
		vinsertf128 ymm6, ymm6, xmm7, 1 __asm vmovaps YMMWORD PTR [vtmp+ 96], ymm6 // v15j

		vmovaps ymm7, sMinusOneXYZOneW
		vfmadd213ps ymm2, ymm7, ymm0 __asm vmovaps YMMWORD PTR [htmp   ], ymm2 // h04ij
		vfmadd213ps ymm6, ymm7, ymm4 __asm vmovaps YMMWORD PTR [htmp+32], ymm6 // h15ij

		movzx edi, WORD PTR [eax+ 8] __asm shl edi, 4 __asm mov [ptmp+32], edi __asm vmovaps xmm0, XMMWORD PTR [edi + ecx] // v2i
		movzx edi, WORD PTR [eax+24] __asm shl edi, 4 __asm mov [ptmp+36], edi __asm vmovaps xmm1, XMMWORD PTR [edi + ecx] // v6i
		movzx edi, WORD PTR [eax+10] __asm shl edi, 4 __asm mov [ptmp+40], edi __asm vmovaps xmm2, XMMWORD PTR [edi + ecx] // v2j
		movzx edi, WORD PTR [eax+26] __asm shl edi, 4 __asm mov [ptmp+44], edi __asm vmovaps xmm3, XMMWORD PTR [edi + ecx] // v6j
		movzx edi, WORD PTR [eax+12] __asm shl edi, 4 __asm mov [ptmp+48], edi __asm vmovaps xmm4, XMMWORD PTR [edi + ecx] // v3i
		movzx edi, WORD PTR [eax+28] __asm shl edi, 4 __asm mov [ptmp+52], edi __asm vmovaps xmm5, XMMWORD PTR [edi + ecx] // v7i
		movzx edi, WORD PTR [eax+14] __asm shl edi, 4 __asm mov [ptmp+56], edi __asm vmovaps xmm6, XMMWORD PTR [edi + ecx] // v3j
		movzx edi, WORD PTR [eax+30] __asm shl edi, 4 __asm mov [ptmp+60], edi __asm vmovaps xmm7, XMMWORD PTR [edi + ecx] // v7j

		vinsertf128 ymm0, ymm0, xmm1, 1 __asm vmovaps YMMWORD PTR [vtmp+128], ymm0 // v26i
		vinsertf128 ymm2, ymm2, xmm3, 1 __asm vmovaps YMMWORD PTR [vtmp+160], ymm2 // v26j
		vinsertf128 ymm4, ymm4, xmm5, 1 __asm vmovaps YMMWORD PTR [vtmp+192], ymm4 // v37i
		vinsertf128 ymm6, ymm6, xmm7, 1 __asm vmovaps YMMWORD PTR [vtmp+224], ymm6 // v37j

		vmovaps ymm7, sMinusOneXYZOneW
		vfmadd213ps ymm2, ymm7, ymm0 __asm vmovaps YMMWORD PTR [htmp+64], ymm2 // h26ij
		vfmadd213ps ymm6, ymm7, ymm4 __asm vmovaps YMMWORD PTR [htmp+96], ymm6 // h37ij

		vmovaps ymm0, YMMWORD PTR [htmp   ] // h04ij
		vmovaps ymm4, YMMWORD PTR [htmp+32] // h15ij

		vunpcklps ymm1, ymm0, ymm2 // a
		vunpckhps ymm3, ymm0, ymm2 // b
		vunpcklps ymm5, ymm4, ymm6 // c
		vunpckhps ymm7, ymm4, ymm6 // d

		vunpcklps ymm0, ymm1, ymm5 // hxij
		vunpckhps ymm2, ymm1, ymm5 // hyij
		vunpcklps ymm4, ymm3, ymm7 // hzij
		vunpckhps ymm6, ymm3, ymm7 // vwij

		vmovaps ymm7, sEpsilon
		vmovaps ymm5, sOne
		vmovaps ymm3, stiffness
		vmovaps ymm1, YMMWORD PTR [edx] // rij

		vfmadd213ps ymm4, ymm4, ymm7 // e2ij
		vfmadd213ps ymm2, ymm2, ymm4
		vfmadd213ps ymm0, ymm0, ymm2

		vcmpgt_oqps ymm2, ymm1, ymm7 // mask
		vrsqrtps ymm0, ymm0 __asm vfnmadd231ps ymm5, ymm0, ymm1 // erij
		vandps ymm5, ymm5, ymm2
		vaddps ymm6, ymm6, ymm7 __asm vrcpps ymm6, ymm6

		vmovaps ymm0, stretchLimit // multiplier block
		vmovaps ymm1, compressionLimit
		vmovaps ymm2, multiplier
		vminps ymm0, ymm0, ymm5
		vmaxps ymm1, ymm1, ymm0
		vfnmadd231ps ymm5, ymm1, ymm2

		vmulps ymm6, ymm6, ymm3 __asm vmulps ymm6, ymm6, ymm5 // exij

		vmovaps ymm7, sMaskXY
		vandps ymm7, ymm7, ymm6 // exlo
		vxorps ymm6, ymm6, ymm7 // exhi

		vmovaps ymm4, YMMWORD PTR [htmp   ] // h04ij
		vmovaps ymm0, YMMWORD PTR [vtmp   ] // v04i
		vmovaps ymm1, YMMWORD PTR [vtmp+ 32] // v04j

		vpermilps ymm5, ymm7, 0xc0 __asm vmulps ymm4, ymm4, ymm5 // f04ij
		vpermilps ymm2, ymm0, 0xff __asm vfnmadd231ps ymm0, ymm2, ymm4 // u04i
		vpermilps ymm3, ymm1, 0xff __asm vfmadd231ps ymm1, ymm3, ymm4 // u04j

		vextractf128 xmm2, ymm0, 1
		vextractf128 xmm3, ymm1, 1

		mov edi, [ptmp   ] __asm vmovaps XMMWORD PTR [edi + ecx], xmm0 // v0i
		mov edi, [ptmp+ 8] __asm vmovaps XMMWORD PTR [edi + ecx], xmm1 // v0j
		mov edi, [ptmp+ 4] __asm vmovaps XMMWORD PTR [edi + ecx], xmm2 // v4i
		mov edi, [ptmp+12] __asm vmovaps XMMWORD PTR [edi + ecx], xmm3 // v4j

		vmovaps ymm4, YMMWORD PTR [htmp+ 32] // h15ij
		vmovaps ymm0, YMMWORD PTR [vtmp+ 64] // v15i
		vmovaps ymm1, YMMWORD PTR [vtmp+ 96] // v15j

		vpermilps ymm5, ymm7, 0xd5 __asm vmulps ymm4, ymm4, ymm5 // f15ij
		vpermilps ymm2, ymm0, 0xff __asm vfnmadd231ps ymm0, ymm2, ymm4 // u15i
		vpermilps ymm3, ymm1, 0xff __asm vfmadd231ps ymm1, ymm3, ymm4 // u15j

		vextractf128 xmm2, ymm0, 1
		vextractf128 xmm3, ymm1, 1

		mov edi, [ptmp+16] __asm vmovaps XMMWORD PTR [edi + ecx], xmm0 // v1i
		mov edi, [ptmp+24] __asm vmovaps XMMWORD PTR [edi + ecx], xmm1 // v1j
		mov edi, [ptmp+20] __asm vmovaps XMMWORD PTR [edi + ecx], xmm2 // v5i
		mov edi, [ptmp+28] __asm vmovaps XMMWORD PTR [edi + ecx], xmm3 // v5j

		vmovaps ymm4, YMMWORD PTR [htmp+ 64] // h26ij
		vmovaps ymm0, YMMWORD PTR [vtmp+128] // v26i
		vmovaps ymm1, YMMWORD PTR [vtmp+160] // v26j

		vpermilps ymm5, ymm6, 0x2a __asm vmulps ymm4, ymm4, ymm5 // f26ij
		vpermilps ymm2, ymm0, 0xff __asm vfnmadd231ps ymm0, ymm2, ymm4 // u26i
		vpermilps ymm3, ymm1, 0xff __asm vfmadd231ps ymm1, ymm3, ymm4 // u26j

		vextractf128 xmm2, ymm0, 1
		vextractf128 xmm3, ymm1, 1

		mov edi, [ptmp+32] __asm vmovaps XMMWORD PTR [edi + ecx], xmm0 // v2i
		mov edi, [ptmp+40] __asm vmovaps XMMWORD PTR [edi + ecx], xmm1 // v2j
		mov edi, [ptmp+36] __asm vmovaps XMMWORD PTR [edi + ecx], xmm2 // v6i
		mov edi, [ptmp+44] __asm vmovaps XMMWORD PTR [edi + ecx], xmm3 // v6j

		vmovaps ymm4, YMMWORD PTR [htmp+ 96] // h37ij
		vmovaps ymm0, YMMWORD PTR [vtmp+192] // v37i
		vmovaps ymm1, YMMWORD PTR [vtmp+224] // v37j

		vpermilps ymm5, ymm6, 0x3f __asm vmulps ymm4, ymm4, ymm5 // f37ij
		vpermilps ymm2, ymm0, 0xff __asm vfnmadd231ps ymm0, ymm2, ymm4 // u37i
		vpermilps ymm3, ymm1, 0xff __asm vfmadd231ps ymm1, ymm3, ymm4 // u37j

		vextractf128 xmm2, ymm0, 1
		vextractf128 xmm3, ymm1, 1

		mov edi, [ptmp+48] __asm vmovaps XMMWORD PTR [edi + ecx], xmm0 // v3i
		mov edi, [ptmp+56] __asm vmovaps XMMWORD PTR [edi + ecx], xmm1 // v3j
		mov edi, [ptmp+52] __asm vmovaps XMMWORD PTR [edi + ecx], xmm2 // v7i
		mov edi, [ptmp+60] __asm vmovaps XMMWORD PTR [edi + ecx], xmm3 // v7j

		add eax, 32
		add edx, 32

		cmp edx, esi
		jb forBegin
forEnd:
	}

	_mm256_zeroupper();
}
#endif // _MSC_VER >= 1700

// clang-format:enable

#else // _M_IX86

// On non-x86 (e.g. x64) builds inline assembly is unavailable, so the generic
// intrinsics template above is explicitly instantiated for all four variants.
template void solveConstraints<false, 1>(float* __restrict, const float* __restrict, const float* __restrict,
                                         const uint16_t* __restrict, const __m128&);

template void solveConstraints<true, 1>(float* __restrict, const float* __restrict, const float* __restrict,
                                        const uint16_t* __restrict, const __m128&);

template void solveConstraints<false, 2>(float* __restrict, const float* __restrict, const float* __restrict,
                                         const uint16_t* __restrict, const __m128&);

template void solveConstraints<true, 2>(float* __restrict, const float* __restrict, const float* __restrict,
                                        const uint16_t* __restrict, const __m128&);

#endif // _M_IX86
+ +} // namespace avx diff --git a/PhysX_3.4/Source/LowLevelCloth/src/neon/NeonCollision.cpp b/PhysX_3.4/Source/LowLevelCloth/src/neon/NeonCollision.cpp new file mode 100644 index 00000000..1ecaf277 --- /dev/null +++ b/PhysX_3.4/Source/LowLevelCloth/src/neon/NeonCollision.cpp @@ -0,0 +1,34 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. 
All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#ifndef __ARM_NEON__ +#error This file needs to be compiled with NEON support! +#endif + +#include "SwCollision.cpp" diff --git a/PhysX_3.4/Source/LowLevelCloth/src/neon/NeonSelfCollision.cpp b/PhysX_3.4/Source/LowLevelCloth/src/neon/NeonSelfCollision.cpp new file mode 100644 index 00000000..1a652711 --- /dev/null +++ b/PhysX_3.4/Source/LowLevelCloth/src/neon/NeonSelfCollision.cpp @@ -0,0 +1,34 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. 
+// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#ifndef __ARM_NEON__ +#error This file needs to be compiled with NEON support! +#endif + +#include "SwSelfCollision.cpp" diff --git a/PhysX_3.4/Source/LowLevelCloth/src/neon/NeonSolverKernel.cpp b/PhysX_3.4/Source/LowLevelCloth/src/neon/NeonSolverKernel.cpp new file mode 100644 index 00000000..fa193fc2 --- /dev/null +++ b/PhysX_3.4/Source/LowLevelCloth/src/neon/NeonSolverKernel.cpp @@ -0,0 +1,49 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. 
No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#ifndef __ARM_NEON__ +#error This file needs to be compiled with NEON support! +#endif + +#include "SwSolverKernel.cpp" + +#include <cpu-features.h> + +namespace physx +{ +namespace cloth +{ +bool neonSolverKernel(SwCloth const& cloth, SwClothData& data, SwKernelAllocator& allocator, + IterationStateFactory& factory, PxProfileZone* profileZone) +{ + return ANDROID_CPU_ARM_FEATURE_NEON & android_getCpuFeatures() && + (SwSolverKernel<Simd4f>(cloth, data, allocator, factory, profileZone)(), true); +} +} +} diff --git a/PhysX_3.4/Source/LowLevelCloth/src/neon/SwCollisionHelpers.h b/PhysX_3.4/Source/LowLevelCloth/src/neon/SwCollisionHelpers.h new file mode 100644 index 00000000..6f1b0f58 --- /dev/null +++ b/PhysX_3.4/Source/LowLevelCloth/src/neon/SwCollisionHelpers.h @@ -0,0 +1,87 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. 
+// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 
+ +#pragma once + +#ifdef _M_ARM +#include <arm_neon.h> +#endif + +namespace physx +{ +namespace cloth +{ + +uint32_t findBitSet(uint32_t mask) +{ +#ifdef _M_ARM + __n64 t = { mask }; + return 31 - (vclz_u32(t)).n64_u32[0]; +#else + return 31 - __builtin_clz(mask); +#endif +} + +Simd4i intFloor(const Simd4f& v) +{ + int32x4_t neg = vreinterpretq_s32_u32(vshrq_n_u32(v.u4, 31)); + return vsubq_s32(vcvtq_s32_f32(v.f4), neg); +} + +Simd4i horizontalOr(const Simd4i& mask) +{ + uint32x2_t hi = vget_high_u32(mask.u4); + uint32x2_t lo = vget_low_u32(mask.u4); + uint32x2_t tmp = vorr_u32(lo, hi); + uint32x2_t rev = vrev64_u32(tmp); + uint32x2_t res = vorr_u32(tmp, rev); + return vcombine_u32(res, res); +} + +Gather<Simd4i>::Gather(const Simd4i& index) +{ + PX_ALIGN(16, uint8x8x2_t) byteIndex = reinterpret_cast<const uint8x8x2_t&>(sPack); + uint8x8x2_t lohiIndex = reinterpret_cast<const uint8x8x2_t&>(index); + byteIndex.val[0] = vtbl2_u8(lohiIndex, byteIndex.val[0]); + byteIndex.val[1] = vtbl2_u8(lohiIndex, byteIndex.val[1]); + mPermute = vshlq_n_u32(reinterpret_cast<const uint32x4_t&>(byteIndex), 2); + mPermute = mPermute | sOffset | vcgtq_u32(index.u4, sMask.u4); +} + +Simd4i Gather<Simd4i>::operator()(const Simd4i* ptr) const +{ + PX_ALIGN(16, uint8x8x2_t) result = reinterpret_cast<const uint8x8x2_t&>(mPermute); + const uint8x8x4_t* table = reinterpret_cast<const uint8x8x4_t*>(ptr); + result.val[0] = vtbl4_u8(*table, result.val[0]); + result.val[1] = vtbl4_u8(*table, result.val[1]); + return reinterpret_cast<const Simd4i&>(result); +} + +} // namespace cloth +} // namespace physx diff --git a/PhysX_3.4/Source/LowLevelCloth/src/scalar/SwCollisionHelpers.h b/PhysX_3.4/Source/LowLevelCloth/src/scalar/SwCollisionHelpers.h new file mode 100644 index 00000000..a5a0075f --- /dev/null +++ b/PhysX_3.4/Source/LowLevelCloth/src/scalar/SwCollisionHelpers.h @@ -0,0 +1,92 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA 
software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 
+ +#pragma once + +namespace physx +{ +namespace cloth +{ + +#if !NV_SIMD_SIMD +uint32_t findBitSet(uint32_t mask) +{ + uint32_t result = 0; + while(mask >>= 1) + ++result; + return result; +} +#endif + +inline Scalar4i intFloor(const Scalar4f& v) +{ + return Scalar4i(int(floor(v.f4[0])), int(floor(v.f4[1])), int(floor(v.f4[2])), int(floor(v.f4[3]))); +} + +inline Scalar4i horizontalOr(const Scalar4i& mask) +{ + return simd4i(mask.i4[0] | mask.i4[1] | mask.i4[2] | mask.i4[3]); +} + +template <> +struct Gather<Scalar4i> +{ + inline Gather(const Scalar4i& index); + inline Scalar4i operator()(const Scalar4i*) const; + + Scalar4i mIndex; + Scalar4i mOutOfRange; +}; + +Gather<Scalar4i>::Gather(const Scalar4i& index) +{ + uint32_t mask = /* sGridSize */ 8 - 1; + + mIndex.u4[0] = index.u4[0] & mask; + mIndex.u4[1] = index.u4[1] & mask; + mIndex.u4[2] = index.u4[2] & mask; + mIndex.u4[3] = index.u4[3] & mask; + + mOutOfRange.i4[0] = index.u4[0] & ~mask ? 0 : -1; + mOutOfRange.i4[1] = index.u4[1] & ~mask ? 0 : -1; + mOutOfRange.i4[2] = index.u4[2] & ~mask ? 0 : -1; + mOutOfRange.i4[3] = index.u4[3] & ~mask ? 0 : -1; +} + +Scalar4i Gather<Scalar4i>::operator()(const Scalar4i* ptr) const +{ + const int32_t* base = ptr->i4; + const int32_t* index = mIndex.i4; + const int32_t* mask = mOutOfRange.i4; + return Scalar4i(base[index[0]] & mask[0], base[index[1]] & mask[1], base[index[2]] & mask[2], + base[index[3]] & mask[3]); +} + +} // namespace cloth +} // namespace physx diff --git a/PhysX_3.4/Source/LowLevelCloth/src/sse2/SwCollisionHelpers.h b/PhysX_3.4/Source/LowLevelCloth/src/sse2/SwCollisionHelpers.h new file mode 100644 index 00000000..85e33c3c --- /dev/null +++ b/PhysX_3.4/Source/LowLevelCloth/src/sse2/SwCollisionHelpers.h @@ -0,0 +1,92 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. 
+// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 
+ +#pragma once + +#if PX_GCC_FAMILY +#include <xmmintrin.h> // _BitScanForward +#else +#pragma warning(push) +#pragma warning(disable : 4668) //'symbol' is not defined as a preprocessor macro, replacing with '0' for 'directives' +#pragma warning(disable : 4987) // nonstandard extension used: 'throw (...)' +#include <intrin.h> // _BitScanForward +#pragma warning(pop) +#endif + +namespace physx +{ +namespace cloth +{ + +uint32_t findBitSet(uint32_t mask) +{ +#if defined(_MSC_VER) + unsigned long result; + _BitScanForward(&result, unsigned long(mask)); + return result; +#else + return __builtin_ffs(mask) - 1; +#endif +} + +Simd4i intFloor(const Simd4f& v) +{ + Simd4i i = _mm_cvttps_epi32(v); + return _mm_sub_epi32(i, _mm_srli_epi32(simd4i(v), 31)); +} + +Simd4i horizontalOr(const Simd4i& mask) +{ + Simd4i tmp = mask | _mm_shuffle_epi32(mask, 0xb1); // w z y x -> z w x y + return tmp | _mm_shuffle_epi32(tmp, 0x4e); // w z y x -> y x w z +} + +Gather<Simd4i>::Gather(const Simd4i& index) +{ + mSelectQ = _mm_srai_epi32(index << 29, 31); + mSelectD = _mm_srai_epi32(index << 30, 31); + mSelectW = _mm_srai_epi32(index << 31, 31); + mOutOfRange = (index ^ sIntSignBit) > sSignedMask; +} + +Simd4i Gather<Simd4i>::operator()(const Simd4i* ptr) const +{ + // more efficient with _mm_shuffle_epi8 (SSSE3) + Simd4i lo = ptr[0], hi = ptr[1]; + Simd4i m01 = select(mSelectW, splat<1>(lo), splat<0>(lo)); + Simd4i m23 = select(mSelectW, splat<3>(lo), splat<2>(lo)); + Simd4i m45 = select(mSelectW, splat<1>(hi), splat<0>(hi)); + Simd4i m67 = select(mSelectW, splat<3>(hi), splat<2>(hi)); + Simd4i m0123 = select(mSelectD, m23, m01); + Simd4i m4567 = select(mSelectD, m67, m45); + return select(mSelectQ, m4567, m0123) & ~mOutOfRange; +} + +} // namespace cloth +} // namespace physx diff --git a/PhysX_3.4/Source/LowLevelCloth/src/sse2/SwSolveConstraints.h b/PhysX_3.4/Source/LowLevelCloth/src/sse2/SwSolveConstraints.h new file mode 100644 index 00000000..cb141be5 --- /dev/null +++ 
b/PhysX_3.4/Source/LowLevelCloth/src/sse2/SwSolveConstraints.h @@ -0,0 +1,392 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 
+ +template <bool useMultiplier> +void solveConstraints(float* __restrict posIt, const float* __restrict rIt, const float* __restrict rEnd, + const uint16_t* __restrict iIt, __m128 stiffness) +{ + __m128 sOne = _mm_set1_ps(1.0f); + + __m128 stretchLimit, compressionLimit, multiplier; + if(useMultiplier) + { + stretchLimit = _mm_shuffle_ps(stiffness, stiffness, 0xff); + compressionLimit = _mm_shuffle_ps(stiffness, stiffness, 0xaa); + multiplier = _mm_shuffle_ps(stiffness, stiffness, 0x55); + } + stiffness = _mm_shuffle_ps(stiffness, stiffness, 0x00); + + for(; rIt != rEnd; rIt += 4, iIt += 8) + { + float* p0i = posIt + iIt[0] * 4; + float* p0j = posIt + iIt[1] * 4; + float* p1i = posIt + iIt[2] * 4; + float* p1j = posIt + iIt[3] * 4; + float* p2i = posIt + iIt[4] * 4; + float* p2j = posIt + iIt[5] * 4; + float* p3i = posIt + iIt[6] * 4; + float* p3j = posIt + iIt[7] * 4; + + __m128 v0i = _mm_load_ps(p0i); + __m128 v0j = _mm_load_ps(p0j); + __m128 v1i = _mm_load_ps(p1i); + __m128 v1j = _mm_load_ps(p1j); + __m128 v2i = _mm_load_ps(p2i); + __m128 v2j = _mm_load_ps(p2j); + __m128 v3i = _mm_load_ps(p3i); + __m128 v3j = _mm_load_ps(p3j); + + __m128 h0ij = _mm_add_ps(v0j, _mm_mul_ps(v0i, sMinusOneXYZOneW)); + __m128 h1ij = _mm_add_ps(v1j, _mm_mul_ps(v1i, sMinusOneXYZOneW)); + __m128 h2ij = _mm_add_ps(v2j, _mm_mul_ps(v2i, sMinusOneXYZOneW)); + __m128 h3ij = _mm_add_ps(v3j, _mm_mul_ps(v3i, sMinusOneXYZOneW)); + + __m128 a = _mm_unpacklo_ps(h0ij, h2ij); + __m128 b = _mm_unpackhi_ps(h0ij, h2ij); + __m128 c = _mm_unpacklo_ps(h1ij, h3ij); + __m128 d = _mm_unpackhi_ps(h1ij, h3ij); + + __m128 hxij = _mm_unpacklo_ps(a, c); + __m128 hyij = _mm_unpackhi_ps(a, c); + __m128 hzij = _mm_unpacklo_ps(b, d); + __m128 vwij = _mm_unpackhi_ps(b, d); + + __m128 rij = _mm_load_ps(rIt); + __m128 e2ij = _mm_add_ps(gSimd4fEpsilon, _mm_add_ps(_mm_mul_ps(hxij, hxij), + _mm_add_ps(_mm_mul_ps(hyij, hyij), _mm_mul_ps(hzij, hzij)))); + __m128 mask = _mm_cmpnle_ps(rij, gSimd4fEpsilon); + __m128 erij = 
_mm_and_ps(_mm_sub_ps(sOne, _mm_mul_ps(rij, _mm_rsqrt_ps(e2ij))), mask); + + if(useMultiplier) + { + erij = _mm_sub_ps(erij, _mm_mul_ps(multiplier, _mm_max_ps(compressionLimit, _mm_min_ps(erij, stretchLimit)))); + } + __m128 exij = _mm_mul_ps(erij, _mm_mul_ps(stiffness, _mm_rcp_ps(_mm_add_ps(gSimd4fEpsilon, vwij)))); + + __m128 exlo = _mm_and_ps(sMaskXY, exij); + __m128 exhi = _mm_andnot_ps(sMaskXY, exij); + + __m128 f0ij = _mm_mul_ps(h0ij, _mm_shuffle_ps(exlo, exlo, 0xc0)); + __m128 f1ij = _mm_mul_ps(h1ij, _mm_shuffle_ps(exlo, exlo, 0xd5)); + __m128 f2ij = _mm_mul_ps(h2ij, _mm_shuffle_ps(exhi, exhi, 0x2a)); + __m128 f3ij = _mm_mul_ps(h3ij, _mm_shuffle_ps(exhi, exhi, 0x3f)); + + __m128 u0i = _mm_add_ps(v0i, _mm_mul_ps(f0ij, _mm_shuffle_ps(v0i, v0i, 0xff))); + __m128 u0j = _mm_sub_ps(v0j, _mm_mul_ps(f0ij, _mm_shuffle_ps(v0j, v0j, 0xff))); + __m128 u1i = _mm_add_ps(v1i, _mm_mul_ps(f1ij, _mm_shuffle_ps(v1i, v1i, 0xff))); + __m128 u1j = _mm_sub_ps(v1j, _mm_mul_ps(f1ij, _mm_shuffle_ps(v1j, v1j, 0xff))); + __m128 u2i = _mm_add_ps(v2i, _mm_mul_ps(f2ij, _mm_shuffle_ps(v2i, v2i, 0xff))); + __m128 u2j = _mm_sub_ps(v2j, _mm_mul_ps(f2ij, _mm_shuffle_ps(v2j, v2j, 0xff))); + __m128 u3i = _mm_add_ps(v3i, _mm_mul_ps(f3ij, _mm_shuffle_ps(v3i, v3i, 0xff))); + __m128 u3j = _mm_sub_ps(v3j, _mm_mul_ps(f3ij, _mm_shuffle_ps(v3j, v3j, 0xff))); + + _mm_store_ps(p0i, u0i); + _mm_store_ps(p0j, u0j); + _mm_store_ps(p1i, u1i); + _mm_store_ps(p1j, u1j); + _mm_store_ps(p2i, u2i); + _mm_store_ps(p2j, u2j); + _mm_store_ps(p3i, u3i); + _mm_store_ps(p3j, u3j); + } +} + +#if PX_X86 + +// clang-format:disable + +// asm blocks in static condition blocks don't get removed, specialize +template <> +void solveConstraints<false>(float* __restrict posIt, const float* __restrict rIt, const float* __restrict rEnd, + const uint16_t* __restrict iIt, __m128 stiffness) +{ + __m128 sOne = _mm_set1_ps(1.0f); + __m128 sEpsilon = gSimd4fEpsilon; + stiffness = _mm_shuffle_ps(stiffness, stiffness, 0x00); + + __m128 
htmp[4]; + float* ptmp[8]; + + __asm + { + mov edx, rIt + mov esi, rEnd + + cmp edx, esi + jae forEnd + + mov eax, iIt + mov ecx, posIt + +forBegin: + movzx edi, WORD PTR [eax ] __asm shl edi, 4 __asm mov [ptmp ], edi __asm movaps xmm0, XMMWORD PTR [edi + ecx] /* v0i */ + movzx edi, WORD PTR [eax+ 2] __asm shl edi, 4 __asm mov [ptmp+ 4], edi __asm movaps xmm2, XMMWORD PTR [edi + ecx] /* v0j */ + movzx edi, WORD PTR [eax+ 4] __asm shl edi, 4 __asm mov [ptmp+ 8], edi __asm movaps xmm1, XMMWORD PTR [edi + ecx] /* v1i */ + movzx edi, WORD PTR [eax+ 6] __asm shl edi, 4 __asm mov [ptmp+12], edi __asm movaps xmm3, XMMWORD PTR [edi + ecx] /* v1j */ + + movaps xmm7, sMinusOneXYZOneW + mulps xmm2, xmm7 __asm addps xmm0, xmm2 __asm movaps XMMWORD PTR [htmp ], xmm0 /* h0ij */ + mulps xmm3, xmm7 __asm addps xmm1, xmm3 __asm movaps XMMWORD PTR [htmp+16], xmm1 /* h1ij */ + + movzx edi, WORD PTR [eax+ 8] __asm shl edi, 4 __asm mov [ptmp+16], edi __asm movaps xmm4, XMMWORD PTR [edi + ecx] /* v2i */ + movzx edi, WORD PTR [eax+10] __asm shl edi, 4 __asm mov [ptmp+20], edi __asm movaps xmm2, XMMWORD PTR [edi + ecx] /* v2j */ + movzx edi, WORD PTR [eax+12] __asm shl edi, 4 __asm mov [ptmp+24], edi __asm movaps xmm5, XMMWORD PTR [edi + ecx] /* v3i */ + movzx edi, WORD PTR [eax+14] __asm shl edi, 4 __asm mov [ptmp+28], edi __asm movaps xmm3, XMMWORD PTR [edi + ecx] /* v3j */ + + mulps xmm2, xmm7 __asm addps xmm2, xmm4 __asm movaps XMMWORD PTR [htmp+32], xmm2 /* h2ij */ + mulps xmm3, xmm7 __asm addps xmm3, xmm5 __asm movaps XMMWORD PTR [htmp+48], xmm3 /* h3ij */ + + movaps xmm4, xmm0 + movaps xmm5, xmm1 + + unpcklps xmm0, xmm2 /* a */ + unpckhps xmm4, xmm2 /* b */ + unpcklps xmm1, xmm3 /* c */ + unpckhps xmm5, xmm3 /* d */ + + movaps xmm2, xmm0 + movaps xmm6, xmm4 + + unpcklps xmm0, xmm1 /* hxij */ + unpckhps xmm2, xmm1 /* hyij */ + unpcklps xmm4, xmm5 /* hzij */ + unpckhps xmm6, xmm5 /* vwij */ + + movaps xmm7, sEpsilon + movaps xmm5, sOne + movaps xmm3, stiffness + movaps xmm1, XMMWORD 
PTR [edx] /* rij */ + + mulps xmm0, xmm0 __asm addps xmm0, xmm7 /* e2ij */ + mulps xmm2, xmm2 __asm addps xmm0, xmm2 + mulps xmm4, xmm4 __asm addps xmm0, xmm4 + + rsqrtps xmm0, xmm0 __asm mulps xmm0, xmm1 /* erij */ + cmpnleps xmm1, xmm7 /* mask */ + subps xmm5, xmm0 __asm andps xmm5, xmm1 + addps xmm6, xmm7 __asm rcpps xmm6, xmm6 + + mulps xmm6, xmm3 __asm mulps xmm6, xmm5 /* exij */ + + movaps xmm7, sMaskXY + andps xmm7, xmm6 /* exlo */ + xorps xmm6, xmm7 /* exhi */ + + movaps xmm0, XMMWORD PTR [htmp ] /* h0ij */ + movaps xmm1, XMMWORD PTR [htmp+16] /* h1ij */ + movaps xmm2, XMMWORD PTR [htmp+32] /* h2ij */ + movaps xmm3, XMMWORD PTR [htmp+48] /* h3ij */ + + pshufd xmm5, xmm7, 0xc0 __asm mulps xmm0, xmm5 /* f0ij */ + pshufd xmm7, xmm7, 0xd5 __asm mulps xmm1, xmm7 /* f1ij */ + pshufd xmm4, xmm6, 0x2a __asm mulps xmm2, xmm4 /* f2ij */ + pshufd xmm6, xmm6, 0x3f __asm mulps xmm3, xmm6 /* f3ij */ + + mov edi, [ptmp ] __asm movaps xmm4, XMMWORD PTR [edi + ecx] /* v0i */ + pshufd xmm5, xmm4, 0xff __asm mulps xmm5, xmm0 __asm subps xmm4, xmm5 /* u0i */ + movaps XMMWORD PTR [edi + ecx], xmm4 + + mov edi, [ptmp+ 4] __asm movaps xmm6, XMMWORD PTR [edi + ecx] /* v0j */ + pshufd xmm7, xmm6, 0xff __asm mulps xmm7, xmm0 __asm addps xmm6, xmm7 /* u0j */ + movaps XMMWORD PTR [edi + ecx], xmm6 + + mov edi, [ptmp+ 8] __asm movaps xmm4, XMMWORD PTR [edi + ecx] /* v1i */ + pshufd xmm5, xmm4, 0xff __asm mulps xmm5, xmm1 __asm subps xmm4, xmm5 /* u1i */ + movaps XMMWORD PTR [edi + ecx], xmm4 + + mov edi, [ptmp+12] __asm movaps xmm6, XMMWORD PTR [edi + ecx] /* v1j */ + pshufd xmm7, xmm6, 0xff __asm mulps xmm7, xmm1 __asm addps xmm6, xmm7 /* u1j */ + movaps XMMWORD PTR [edi + ecx], xmm6 + + mov edi, [ptmp+16] __asm movaps xmm4, XMMWORD PTR [edi + ecx] /* v2i */ + pshufd xmm5, xmm4, 0xff __asm mulps xmm5, xmm2 __asm subps xmm4, xmm5 /* u2i */ + movaps XMMWORD PTR [edi + ecx], xmm4 + + mov edi, [ptmp+20] __asm movaps xmm6, XMMWORD PTR [edi + ecx] /* v2j */ + pshufd xmm7, xmm6, 0xff __asm 
mulps xmm7, xmm2 __asm addps xmm6, xmm7 /* u2j */ + movaps XMMWORD PTR [edi + ecx], xmm6 + + mov edi, [ptmp+24] __asm movaps xmm4, XMMWORD PTR [edi + ecx] /* v3i */ + pshufd xmm5, xmm4, 0xff __asm mulps xmm5, xmm3 __asm subps xmm4, xmm5 /* u3i */ + movaps XMMWORD PTR [edi + ecx], xmm4 + + mov edi, [ptmp+28] __asm movaps xmm6, XMMWORD PTR [edi + ecx] /* v3j */ + pshufd xmm7, xmm6, 0xff __asm mulps xmm7, xmm3 __asm addps xmm6, xmm7 /* u3j */ + movaps XMMWORD PTR [edi + ecx], xmm6 + + add eax, 16 + add edx, 16 + + cmp edx, esi + jb forBegin +forEnd: + } +} + +template <> +void solveConstraints<true>(float* __restrict posIt, const float* __restrict rIt, const float* __restrict rEnd, + const uint16_t* __restrict iIt, __m128 stiffness) +{ + __m128 sOne = _mm_set1_ps(1.0f); + __m128 sEpsilon = gSimd4fEpsilon; + __m128 stretchLimit = _mm_shuffle_ps(stiffness, stiffness, 0xff); + __m128 compressionLimit = _mm_shuffle_ps(stiffness, stiffness, 0xaa); + __m128 multiplier = _mm_shuffle_ps(stiffness, stiffness, 0x55); + stiffness = _mm_shuffle_ps(stiffness, stiffness, 0x00); + + __m128 htmp[4]; + float* ptmp[8]; + + __asm + { + mov edx, rIt + mov esi, rEnd + + cmp edx, esi + jae forEnd + + mov eax, iIt + mov ecx, posIt + +forBegin: + movzx edi, WORD PTR [eax ] __asm shl edi, 4 __asm mov [ptmp ], edi __asm movaps xmm0, XMMWORD PTR [edi + ecx] /* v0i */ + movzx edi, WORD PTR [eax+ 2] __asm shl edi, 4 __asm mov [ptmp+ 4], edi __asm movaps xmm2, XMMWORD PTR [edi + ecx] /* v0j */ + movzx edi, WORD PTR [eax+ 4] __asm shl edi, 4 __asm mov [ptmp+ 8], edi __asm movaps xmm1, XMMWORD PTR [edi + ecx] /* v1i */ + movzx edi, WORD PTR [eax+ 6] __asm shl edi, 4 __asm mov [ptmp+12], edi __asm movaps xmm3, XMMWORD PTR [edi + ecx] /* v1j */ + + movaps xmm7, sMinusOneXYZOneW + mulps xmm2, xmm7 __asm addps xmm0, xmm2 __asm movaps XMMWORD PTR [htmp ], xmm0 /* h0ij */ + mulps xmm3, xmm7 __asm addps xmm1, xmm3 __asm movaps XMMWORD PTR [htmp+16], xmm1 /* h1ij */ + + movzx edi, WORD PTR [eax+ 8] __asm 
shl edi, 4 __asm mov [ptmp+16], edi __asm movaps xmm4, XMMWORD PTR [edi + ecx] /* v2i */ + movzx edi, WORD PTR [eax+10] __asm shl edi, 4 __asm mov [ptmp+20], edi __asm movaps xmm2, XMMWORD PTR [edi + ecx] /* v2j */ + movzx edi, WORD PTR [eax+12] __asm shl edi, 4 __asm mov [ptmp+24], edi __asm movaps xmm5, XMMWORD PTR [edi + ecx] /* v3i */ + movzx edi, WORD PTR [eax+14] __asm shl edi, 4 __asm mov [ptmp+28], edi __asm movaps xmm3, XMMWORD PTR [edi + ecx] /* v3j */ + + mulps xmm2, xmm7 __asm addps xmm2, xmm4 __asm movaps XMMWORD PTR [htmp+32], xmm2 /* h2ij */ + mulps xmm3, xmm7 __asm addps xmm3, xmm5 __asm movaps XMMWORD PTR [htmp+48], xmm3 /* h3ij */ + + movaps xmm4, xmm0 + movaps xmm5, xmm1 + + unpcklps xmm0, xmm2 /* a */ + unpckhps xmm4, xmm2 /* b */ + unpcklps xmm1, xmm3 /* c */ + unpckhps xmm5, xmm3 /* d */ + + movaps xmm2, xmm0 + movaps xmm6, xmm4 + + unpcklps xmm0, xmm1 /* hxij */ + unpckhps xmm2, xmm1 /* hyij */ + unpcklps xmm4, xmm5 /* hzij */ + unpckhps xmm6, xmm5 /* vwij */ + + movaps xmm7, sEpsilon + movaps xmm5, sOne + movaps xmm3, stiffness + movaps xmm1, XMMWORD PTR [edx] /* rij */ + + mulps xmm0, xmm0 __asm addps xmm0, xmm7 /* e2ij */ + mulps xmm2, xmm2 __asm addps xmm0, xmm2 + mulps xmm4, xmm4 __asm addps xmm0, xmm4 + + rsqrtps xmm0, xmm0 __asm mulps xmm0, xmm1 /* erij */ + cmpnleps xmm1, xmm7 /* mask */ + subps xmm5, xmm0 __asm andps xmm5, xmm1 + addps xmm6, xmm7 __asm rcpps xmm6, xmm6 + + movaps xmm0, stretchLimit /* multiplier block */ + movaps xmm1, compressionLimit + movaps xmm2, multiplier + minps xmm0, xmm5 + maxps xmm1, xmm0 + mulps xmm2, xmm1 + subps xmm5, xmm2 + + mulps xmm6, xmm3 __asm mulps xmm6, xmm5 /* exij */ + + movaps xmm7, sMaskXY + andps xmm7, xmm6 /* exlo */ + xorps xmm6, xmm7 /* exhi */ + + movaps xmm0, XMMWORD PTR [htmp ] /* h0ij */ + movaps xmm1, XMMWORD PTR [htmp+16] /* h1ij */ + movaps xmm2, XMMWORD PTR [htmp+32] /* h2ij */ + movaps xmm3, XMMWORD PTR [htmp+48] /* h3ij */ + + pshufd xmm5, xmm7, 0xc0 __asm mulps xmm0, xmm5 /* 
f0ij */ + pshufd xmm7, xmm7, 0xd5 __asm mulps xmm1, xmm7 /* f1ij */ + pshufd xmm4, xmm6, 0x2a __asm mulps xmm2, xmm4 /* f2ij */ + pshufd xmm6, xmm6, 0x3f __asm mulps xmm3, xmm6 /* f3ij */ + + mov edi, [ptmp ] __asm movaps xmm4, XMMWORD PTR [edi + ecx] /* v0i */ + pshufd xmm5, xmm4, 0xff __asm mulps xmm5, xmm0 __asm subps xmm4, xmm5 /* u0i */ + movaps XMMWORD PTR [edi + ecx], xmm4 + + mov edi, [ptmp+ 4] __asm movaps xmm6, XMMWORD PTR [edi + ecx] /* v0j */ + pshufd xmm7, xmm6, 0xff __asm mulps xmm7, xmm0 __asm addps xmm6, xmm7 /* u0j */ + movaps XMMWORD PTR [edi + ecx], xmm6 + + mov edi, [ptmp+ 8] __asm movaps xmm4, XMMWORD PTR [edi + ecx] /* v1i */ + pshufd xmm5, xmm4, 0xff __asm mulps xmm5, xmm1 __asm subps xmm4, xmm5 /* u1i */ + movaps XMMWORD PTR [edi + ecx], xmm4 + + mov edi, [ptmp+12] __asm movaps xmm6, XMMWORD PTR [edi + ecx] /* v1j */ + pshufd xmm7, xmm6, 0xff __asm mulps xmm7, xmm1 __asm addps xmm6, xmm7 /* u1j */ + movaps XMMWORD PTR [edi + ecx], xmm6 + + mov edi, [ptmp+16] __asm movaps xmm4, XMMWORD PTR [edi + ecx] /* v2i */ + pshufd xmm5, xmm4, 0xff __asm mulps xmm5, xmm2 __asm subps xmm4, xmm5 /* u2i */ + movaps XMMWORD PTR [edi + ecx], xmm4 + + mov edi, [ptmp+20] __asm movaps xmm6, XMMWORD PTR [edi + ecx] /* v2j */ + pshufd xmm7, xmm6, 0xff __asm mulps xmm7, xmm2 __asm addps xmm6, xmm7 /* u2j */ + movaps XMMWORD PTR [edi + ecx], xmm6 + + mov edi, [ptmp+24] __asm movaps xmm4, XMMWORD PTR [edi + ecx] /* v3i */ + pshufd xmm5, xmm4, 0xff __asm mulps xmm5, xmm3 __asm subps xmm4, xmm5 /* u3i */ + movaps XMMWORD PTR [edi + ecx], xmm4 + + mov edi, [ptmp+28] __asm movaps xmm6, XMMWORD PTR [edi + ecx] /* v3j */ + pshufd xmm7, xmm6, 0xff __asm mulps xmm7, xmm3 __asm addps xmm6, xmm7 /* u3j */ + movaps XMMWORD PTR [edi + ecx], xmm6 + + add eax, 16 + add edx, 16 + + cmp edx, esi + jb forBegin +forEnd: + } +} + +// clang-format:enable + +#endif diff --git a/PhysX_3.4/Source/LowLevelCloth/src/windows/ClothClone.h 
b/PhysX_3.4/Source/LowLevelCloth/src/windows/ClothClone.h new file mode 100644 index 00000000..4f02de76 --- /dev/null +++ b/PhysX_3.4/Source/LowLevelCloth/src/windows/ClothClone.h @@ -0,0 +1,225 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 
+ +#pragma once + +#include "foundation/PxMemory.h" + +#include "SwFactory.h" +#include "SwFabric.h" +#include "SwCloth.h" + +#include "ClothImpl.h" +#include "ClothBase.h" + +namespace physx +{ +namespace cloth +{ +class DxFactory; +class CuFactory; + +// make range from vector +template <typename T, typename A> +Range<T> makeRange(shdfnd::Array<T, A>& vec) +{ + T* ptr = vec.empty() ? 0 : vec.begin(); + return Range<T>(ptr, ptr + vec.size()); +} + +template <typename T, typename A> +Range<const T> makeRange(const shdfnd::Array<T, A>& vec) +{ + const T* ptr = vec.empty() ? 0 : vec.begin(); + return Range<const T>(ptr, ptr + vec.size()); +} + +// fabric conversion +template <typename SrcClothType, typename DstFactoryType> +typename DstFactoryType::FabricType* convertFabric(const SrcClothType& srcFabric, DstFactoryType& dstFactory) +{ + typedef typename DstFactoryType::FabricType DstFabricType; + + // see if dstFactory already has a Fabric with this id + DstFabricType* const* fIt = dstFactory.mFabrics.begin(); + DstFabricType* const* fEnd = dstFactory.mFabrics.end(); + for(; fIt != fEnd; ++fIt) + if((*fIt)->mId == srcFabric.mId) + return *fIt; // found id, return existing fabric + + // fabric does not exist so create a new one + Vector<uint32_t>::Type phases(srcFabric.getNumPhases()); + Vector<uint32_t>::Type sets(srcFabric.getNumSets()); + Vector<float>::Type restvalues(srcFabric.getNumRestvalues()); + Vector<uint32_t>::Type indices(srcFabric.getNumIndices()); + Vector<uint32_t>::Type anchors(srcFabric.getNumTethers()); + Vector<float>::Type tetherLengths(srcFabric.getNumTethers()); + Vector<uint32_t>::Type triangles(srcFabric.getNumTriangles() * 3); + + Range<uint32_t> phaseRange = makeRange(phases); + Range<float> restvalueRange = makeRange(restvalues); + Range<uint32_t> setRange = makeRange(sets); + Range<uint32_t> indexRange = makeRange(indices); + Range<uint32_t> anchorRange = makeRange(anchors); + Range<float> lengthRange = makeRange(tetherLengths); + 
Range<uint32_t> triangleRange = makeRange(triangles); + + srcFabric.mFactory.extractFabricData(srcFabric, phaseRange, setRange, restvalueRange, indexRange, anchorRange, + lengthRange, triangleRange); + + DstFabricType* dstFabric = + static_cast<DstFabricType*>(dstFactory.createFabric(srcFabric.mNumParticles, phaseRange, setRange, restvalueRange, + indexRange, anchorRange, lengthRange, triangleRange)); + + // give new fabric the same id as the source so it can be matched + dstFabric->mId = srcFabric.mId; + + return dstFabric; +} + +inline Range<const PhaseConfig> getPhaseConfigs(const SwCloth& cloth) +{ + return makeRange(cloth.mPhaseConfigs); +} +inline void setPhaseConfigs(SwCloth& cloth, Range<const PhaseConfig> phaseConfigs) +{ + cloth.mPhaseConfigs.assign(phaseConfigs.begin(), phaseConfigs.end()); +} +inline Range<const PxVec4> getParticleAccelerations(const SwCloth& cloth) +{ + return makeRange(cloth.mParticleAccelerations); +} +inline Range<const uint32_t> getSelfCollisionIndices(const SwCloth& cloth) +{ + return makeRange(cloth.mSelfCollisionIndices); +} + +// cloth conversion +template <typename DstFactoryType, typename SrcImplType> +typename DstFactoryType::ImplType* convertCloth(DstFactoryType& dstFactory, const SrcImplType& srcImpl) +{ + typedef typename DstFactoryType::FabricType DstFabricType; + typedef typename DstFactoryType::ImplType DstImplType; + typedef typename DstImplType::ClothType DstClothType; + typedef typename SrcImplType::ClothType SrcClothType; + + const SrcClothType& srcCloth = srcImpl.mCloth; + const Factory& srcFactory = srcCloth.mFactory; + + typename DstClothType::ContextLockType dstLock(dstFactory); + typename SrcClothType::ContextLockType srcLock(srcCloth.mFactory); + + // particles + MappedRange<const PxVec4> curParticles = srcImpl.getCurrentParticles(); + + // fabric + DstFabricType& dstFabric = *convertFabric(srcCloth.mFabric, dstFactory); + + // create new cloth + DstImplType* dstImpl = 
static_cast<DstImplType*>(dstFactory.createCloth(curParticles, dstFabric)); + DstClothType& dstCloth = dstImpl->mCloth; + + // copy across common parameters + copy(dstCloth, srcCloth); + + // copy across previous particles + MappedRange<const PxVec4> prevParticles = srcImpl.getPreviousParticles(); + PxMemCopy(dstImpl->getPreviousParticles().begin(), prevParticles.begin(), prevParticles.size() * sizeof(PxVec4)); + + // copy across transformed phase configs + setPhaseConfigs(dstCloth, getPhaseConfigs(srcCloth)); + + // collision data + Vector<PxVec4>::Type spheres(srcImpl.getNumSpheres(), PxVec4(0.0f)); + PxVec4* spherePtr = spheres.empty() ? 0 : &spheres.front(); + Range<PxVec4> sphereRange(spherePtr, spherePtr + spheres.size()); + Vector<uint32_t>::Type capsules(srcImpl.getNumCapsules() * 2); + Range<uint32_t> capsuleRange = makeRange(capsules); + Vector<PxVec4>::Type planes(srcImpl.getNumPlanes(), PxVec4(0.0f)); + PxVec4* planePtr = planes.empty() ? 0 : &planes.front(); + Range<PxVec4> planeRange(planePtr, planePtr + planes.size()); + Vector<uint32_t>::Type convexes(srcImpl.getNumConvexes()); + Range<uint32_t> convexRange = makeRange(convexes); + Vector<PxVec3>::Type triangles(srcImpl.getNumTriangles() * 3, PxVec3(0.0f)); + PxVec3* trianglePtr = triangles.empty() ? 
0 : &triangles.front(); + Range<PxVec3> triangleRange(trianglePtr, trianglePtr + triangles.size()); + + srcFactory.extractCollisionData(srcImpl, sphereRange, capsuleRange, planeRange, convexRange, triangleRange); + dstImpl->setSpheres(sphereRange, 0, 0); + dstImpl->setCapsules(capsuleRange, 0, 0); + dstImpl->setPlanes(planeRange, 0, 0); + dstImpl->setConvexes(convexRange, 0, 0); + dstImpl->setTriangles(triangleRange, 0, 0); + + // motion constraints, copy directly into new cloth buffer + if(srcImpl.getNumMotionConstraints()) + srcFactory.extractMotionConstraints(srcImpl, dstImpl->getMotionConstraints()); + + // separation constraints, copy directly into new cloth buffer + if(srcImpl.getNumSeparationConstraints()) + srcFactory.extractSeparationConstraints(srcImpl, dstImpl->getSeparationConstraints()); + + // particle accelerations + if(srcImpl.getNumParticleAccelerations()) + { + Range<const PxVec4> accelerations = getParticleAccelerations(srcCloth); + PxMemCopy(dstImpl->getParticleAccelerations().begin(), accelerations.begin(), + accelerations.size() * sizeof(PxVec4)); + } + + // self-collision indices + dstImpl->setSelfCollisionIndices(getSelfCollisionIndices(srcCloth)); + + // rest positions + Vector<PxVec4>::Type restPositions(srcImpl.getNumRestPositions()); + srcFactory.extractRestPositions(srcImpl, makeRange(restPositions)); + dstImpl->setRestPositions(makeRange(restPositions)); + + // virtual particles + if(srcImpl.getNumVirtualParticles()) + { + Vector<Vec4u>::Type indices(srcImpl.getNumVirtualParticles()); + Vector<PxVec3>::Type weights(srcImpl.getNumVirtualParticleWeights(), PxVec3(0.0f)); + + uint32_t(*indicesPtr)[4] = indices.empty() ? 0 : &array(indices.front()); + Range<uint32_t[4]> indicesRange(indicesPtr, indicesPtr + indices.size()); + + PxVec3* weightsPtr = weights.empty() ? 
0 : &weights.front(); + Range<PxVec3> weightsRange(weightsPtr, weightsPtr + weights.size()); + + srcFactory.extractVirtualParticles(srcImpl, indicesRange, weightsRange); + + dstImpl->setVirtualParticles(indicesRange, weightsRange); + } + + return dstImpl; +} + +} // namespace cloth +} // namespace physx diff --git a/PhysX_3.4/Source/LowLevelCloth/src/windows/CuCheckSuccess.h b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuCheckSuccess.h new file mode 100644 index 00000000..b9ae0a53 --- /dev/null +++ b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuCheckSuccess.h @@ -0,0 +1,45 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. 
+// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#pragma once + +#include <cuda.h> +#include <driver_types.h> + +namespace physx +{ +namespace cloth +{ +// implemented in CuFactory.cpp +void checkSuccessImpl(CUresult, const char*, const int); +} + +// safe cuda calls +#define checkSuccess(err) cloth::checkSuccessImpl(err, __FILE__, __LINE__) +} diff --git a/PhysX_3.4/Source/LowLevelCloth/src/windows/CuCloth.cpp b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuCloth.cpp new file mode 100644 index 00000000..6ecd1aeb --- /dev/null +++ b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuCloth.cpp @@ -0,0 +1,511 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. 
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#include "CuCloth.h" +#include "CuFabric.h" +#include "CuFactory.h" +#include "CuContextLock.h" +#include "CuCheckSuccess.h" +#include "CuClothData.h" +#include "CuSolver.h" +#include "TripletScheduler.h" +#include "ClothBase.h" +#include "Array.h" +#include "PsFoundation.h" + +#if PX_VC +#pragma warning(disable : 4365) // 'action' : conversion from 'type_1' to 'type_2', signed/unsigned mismatch +#endif + +namespace physx +{ +namespace cloth +{ +PhaseConfig transform(const PhaseConfig&); // from PhaseConfig.cpp +} +} + +using namespace physx; + +namespace +{ +bool isSelfCollisionEnabled(const cloth::CuCloth& cloth) +{ + return PxMin(cloth.mSelfCollisionDistance, -cloth.mSelfCollisionLogStiffness) > 0.0f; +} +} + +cloth::CuCloth::CuCloth(CuFactory& factory, CuFabric& fabric, Range<const PxVec4> particles) +: CuContextLock(factory) +, mFactory(factory) +, mFabric(fabric) +, mClothDataDirty(false) +, mNumParticles(uint32_t(particles.size())) +, mParticles(mFactory.mContextManager) +, mParticlesHostCopy(CuHostAllocator(mFactory.mContextManager, cudaHostAllocMapped)) +, mDeviceParticlesDirty(false) +, mHostParticlesDirty(true) +, 
mPhaseConfigs(mFactory.mContextManager) +, mMotionConstraints(mFactory.mContextManager) +, mSeparationConstraints(mFactory.mContextManager) +, mParticleAccelerations(mFactory.mContextManager) +, mParticleAccelerationsHostCopy(CuHostAllocator(mFactory.mContextManager, cudaHostAllocMapped)) +, mCapsuleIndices(getMappedAllocator<IndexPair>(mFactory.mContextManager)) +, mStartCollisionSpheres(getMappedAllocator<PxVec4>(mFactory.mContextManager)) +, mTargetCollisionSpheres(getMappedAllocator<PxVec4>(mFactory.mContextManager)) +, mConvexMasks(getMappedAllocator<uint32_t>(mFactory.mContextManager)) +, mStartCollisionPlanes(getMappedAllocator<PxVec4>(mFactory.mContextManager)) +, mTargetCollisionPlanes(getMappedAllocator<PxVec4>(mFactory.mContextManager)) +, mStartCollisionTriangles(getMappedAllocator<PxVec3>(mFactory.mContextManager)) +, mTargetCollisionTriangles(getMappedAllocator<PxVec3>(mFactory.mContextManager)) +, mVirtualParticleSetSizes(mFactory.mContextManager) +, mVirtualParticleIndices(mFactory.mContextManager) +, mVirtualParticleWeights(mFactory.mContextManager) +, mRestPositions(mFactory.mContextManager) +, mSelfCollisionIndices(mFactory.mContextManager) +, mSelfCollisionData(mFactory.mContextManager) +, mSharedMemorySize(0) +, mUserData(0) +{ + PX_ASSERT(!particles.empty()); + + initialize(*this, particles.begin(), particles.end()); + + mParticles.reserve(2 * mNumParticles); + mParticles.push_back(particles.begin(), particles.end()); + mParticles.push_back(particles.begin(), particles.end()); + mParticlesHostCopy.resizeUninitialized(2 * mNumParticles); + + mFabric.incRefCount(); + + CuContextLock::release(); +} + +cloth::CuCloth::CuCloth(CuFactory& factory, const CuCloth& cloth) +: CuContextLock(factory) +, mFactory(factory) +, mFabric(cloth.mFabric) +, mNumParticles(cloth.mNumParticles) +, mParticles(cloth.mParticles) +, mParticlesHostCopy(cloth.mParticlesHostCopy) +, mDeviceParticlesDirty(cloth.mDeviceParticlesDirty) +, 
mHostParticlesDirty(cloth.mHostParticlesDirty) +, mPhaseConfigs(cloth.mPhaseConfigs) +, mHostPhaseConfigs(cloth.mHostPhaseConfigs) +, mMotionConstraints(cloth.mMotionConstraints) +, mSeparationConstraints(cloth.mSeparationConstraints) +, mParticleAccelerations(cloth.mParticleAccelerations) +, mParticleAccelerationsHostCopy(cloth.mParticleAccelerationsHostCopy) +, mCapsuleIndices(cloth.mCapsuleIndices) +, mStartCollisionSpheres(cloth.mStartCollisionSpheres) +, mTargetCollisionSpheres(cloth.mTargetCollisionSpheres) +, mStartCollisionPlanes(cloth.mStartCollisionPlanes) +, mTargetCollisionPlanes(cloth.mTargetCollisionPlanes) +, mStartCollisionTriangles(cloth.mStartCollisionTriangles) +, mTargetCollisionTriangles(cloth.mTargetCollisionTriangles) +, mVirtualParticleSetSizes(cloth.mVirtualParticleSetSizes) +, mVirtualParticleIndices(cloth.mVirtualParticleIndices) +, mVirtualParticleWeights(cloth.mVirtualParticleWeights) +, mRestPositions(cloth.mRestPositions) +, mSelfCollisionIndices(cloth.mSelfCollisionIndices) +, mSelfCollisionData(mFactory.mContextManager) +, mSharedMemorySize(cloth.mSharedMemorySize) +, mUserData(cloth.mUserData) +{ + copy(*this, cloth); + + mFabric.incRefCount(); + + CuContextLock::release(); +} + +cloth::CuCloth::~CuCloth() +{ + CuContextLock::acquire(); + + mFabric.decRefCount(); +} + +void cloth::CuCloth::notifyChanged() +{ + mClothDataDirty = true; +} + +bool cloth::CuCloth::updateClothData(CuClothData& clothData) +{ + // test particle pointer to detect when cloth data array has been reordered + if(!mClothDataDirty && clothData.mParticles == array(*mParticles.begin().get())) + { + PX_ASSERT(mSharedMemorySize == getSharedMemorySize()); + return false; + } + + mSharedMemorySize = getSharedMemorySize(); + + if(mSelfCollisionData.empty() && isSelfCollisionEnabled(*this)) + { + uint32_t numSelfCollisionIndices = + mSelfCollisionIndices.empty() ? 
mNumParticles : uint32_t(mSelfCollisionIndices.size()); + + uint32_t particleSize = 4 * mNumParticles; + uint32_t keySize = 2 * numSelfCollisionIndices; // 2x for radix buffer + uint32_t cellStartSize = (129 + 128 * 128 + 130) / 2 + 1; // half because type is int16_t + + // use 16bit indices for cellStart array (128x128 grid) + mSelfCollisionData.resize(particleSize + keySize + cellStartSize); + checkSuccess(cuMemsetD32((mSelfCollisionData.begin() + particleSize + keySize).dev(), 0xffffffff, cellStartSize)); + } + + clothData = CuClothData(*this); + mClothDataDirty = false; + + return true; +} + +uint32_t cloth::CuCloth::getSharedMemorySize() const +{ + uint32_t numPhases = uint32_t(mPhaseConfigs.size()); + uint32_t numSpheres = uint32_t(mStartCollisionSpheres.size()); + uint32_t numCones = uint32_t(mCapsuleIndices.size()); + uint32_t numPlanes = uint32_t(mStartCollisionPlanes.size()); + uint32_t numConvexes = uint32_t(mConvexMasks.size()); + uint32_t numTriangles = uint32_t(mStartCollisionTriangles.size() / 3); + + uint32_t phaseConfigSize = numPhases * sizeof(CuPhaseConfig); + + bool storePrevCollisionData = mEnableContinuousCollision || mFriction > 0.0f; + uint32_t continuousCollisionSize = storePrevCollisionData ? 4 * numSpheres + 10 * numCones : 0; + continuousCollisionSize += 4 * numCones + numConvexes; // capsule and convex masks + uint32_t discreteCollisionSize = 4 * numSpheres + PxMax(10 * numCones + 96, 208u); + discreteCollisionSize = PxMax(discreteCollisionSize, PxMax(4 * numPlanes, 19 * numTriangles)); + + // scratch memory for prefix sum and histogram + uint32_t selfCollisionSize = isSelfCollisionEnabled(*this) ? 
544 : 0; + + // see CuSolverKenel.cu::gSharedMemory comment for details + return phaseConfigSize + sizeof(float) * (continuousCollisionSize + PxMax(selfCollisionSize, discreteCollisionSize)); +} + +void cloth::CuCloth::setPhaseConfig(Range<const PhaseConfig> configs) +{ + mHostPhaseConfigs.assign(configs.begin(), configs.end()); + + Vector<CuPhaseConfig>::Type deviceConfigs; + deviceConfigs.reserve(configs.size()); + const PhaseConfig* cEnd = configs.end(); + for(const PhaseConfig* cIt = configs.begin(); cIt != cEnd; ++cIt) + { + CuPhaseConfig config; + + config.mStiffness = cIt->mStiffness; + config.mStiffnessMultiplier = cIt->mStiffnessMultiplier; + config.mCompressionLimit = cIt->mCompressionLimit; + config.mStretchLimit = cIt->mStretchLimit; + + uint16_t phaseIndex = cIt->mPhaseIndex; + config.mNumConstraints = mFabric.mNumConstraintsInPhase[phaseIndex]; + config.mRestvalues = mFabric.mRestvaluesInPhase[phaseIndex].get(); + config.mIndices = mFabric.mIndicesInPhase[phaseIndex].get(); + + deviceConfigs.pushBack(config); + } + + CuContextLock contextLock(mFactory); + mPhaseConfigs.assign(deviceConfigs.begin(), deviceConfigs.end()); +} + +cloth::Range<PxVec4> cloth::CuCloth::push(cloth::CuConstraints& constraints) +{ + if(!constraints.mTarget.capacity()) + { + CuContextLock contextLock(mFactory); + constraints.mTarget.reserve(mNumParticles); + } + if(constraints.mHostCopy.empty()) + constraints.mTarget.resize(mNumParticles); + + if(constraints.mStart.empty()) // initialize start first + constraints.mStart.swap(constraints.mTarget); + + if(!constraints.mHostCopy.capacity()) + { + CuContextLock contextLock(mFactory); + constraints.mHostCopy.reserve(mNumParticles); + } + constraints.mHostCopy.resizeUninitialized(mNumParticles); + + PxVec4* data = &constraints.mHostCopy.front(); + return Range<PxVec4>(data, data + constraints.mHostCopy.size()); +} + +void cloth::CuCloth::clear(cloth::CuConstraints& constraints) +{ + CuContextLock contextLock(mFactory); + 
CuDeviceVector<PxVec4>(mFactory.mContextManager).swap(constraints.mStart); + CuDeviceVector<PxVec4>(mFactory.mContextManager).swap(constraints.mTarget); +} + +void cloth::CuCloth::syncDeviceParticles() +{ + if(mDeviceParticlesDirty) + { + CuContextLock contextLock(mFactory); + checkSuccess( + cuMemcpyHtoD(mParticles.begin().dev(), mParticlesHostCopy.begin(), 2 * mNumParticles * sizeof(PxVec4))); + mDeviceParticlesDirty = false; + } +} + +void cloth::CuCloth::syncHostParticles() +{ + if(mHostParticlesDirty) + { + CuContextLock contextLock(mFactory); + const PxVec4* src = mParticles.begin().get(); + mFactory.copyToHost(src, src + 2 * mNumParticles, mParticlesHostCopy.begin()); + mHostParticlesDirty = false; + } +} + +cloth::Range<const PxVec3> cloth::CuCloth::clampTriangleCount(Range<const PxVec3> range, uint32_t replaceSize) +{ + // clamp to 500 triangles (1500 vertices) to prevent running out of shared memory + uint32_t removedSize = mStartCollisionTriangles.size() - replaceSize; + const PxVec3* clamp = range.begin() + 1500 - removedSize; + + if(range.end() > clamp) + { + shdfnd::getFoundation().error(PX_WARN, "Too many collision " + "triangles specified for cloth, dropping all but first 500.\n"); + } + + return Range<const PxVec3>(range.begin(), PxMin(range.end(), clamp)); +} + +#include "ClothImpl.h" + +namespace physx +{ +namespace cloth +{ + +// ClothImpl<CuCloth>::clone() implemented in CuClothClone.cpp + +template <> +uint32_t ClothImpl<CuCloth>::getNumParticles() const +{ + return mCloth.mNumParticles; +} + +template <> +void ClothImpl<CuCloth>::lockParticles() const +{ + const_cast<CuCloth&>(mCloth).syncHostParticles(); +} + +template <> +void ClothImpl<CuCloth>::unlockParticles() const +{ +} + +template <> +MappedRange<PxVec4> ClothImpl<CuCloth>::getCurrentParticles() +{ + mCloth.wakeUp(); + lockParticles(); + mCloth.mDeviceParticlesDirty = true; + return getMappedParticles(mCloth.mParticlesHostCopy.begin()); +} + +template <> +MappedRange<const PxVec4> 
ClothImpl<CuCloth>::getCurrentParticles() const +{ + lockParticles(); + return getMappedParticles(mCloth.mParticlesHostCopy.begin()); +} + +template <> +MappedRange<PxVec4> ClothImpl<CuCloth>::getPreviousParticles() +{ + mCloth.wakeUp(); + lockParticles(); + mCloth.mDeviceParticlesDirty = true; + return getMappedParticles(mCloth.mParticlesHostCopy.begin() + mCloth.mNumParticles); +} + +template <> +MappedRange<const PxVec4> ClothImpl<CuCloth>::getPreviousParticles() const +{ + lockParticles(); + return getMappedParticles(mCloth.mParticlesHostCopy.begin() + mCloth.mNumParticles); +} + +template <> +GpuParticles ClothImpl<CuCloth>::getGpuParticles() +{ + mCloth.syncDeviceParticles(); + mCloth.mHostParticlesDirty = true; + PxVec4* particles = mCloth.mParticles.begin().get(); + GpuParticles result = { particles, particles + mCloth.mNumParticles, 0 }; + return result; +} + +template <> +void ClothImpl<CuCloth>::setPhaseConfig(Range<const PhaseConfig> configs) +{ + Vector<PhaseConfig>::Type transformedConfigs; + transformedConfigs.reserve(configs.size()); + + // transform phase config to use in solver + for(; !configs.empty(); configs.popFront()) + if(configs.front().mStiffness > 0.0f) + transformedConfigs.pushBack(transform(configs.front())); + + mCloth.setPhaseConfig(Range<const PhaseConfig>(transformedConfigs.begin(), transformedConfigs.end())); + mCloth.notifyChanged(); + mCloth.wakeUp(); +} + +template <> +void ClothImpl<CuCloth>::setSelfCollisionIndices(Range<const uint32_t> indices) +{ + ContextLockType lock(mCloth.mFactory); + mCloth.mSelfCollisionIndices.assign(indices.begin(), indices.end()); + mCloth.mSelfCollisionIndicesHost.assign(indices.begin(), indices.end()); + mCloth.notifyChanged(); + mCloth.wakeUp(); +} + +template <> +uint32_t ClothImpl<CuCloth>::getNumVirtualParticles() const +{ + return uint32_t(mCloth.mVirtualParticleIndices.size()); +} + +template <> +Range<PxVec4> ClothImpl<CuCloth>::getParticleAccelerations() +{ + 
if(mCloth.mParticleAccelerations.empty()) + { + CuContextLock contextLock(mCloth.mFactory); + mCloth.mParticleAccelerations.resize(mCloth.mNumParticles); + } + + if(!mCloth.mParticleAccelerationsHostCopy.capacity()) + { + CuContextLock contextLock(mCloth.mFactory); + mCloth.mParticleAccelerationsHostCopy.reserve(mCloth.mNumParticles); + } + mCloth.mParticleAccelerationsHostCopy.resizeUninitialized(mCloth.mNumParticles); + + mCloth.wakeUp(); + + PxVec4* data = mCloth.mParticleAccelerationsHostCopy.begin(); + return Range<PxVec4>(data, mCloth.mParticleAccelerationsHostCopy.end()); +} + +template <> +void ClothImpl<CuCloth>::clearParticleAccelerations() +{ + CuContextLock contextLock(mCloth.mFactory); + CuDeviceVector<PxVec4>(mCloth.mFactory.mContextManager).swap(mCloth.mParticleAccelerations); + mCloth.mParticleAccelerationsHostCopy.reset(); + mCloth.wakeUp(); +} + +namespace +{ +uint32_t calculateNumReplays(const Vector<Vec4u>::Type& triplets, const Vector<uint32_t>::Type setSizes) +{ + uint32_t result = 0; + + Vector<Vec4u>::Type::ConstIterator tIt = triplets.begin(); + Vector<uint32_t>::Type::ConstIterator sIt, sEnd = setSizes.end(); + uint32_t index = 0; + for(sIt = setSizes.begin(); sIt != sEnd; ++sIt, ++index) + { + Vector<Vec4u>::Type::ConstIterator tEnd = tIt + *sIt, tLast = tIt; + while(tLast != tEnd) + { + uint8_t numConflicts[3][32] = {}; + uint8_t numReplays[3] = {}; + + for(tLast += PxMin(ptrdiff_t(32), tEnd - tLast); tIt != tLast; ++tIt) + for(int i = 0; i < 3; ++i) + numReplays[i] = PxMax(numReplays[i], ++numConflicts[i][(*tIt)[i] & 31]); + + result += numReplays[0] + numReplays[1] + numReplays[2]; + } + } + + return result; +} +} + +template <> +void ClothImpl<CuCloth>::setVirtualParticles(Range<const uint32_t[4]> indices, Range<const PxVec3> weights) +{ + // shuffle indices to form independent SIMD sets + TripletScheduler scheduler(indices); + scheduler.warp(mCloth.mNumParticles, 32); + + // convert to 16bit indices + Vector<Vec4us>::Type 
hostIndices; + hostIndices.reserve(indices.size()); + TripletScheduler::ConstTripletIter tIt = scheduler.mTriplets.begin(); + TripletScheduler::ConstTripletIter tEnd = scheduler.mTriplets.end(); + for(; tIt != tEnd; ++tIt) + hostIndices.pushBack(Vec4us(*tIt)); + + // printf("num sets = %u, num replays = %u\n", scheduler.mSetSizes.size(), + // calculateNumReplays(scheduler.mTriplets, scheduler.mSetSizes)); + + // add normalization weight + Vector<PxVec4>::Type hostWeights; + hostWeights.reserve(weights.size()); + for(; !weights.empty(); weights.popFront()) + { + PxVec3 w = reinterpret_cast<const PxVec3&>(weights.front()); + PxReal scale = 1 / w.magnitudeSquared(); + hostWeights.pushBack(PxVec4(w.x, w.y, w.z, scale)); + } + + CuContextLock contextLock(mCloth.mFactory); + + // todo: 'swap' these to force reallocation? + mCloth.mVirtualParticleIndices = hostIndices; + mCloth.mVirtualParticleSetSizes = scheduler.mSetSizes; + mCloth.mVirtualParticleWeights = hostWeights; + + mCloth.notifyChanged(); + mCloth.wakeUp(); +} + +} // namespace cloth +} // namespace physx diff --git a/PhysX_3.4/Source/LowLevelCloth/src/windows/CuCloth.h b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuCloth.h new file mode 100644 index 00000000..257d490c --- /dev/null +++ b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuCloth.h @@ -0,0 +1,216 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". 
NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 
+ +#pragma once + +#include "foundation/PxTransform.h" +#include "foundation/PxVec4.h" +#include "Range.h" +#include "PhaseConfig.h" +#include "MovingAverage.h" +#include "IndexPair.h" +#include "BoundingBox.h" +#include "Vec4T.h" +#include "CuPhaseConfig.h" +#include "CuPinnedAllocator.h" +#include "CuContextLock.h" +#include "CuDeviceVector.h" + +namespace physx +{ +namespace cloth +{ + +class CuFabric; +class CuFactory; +struct CuClothData; + +struct CuConstraints +{ + CuConstraints(physx::PxCudaContextManager* ctx) + : mStart(ctx), mTarget(ctx), mHostCopy(CuHostAllocator(ctx, cudaHostAllocMapped)) + { + } + + void pop() + { + if(!mTarget.empty()) + { + mStart.swap(mTarget); + mTarget.resize(0); + } + } + + CuDeviceVector<PxVec4> mStart; + CuDeviceVector<PxVec4> mTarget; + CuPinnedVector<PxVec4>::Type mHostCopy; +}; + +class CuCloth : protected CuContextLock +{ + public: + CuCloth& operator=(const CuCloth&); + typedef CuFactory FactoryType; + typedef CuFabric FabricType; + typedef CuContextLock ContextLockType; + + typedef CuPinnedVector<PxVec4>::Type& MappedVec4fVectorType; + typedef CuPinnedVector<IndexPair>::Type& MappedIndexVectorType; + + CuCloth(CuFactory&, CuFabric&, Range<const PxVec4>); + CuCloth(CuFactory&, const CuCloth&); + ~CuCloth(); // not virtual on purpose + + public: + bool isSleeping() const + { + return mSleepPassCounter >= mSleepAfterCount; + } + void wakeUp() + { + mSleepPassCounter = 0; + } + + void notifyChanged(); + + bool updateClothData(CuClothData&); // expects acquired context + uint32_t getSharedMemorySize() const; // without particle data + + // expects transformed configs, doesn't call notifyChanged() + void setPhaseConfig(Range<const PhaseConfig>); + + Range<PxVec4> push(CuConstraints&); + void clear(CuConstraints&); + + void syncDeviceParticles(); + void syncHostParticles(); + + Range<const PxVec3> clampTriangleCount(Range<const PxVec3>, uint32_t); + + public: + CuFactory& mFactory; + CuFabric& mFabric; + + bool mClothDataDirty; 
+ + // particle data + uint32_t mNumParticles; + CuDeviceVector<PxVec4> mParticles; // cur, prev + CuPinnedVector<PxVec4>::Type mParticlesHostCopy; + bool mDeviceParticlesDirty; + bool mHostParticlesDirty; + + PxVec3 mParticleBoundsCenter; + PxVec3 mParticleBoundsHalfExtent; + + PxVec3 mGravity; + PxVec3 mLogDamping; + PxVec3 mLinearLogDrag; + PxVec3 mAngularLogDrag; + PxVec3 mLinearInertia; + PxVec3 mAngularInertia; + PxVec3 mCentrifugalInertia; + float mSolverFrequency; + float mStiffnessFrequency; + + PxTransform mTargetMotion; + PxTransform mCurrentMotion; + PxVec3 mLinearVelocity; + PxVec3 mAngularVelocity; + + float mPrevIterDt; + MovingAverage mIterDtAvg; + + CuDeviceVector<CuPhaseConfig> mPhaseConfigs; // transformed! + Vector<PhaseConfig>::Type mHostPhaseConfigs; // transformed! + + // tether constraints stuff + float mTetherConstraintLogStiffness; + float mTetherConstraintScale; + + // motion constraints stuff + CuConstraints mMotionConstraints; + float mMotionConstraintScale; + float mMotionConstraintBias; + float mMotionConstraintLogStiffness; + + // separation constraints stuff + CuConstraints mSeparationConstraints; + + // particle acceleration stuff + CuDeviceVector<PxVec4> mParticleAccelerations; + CuPinnedVector<PxVec4>::Type mParticleAccelerationsHostCopy; + + // wind + PxVec3 mWind; + float mDragLogCoefficient; + float mLiftLogCoefficient; + + // collision stuff + CuPinnedVector<IndexPair>::Type mCapsuleIndices; + CuPinnedVector<PxVec4>::Type mStartCollisionSpheres; + CuPinnedVector<PxVec4>::Type mTargetCollisionSpheres; + CuPinnedVector<uint32_t>::Type mConvexMasks; + CuPinnedVector<PxVec4>::Type mStartCollisionPlanes; + CuPinnedVector<PxVec4>::Type mTargetCollisionPlanes; + CuPinnedVector<PxVec3>::Type mStartCollisionTriangles; + CuPinnedVector<PxVec3>::Type mTargetCollisionTriangles; + bool mEnableContinuousCollision; + float mCollisionMassScale; + float mFriction; + + // virtual particles + CuDeviceVector<uint32_t> mVirtualParticleSetSizes; + 
CuDeviceVector<Vec4us> mVirtualParticleIndices; + CuDeviceVector<PxVec4> mVirtualParticleWeights; + + // self collision + float mSelfCollisionDistance; + float mSelfCollisionLogStiffness; + + CuDeviceVector<PxVec4> mRestPositions; + CuDeviceVector<uint32_t> mSelfCollisionIndices; + Vector<uint32_t>::Type mSelfCollisionIndicesHost; + + // 4 (position) + 2 (key) per particle + cellStart (8322) + CuDeviceVector<float> mSelfCollisionData; + + // sleeping (see SwCloth for comments) + uint32_t mSleepTestInterval; + uint32_t mSleepAfterCount; + float mSleepThreshold; + uint32_t mSleepPassCounter; + uint32_t mSleepTestCounter; + + uint32_t mSharedMemorySize; + + void* mUserData; +}; +} +} diff --git a/PhysX_3.4/Source/LowLevelCloth/src/windows/CuClothClone.cpp b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuClothClone.cpp new file mode 100644 index 00000000..8b234968 --- /dev/null +++ b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuClothClone.cpp @@ -0,0 +1,83 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. 
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#include "ClothClone.h" + +#include "CuFactory.h" +#include "CuFabric.h" +#include "CuCloth.h" + +namespace physx +{ +namespace cloth +{ +Range<const PhaseConfig> getPhaseConfigs(const CuCloth& cloth) +{ + return makeRange(cloth.mHostPhaseConfigs); +} +void setPhaseConfigs(CuCloth& cloth, Range<const PhaseConfig> phaseConfigs) +{ + cloth.setPhaseConfig(phaseConfigs); +} +Range<const PxVec4> getParticleAccelerations(const CuCloth& cloth) +{ + return makeRange(cloth.mParticleAccelerationsHostCopy); +} +Range<const uint32_t> getSelfCollisionIndices(const CuCloth& cloth) +{ + return makeRange(cloth.mSelfCollisionIndicesHost); +} + +template <> +Cloth* ClothImpl<CuCloth>::clone(Factory& factory) const +{ + if(&mCloth.mFactory == &factory) + return new ClothImpl<CuCloth>(factory, *this); // copy construct directly + + switch(factory.getPlatform()) + { + case Factory::CPU: + return convertCloth(static_cast<SwFactory&>(factory), *this); + case Factory::CUDA: + return convertCloth(static_cast<CuFactory&>(factory), *this); + default: + return NULL; + } +} + +Cloth* CuFactory::clone(const Cloth& cloth) +{ + if(cloth.getFactory().getPlatform() == 
Factory::CPU) + return convertCloth(*this, static_cast<const SwClothImpl&>(cloth)); + + return cloth.clone(*this); +} + +} // namespace cloth +} // namespace physx diff --git a/PhysX_3.4/Source/LowLevelCloth/src/windows/CuClothData.cpp b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuClothData.cpp new file mode 100644 index 00000000..5a1485c6 --- /dev/null +++ b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuClothData.cpp @@ -0,0 +1,238 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. 
+// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#include "CuClothData.h" +#include "CuCloth.h" +#include "CuFabric.h" +#include "CuCheckSuccess.h" +#include "CuContextLock.h" +#include "IterationState.h" + +using namespace physx; + +cloth::CuClothData::CuClothData(CuCloth& cloth) +{ + mNumParticles = cloth.mNumParticles; + mParticles = array(*cloth.mParticles.begin().get()); + + mParticlesHostCopy = array(*getDevicePointer(cloth.mParticlesHostCopy)); + + mNumPhases = uint32_t(cloth.mPhaseConfigs.size()); + mPhaseConfigs = cloth.mPhaseConfigs.begin().get(); + + mTethers = cloth.mFabric.mTethers.begin().get(); + mNumTethers = uint32_t(cloth.mFabric.mTethers.size()); + mTetherConstraintScale = cloth.mTetherConstraintScale * cloth.mFabric.mTetherLengthScale; + + mTriangles = cloth.mFabric.mTriangles.begin().get(); + mNumTriangles = uint32_t(cloth.mFabric.mTriangles.size()) / 3; + + mMotionConstraintScale = cloth.mMotionConstraintScale; + mMotionConstraintBias = cloth.mMotionConstraintBias; + + mNumSpheres = uint32_t(cloth.mStartCollisionSpheres.size()); + mNumCapsules = uint32_t(cloth.mCapsuleIndices.size()); + mCapsuleIndices = getDevicePointer(cloth.mCapsuleIndices); + + mNumPlanes = uint32_t(cloth.mStartCollisionPlanes.size()); + mNumConvexes = uint32_t(cloth.mConvexMasks.size()); + mConvexMasks = getDevicePointer(cloth.mConvexMasks); + + mNumCollisionTriangles = uint32_t(cloth.mStartCollisionTriangles.size()) / 3; + + mVirtualParticleSetSizesBegin = cloth.mVirtualParticleSetSizes.begin().get(); + mVirtualParticleSetSizesEnd = mVirtualParticleSetSizesBegin + cloth.mVirtualParticleSetSizes.size(); + mVirtualParticleIndices = 
array(*cloth.mVirtualParticleIndices.begin().get()); + mVirtualParticleWeights = array(*cloth.mVirtualParticleWeights.begin().get()); + + mEnableContinuousCollision = cloth.mEnableContinuousCollision; + mCollisionMassScale = cloth.mCollisionMassScale; + mFrictionScale = cloth.mFriction; + + mSelfCollisionDistance = cloth.mSelfCollisionDistance; + mSelfCollisionIndices = cloth.mSelfCollisionIndices.empty() ? 0 : cloth.mSelfCollisionIndices.begin().get(); + mNumSelfCollisionIndices = mSelfCollisionIndices ? uint32_t(cloth.mSelfCollisionIndices.size()) : mNumParticles; + + if(!cloth.mSelfCollisionData.empty()) + { + uint32_t keySize = 2 * mNumSelfCollisionIndices; + uint32_t particleSize = 4 * mNumParticles; + + mSelfCollisionParticles = cloth.mSelfCollisionData.begin().get(); + mSelfCollisionKeys = (uint32_t*)(mSelfCollisionParticles + particleSize); + mSelfCollisionCellStart = (uint16_t*)(mSelfCollisionKeys + keySize); + } + else + { + mSelfCollisionParticles = 0; + mSelfCollisionKeys = 0; + mSelfCollisionCellStart = 0; + } + + mSleepTestInterval = cloth.mSleepTestInterval; + mSleepAfterCount = cloth.mSleepAfterCount; + mSleepThreshold = cloth.mSleepThreshold; +} + +cloth::CuFrameData::CuFrameData(CuCloth& cloth, uint32_t numSharedPositions, const IterationState<Simd4f>& state, + const CuIterationData* iterationData) +{ + mDeviceParticlesDirty = cloth.mDeviceParticlesDirty; + + mNumSharedPositions = numSharedPositions; + + mIterDt = state.mIterDt; + mNumIterations = state.mRemainingIterations; + mIterationData = iterationData; + + Simd4f logStiffness = simd4f(0.0f, cloth.mSelfCollisionLogStiffness, cloth.mMotionConstraintLogStiffness, + cloth.mTetherConstraintLogStiffness); + Simd4f stiffnessExponent = simd4f(cloth.mStiffnessFrequency * mIterDt); + Simd4f stiffness = gSimd4fOne - exp2(logStiffness * stiffnessExponent); + + mTetherConstraintStiffness = array(stiffness)[3]; + mMotionConstraintStiffness = array(stiffness)[2]; + mSelfCollisionStiffness = 
array(stiffness)[1]; + + logStiffness = simd4f(cloth.mDragLogCoefficient, cloth.mLiftLogCoefficient, 0.0f, 0.0f); + stiffness = gSimd4fOne - exp2(logStiffness * stiffnessExponent); + mDragCoefficient = array(stiffness)[0]; + mLiftCoefficient = array(stiffness)[1]; + for(int i = 0; i < 9; ++i) + mRotation[i] = array(state.mRotationMatrix[i / 3])[i % 3]; + + mTargetMotionConstraints = 0; + if(!cloth.mMotionConstraints.mStart.empty()) + { + mTargetMotionConstraints = array(*cloth.mMotionConstraints.mStart.begin().get()); + } + + mStartMotionConstraints = mTargetMotionConstraints; + if(!cloth.mMotionConstraints.mTarget.empty()) + { + mTargetMotionConstraints = array(*cloth.mMotionConstraints.mTarget.begin().get()); + } + + mHostMotionConstraints = array(*getDevicePointer(cloth.mMotionConstraints.mHostCopy)); + + mTargetSeparationConstraints = 0; + if(!cloth.mSeparationConstraints.mStart.empty()) + { + mTargetSeparationConstraints = array(*cloth.mSeparationConstraints.mStart.begin().get()); + } + + mStartSeparationConstraints = mTargetSeparationConstraints; + if(!cloth.mSeparationConstraints.mTarget.empty()) + { + mTargetSeparationConstraints = array(*cloth.mSeparationConstraints.mTarget.begin().get()); + } + + mHostSeparationConstraints = array(*getDevicePointer(cloth.mSeparationConstraints.mHostCopy)); + + mParticleAccelerations = 0; + if(!cloth.mParticleAccelerations.empty()) + { + mParticleAccelerations = array(*cloth.mParticleAccelerations.begin().get()); + } + + mHostParticleAccelerations = array(*getDevicePointer(cloth.mParticleAccelerationsHostCopy)); + + mRestPositions = 0; + if(!cloth.mRestPositions.empty()) + { + mRestPositions = array(*cloth.mRestPositions.begin().get()); + } + + mStartCollisionSpheres = array(*getDevicePointer(cloth.mStartCollisionSpheres)); + mTargetCollisionSpheres = array(*getDevicePointer(cloth.mTargetCollisionSpheres)); + + if(!mTargetCollisionSpheres) + mTargetCollisionSpheres = mStartCollisionSpheres; + + mStartCollisionPlanes = 
array(*getDevicePointer(cloth.mStartCollisionPlanes)); + mTargetCollisionPlanes = array(*getDevicePointer(cloth.mTargetCollisionPlanes)); + + if(!mTargetCollisionPlanes) + mTargetCollisionPlanes = mStartCollisionPlanes; + + mStartCollisionTriangles = array(*getDevicePointer(cloth.mStartCollisionTriangles)); + mTargetCollisionTriangles = array(*getDevicePointer(cloth.mTargetCollisionTriangles)); + + if(!mTargetCollisionTriangles) + mTargetCollisionTriangles = mStartCollisionTriangles; + + for(uint32_t i = 0; i < 3; ++i) + { + float c = cloth.mParticleBoundsCenter[i]; + float r = cloth.mParticleBoundsHalfExtent[i]; + mParticleBounds[i * 2 + 0] = r + c; + mParticleBounds[i * 2 + 1] = r - c; + } + + mSleepPassCounter = cloth.mSleepPassCounter; + mSleepTestCounter = cloth.mSleepTestCounter; + + mStiffnessExponent = cloth.mStiffnessFrequency * mIterDt; +} + +namespace +{ +void copySquareTransposed(float* dst, const float* src) +{ + dst[0] = src[0]; + dst[1] = src[4]; + dst[2] = src[8]; + dst[3] = src[1]; + dst[4] = src[5]; + dst[5] = src[9]; + dst[6] = src[2]; + dst[7] = src[6]; + dst[8] = src[10]; +} +} + +cloth::CuIterationData::CuIterationData(const IterationState<Simd4f>& state) +{ + mIntegrationTrafo[0] = array(state.mPrevBias)[0]; + mIntegrationTrafo[1] = array(state.mPrevBias)[1]; + mIntegrationTrafo[2] = array(state.mPrevBias)[2]; + + mIntegrationTrafo[3] = array(state.mCurBias)[0]; + mIntegrationTrafo[4] = array(state.mCurBias)[1]; + mIntegrationTrafo[5] = array(state.mCurBias)[2]; + + copySquareTransposed(mIntegrationTrafo + 6, array(*state.mPrevMatrix)); + copySquareTransposed(mIntegrationTrafo + 15, array(*state.mCurMatrix)); + + mWind[0] = array(state.mWind)[0]; + mWind[1] = array(state.mWind)[1]; + mWind[2] = array(state.mWind)[2]; + + mIsTurning = state.mIsTurning ? 
0x3F800000u : 0; // 1.0f to avoid ftz +} diff --git a/PhysX_3.4/Source/LowLevelCloth/src/windows/CuClothData.h b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuClothData.h new file mode 100644 index 00000000..0be66742 --- /dev/null +++ b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuClothData.h @@ -0,0 +1,191 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. 
All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#pragma once + +#include "Types.h" +#ifndef __CUDACC__ +#include "Simd.h" +#endif + +namespace physx +{ +namespace cloth +{ + +class CuCloth; +struct CuPhaseConfig; +template <typename> +struct IterationState; +struct IndexPair; +struct CuIterationData; +struct CuTether; + +// reference to cloth instance bulk data (POD) +// should not need frequent updates (stored on device) +struct CuClothData +{ + CuClothData() + { + } + CuClothData(CuCloth&); + + // particle data + uint32_t mNumParticles; + float* mParticles; + float* mParticlesHostCopy; + + // fabric constraints + uint32_t mNumPhases; + const CuPhaseConfig* mPhaseConfigs; + + const CuTether* mTethers; + uint32_t mNumTethers; + float mTetherConstraintScale; + + const uint16_t* mTriangles; + uint32_t mNumTriangles; + + // motion constraint data + float mMotionConstraintScale; + float mMotionConstraintBias; + + // collision data + uint32_t mNumSpheres; // don't change this order, it's + uint32_t mNumCapsules; // needed by mergeAcceleration() + const IndexPair* mCapsuleIndices; + uint32_t mNumPlanes; + uint32_t mNumConvexes; + const uint32_t* mConvexMasks; + uint32_t mNumCollisionTriangles; + + // virtual particle data + const uint32_t* mVirtualParticleSetSizesBegin; + const uint32_t* mVirtualParticleSetSizesEnd; + const uint16_t* mVirtualParticleIndices; + const float* mVirtualParticleWeights; + + bool mEnableContinuousCollision; + float mCollisionMassScale; + float mFrictionScale; + + float mSelfCollisionDistance; + uint32_t mNumSelfCollisionIndices; + const uint32_t* mSelfCollisionIndices; + float* mSelfCollisionParticles; + uint32_t* mSelfCollisionKeys; + uint16_t* mSelfCollisionCellStart; + + // sleep data + uint32_t mSleepTestInterval; + uint32_t mSleepAfterCount; + float mSleepThreshold; +}; + +// per-frame data (stored in pinned memory) +struct CuFrameData +{ + CuFrameData() + { + } // not initializing pointers to 0! 
+ +#ifndef __CUDACC__ + explicit CuFrameData(CuCloth&, uint32_t, const IterationState<Simd4f>&, const CuIterationData*); +#endif + + bool mDeviceParticlesDirty; + + // number of particle copies that fit in shared memory (0, 1, or 2) + uint32_t mNumSharedPositions; + + // iteration data + float mIterDt; + uint32_t mNumIterations; + const CuIterationData* mIterationData; + + float mTetherConstraintStiffness; + + // wind data + float mDragCoefficient; + float mLiftCoefficient; + float mRotation[9]; + + // motion constraint data + const float* mStartMotionConstraints; + float* mTargetMotionConstraints; + const float* mHostMotionConstraints; + float mMotionConstraintStiffness; + + // separation constraint data + const float* mStartSeparationConstraints; + float* mTargetSeparationConstraints; + const float* mHostSeparationConstraints; + + // particle acceleration data + float* mParticleAccelerations; + const float* mHostParticleAccelerations; + + // rest positions + const float* mRestPositions; + + // collision data + const float* mStartCollisionSpheres; + const float* mTargetCollisionSpheres; + const float* mStartCollisionPlanes; + const float* mTargetCollisionPlanes; + const float* mStartCollisionTriangles; + const float* mTargetCollisionTriangles; + + float mSelfCollisionStiffness; + + float mParticleBounds[6]; // maxX, -minX, maxY, ... + + uint32_t mSleepPassCounter; + uint32_t mSleepTestCounter; + + float mStiffnessExponent; +}; + +// per-iteration data (stored in pinned memory) +struct CuIterationData +{ + CuIterationData() + { + } // not initializing! 
+ +#ifndef __CUDACC__ + explicit CuIterationData(const IterationState<Simd4f>&); +#endif + + float mIntegrationTrafo[24]; + float mWind[3]; + uint32_t mIsTurning; +}; +} +} diff --git a/PhysX_3.4/Source/LowLevelCloth/src/windows/CuCollision.h b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuCollision.h new file mode 100644 index 00000000..cd28a999 --- /dev/null +++ b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuCollision.h @@ -0,0 +1,1505 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. 
+// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#pragma once + +#ifndef CU_SOLVER_KERNEL_CU +#error include CuCollision.h only from CuSolverKernel.cu +#endif + +#include "IndexPair.h" + +namespace +{ +struct CuCollision +{ + struct ShapeMask + { + uint32_t mSpheres; + uint32_t mCones; + + __device__ friend ShapeMask& operator&=(ShapeMask& left, const ShapeMask& right) + { + left.mSpheres = left.mSpheres & right.mSpheres; + left.mCones = left.mCones & right.mCones; + return left; + } + }; + + struct CollisionData + { + Pointer<Shared, float> mSphereX; + Pointer<Shared, float> mSphereY; + Pointer<Shared, float> mSphereZ; + Pointer<Shared, float> mSphereW; + + Pointer<Shared, float> mConeCenterX; + Pointer<Shared, float> mConeCenterY; + Pointer<Shared, float> mConeCenterZ; + Pointer<Shared, float> mConeRadius; + Pointer<Shared, float> mConeAxisX; + Pointer<Shared, float> mConeAxisY; + Pointer<Shared, float> mConeAxisZ; + Pointer<Shared, float> mConeSlope; + Pointer<Shared, float> mConeSqrCosine; + Pointer<Shared, float> mConeHalfLength; + }; + + public: + __device__ CuCollision(Pointer<Shared, uint32_t>); + + template <typename CurrentT, typename PreviousT> + __device__ void operator()(CurrentT& current, PreviousT& previous, float alpha); + + private: + __device__ void buildSphereAcceleration(const CollisionData&); + __device__ void buildConeAcceleration(); + __device__ void mergeAcceleration(); + + template <typename CurrentT> + __device__ bool buildAcceleration(const CurrentT&, float); + + __device__ static ShapeMask readShapeMask(const float&, Pointer<Shared, const uint32_t>); + template <typename CurPos> + __device__ ShapeMask 
getShapeMask(const CurPos&) const; + template <typename PrevPos, typename CurPos> + __device__ ShapeMask getShapeMask(const PrevPos&, const CurPos&) const; + + template <typename CurPos> + __device__ int32_t collideCapsules(const CurPos&, float3&, float3&) const; + template <typename PrevPos, typename CurPos> + __device__ int32_t collideCapsules(const PrevPos&, CurPos&, float3&, float3&) const; + + template <typename CurrentT, typename PreviousT> + __device__ void collideCapsules(CurrentT& current, PreviousT& previous) const; + template <typename CurrentT, typename PreviousT> + __device__ void collideVirtualCapsules(CurrentT& current, PreviousT& previous) const; + template <typename CurrentT, typename PreviousT> + __device__ void collideContinuousCapsules(CurrentT& current, PreviousT& previous) const; + + template <typename CurrentT, typename PreviousT> + __device__ void collideConvexes(CurrentT& current, PreviousT& previous, float alpha); + template <typename CurPos> + __device__ int32_t collideConvexes(const CurPos&, float3&) const; + + template <typename CurrentT> + __device__ void collideTriangles(CurrentT& current, float alpha); + template <typename CurrentT> + __device__ void collideTriangles(CurrentT& current, int32_t i); + + public: + Pointer<Shared, uint32_t> mCapsuleIndices; + Pointer<Shared, uint32_t> mCapsuleMasks; + Pointer<Shared, uint32_t> mConvexMasks; + + CollisionData mPrevData; + CollisionData mCurData; + + // acceleration structure + Pointer<Shared, uint32_t> mShapeGrid; + float mGridScale[3]; + float mGridBias[3]; + static const uint32_t sGridSize = 8; +}; + +template <typename T> +__device__ void swap(T& a, T& b) +{ + T c = a; + a = b; + b = c; +} +} + +__shared__ uninitialized<CuCollision> gCollideParticles; + +namespace +{ +// initializes one pointer past data! 
// Fan out the 15 consecutive Pointer members starting at data.mSphereX into
// slices of one flat shared-memory buffer (thread i sets pointer i). The first
// four slices are mNumSpheres long (sphere x/y/z/w arrays), the remaining ones
// mNumCapsules long (cone arrays) — hence the min(threadIdx.x, 4) term.
// The 15th write lands one pointer past the struct (see note above): for
// mPrevData that initializes mCurData.mSphereX, for mCurData it initializes
// mShapeGrid. Caller must __syncthreads() before using the pointers.
__device__ void allocate(CuCollision::CollisionData& data)
{
	if(threadIdx.x < 15)
	{
		Pointer<Shared, float>* ptr = &data.mSphereX;
		ptr[threadIdx.x] = *ptr + threadIdx.x * gClothData.mNumCapsules +
		                   min(threadIdx.x, 4) * (gClothData.mNumSpheres - gClothData.mNumCapsules);
	}
}

// Interpolate collision spheres between start and target frame by 'alpha' and
// transpose from interleaved xyzw (global memory) to structure-of-arrays
// (shared memory). One thread per component; radius (w) is clamped to >= 0.
__device__ void generateSpheres(CuCollision::CollisionData& data, float alpha)
{
	// interpolate spheres and transpose
	if(threadIdx.x < gClothData.mNumSpheres * 4)
	{
		float start = __ldg(gFrameData.mStartCollisionSpheres + threadIdx.x);
		float target = __ldg(gFrameData.mTargetCollisionSpheres + threadIdx.x);
		float value = start + (target - start) * alpha;
		if(threadIdx.x % 4 == 3)
			value = max(value, 0.0f); // w component is the radius, keep non-negative
		// transpose: component-major layout, stride mNumSpheres
		int32_t j = threadIdx.x % 4 * gClothData.mNumSpheres + threadIdx.x / 4;
		data.mSphereX[j] = value;
	}

	__syncthreads();
}

// Build per-capsule cone data (center, axis, radius at center, slope,
// squared cosine of the opening angle, half length) from the two spheres
// referenced by iIt[0]/iIt[1]. One thread per capsule. A degenerate cone
// (one sphere inside the other, sqrConeLength <= 0) is flagged by zeroing
// the inverse lengths so radius/axis collapse to zero.
__device__ void generateCones(CuCollision::CollisionData& data, Pointer<Shared, const uint32_t> iIt)
{
	// generate cones
	if(threadIdx.x < gClothData.mNumCapsules)
	{
		uint32_t firstIndex = iIt[0];
		uint32_t secondIndex = iIt[1];

		float firstX = data.mSphereX[firstIndex];
		float firstY = data.mSphereY[firstIndex];
		float firstZ = data.mSphereZ[firstIndex];
		float firstW = data.mSphereW[firstIndex];

		float secondX = data.mSphereX[secondIndex];
		float secondY = data.mSphereY[secondIndex];
		float secondZ = data.mSphereZ[secondIndex];
		float secondW = data.mSphereW[secondIndex];

		// half-vector from first to second sphere; w carries the radius delta
		float axisX = (secondX - firstX) * 0.5f;
		float axisY = (secondY - firstY) * 0.5f;
		float axisZ = (secondZ - firstZ) * 0.5f;
		float axisW = (secondW - firstW) * 0.5f;

		float sqrAxisLength = axisX * axisX + axisY * axisY + axisZ * axisZ;
		float sqrConeLength = sqrAxisLength - axisW * axisW;

		float invAxisLength = rsqrtf(sqrAxisLength);
		float invConeLength = rsqrtf(sqrConeLength);

		if(sqrConeLength <= 0.0f)
			invAxisLength = invConeLength = 0.0f; // degenerate: disable the cone

		float axisLength = sqrAxisLength * invAxisLength;

		data.mConeCenterX[threadIdx.x] = (secondX + firstX) * 0.5f;
		data.mConeCenterY[threadIdx.x] = (secondY + firstY) * 0.5f;
		data.mConeCenterZ[threadIdx.x] = (secondZ + firstZ) * 0.5f;
		data.mConeRadius[threadIdx.x] = (axisW + firstW) * invConeLength * axisLength;

		data.mConeAxisX[threadIdx.x] = axisX * invAxisLength;
		data.mConeAxisY[threadIdx.x] = axisY * invAxisLength;
		data.mConeAxisZ[threadIdx.x] = axisZ * invAxisLength;
		data.mConeSlope[threadIdx.x] = axisW * invConeLength;

		float sine = axisW * invAxisLength;
		data.mConeSqrCosine[threadIdx.x] = 1 - sine * sine;
		data.mConeHalfLength[threadIdx.x] = axisLength;
	}

	__syncthreads();
}
}

// Lay out the shared-memory scratch area:
//   [0, 2*numCapsules)              capsule sphere indices (mCapsuleIndices)
//   [2*numCapsules, 4*numCapsules)  capsule sphere masks (mCapsuleMasks)
//   [4*numCapsules, +numConvexes)   convex plane masks (mConvexMasks)
//   thereafter                      sphere/cone SoA data and mShapeGrid
// Threads 0..2 seed three base pointers via member-array indexing; the
// (&mShapeGrid)[-14 * t] trick addresses mShapeGrid (t=0), mCurData.mSphereX
// (t=1) and mPrevData.mSphereX (t=2), stepping back over the 14 pointer
// members of each CollisionData — this depends on member layout, see class.
__device__ CuCollision::CuCollision(Pointer<Shared, uint32_t> scratchPtr)
{
	int32_t numCapsules2 = 2 * gClothData.mNumCapsules;
	int32_t numCapsules4 = 4 * gClothData.mNumCapsules;
	int32_t numConvexes = gClothData.mNumConvexes;

	if(threadIdx.x < 3)
	{
		(&mCapsuleIndices)[threadIdx.x] = scratchPtr + threadIdx.x * numCapsules2;
		(&mShapeGrid)[-14 * int32_t(threadIdx.x)] = scratchPtr + numCapsules4 + numConvexes;
	}

	Pointer<Shared, uint32_t> indexPtr = scratchPtr + threadIdx.x;
	if(threadIdx.x < numCapsules2)
	{
		// copy capsule sphere indices and build per-capsule sphere bitmasks;
		// odd threads OR in the even neighbor's bit so mask[2j+1] covers both spheres
		uint32_t index = (&gClothData.mCapsuleIndices->first)[threadIdx.x];
		*indexPtr = index;

		volatile uint32_t* maskPtr = generic(indexPtr + numCapsules2);
		*maskPtr = 1u << index;
		*maskPtr |= maskPtr[-int32_t(threadIdx.x & 1)];
	}
	indexPtr += numCapsules4;

	if(threadIdx.x < numConvexes)
		*indexPtr = gClothData.mConvexMasks[threadIdx.x];

	if(gClothData.mEnableContinuousCollision || gClothData.mFrictionScale > 0.0f)
	{
		// ccd/friction need last iteration's shapes: build them once up front
		// (allocate(mPrevData) also seeds mCurData.mSphereX, one past the struct)
		allocate(mPrevData);

		__syncthreads(); // mPrevData raw hazard

		generateSpheres(mPrevData, 0.0f);
		generateCones(mPrevData, mCapsuleIndices + 2 * threadIdx.x);
	}

	allocate(mCurData); // also initializes mShapeGrid (!)
}

// Per-iteration collision entry point: restores inverse masses, runs convex and
// triangle collision, then (if the particle/shape bounds overlap) capsule
// collision — continuous or discrete — plus virtual-particle capsules, and
// finally copies current shape data into mPrevData for the next iteration.
template <typename CurrentT, typename PreviousT>
__device__ void CuCollision::operator()(CurrentT& current, PreviousT& previous, float alpha)
{
	// if(current.w > 0) current.w = previous.w (see SwSolverKernel::computeBounds())
	for(int32_t i = threadIdx.x; i < gClothData.mNumParticles; i += blockDim.x)
	{
		if(current(i, 3) > 0.0f)
			current(i, 3) = previous(i, 3);
	}

	collideConvexes(current, previous, alpha);
	collideTriangles(current, alpha);

	if(buildAcceleration(current, alpha))
	{
		if(gClothData.mEnableContinuousCollision)
			collideContinuousCapsules(current, previous);
		else
			collideCapsules(current, previous);

		collideVirtualCapsules(current, previous);
	}

	// sync otherwise first threads overwrite sphere data before
	// remaining ones have had a chance to use it leading to incorrect
	// velocity calculation for friction / ccd

	__syncthreads();

	if(gClothData.mEnableContinuousCollision || gClothData.mFrictionScale > 0.0f)
	{
		// store current collision data for next iteration
		Pointer<Shared, float> dstIt = mPrevData.mSphereX + threadIdx.x;
		Pointer<Shared, const float> srcIt = mCurData.mSphereX + threadIdx.x;
		for(; dstIt < mCurData.mSphereX; dstIt += blockDim.x, srcIt += blockDim.x)
			*dstIt = *srcIt;
	}

	// __syncthreads() called in updateSleepState()
}

// build per-axis mask arrays of spheres on the right/left of grid cell
// 192 threads: 32 sphere lanes x 3 axes x 2 sign bits (bit 5 of threadIdx.x is
// shifted into the float sign bit via << 26). Each warp ballots, per grid
// cell, which spheres extend past that cell boundary along its axis/side.
__device__ void CuCollision::buildSphereAcceleration(const CollisionData& data)
{
	if(threadIdx.x >= 192)
		return;

	int32_t sphereIdx = threadIdx.x & 31;
	int32_t axisIdx = threadIdx.x >> 6;              // coordinate index (x, y, or z)
	int32_t signi = threadIdx.x << 26 & 0x80000000;  // sign bit (min or max)

	float signf = copysignf(1.0f, reinterpret_cast<const float&>(signi));
	// sphere extent along this axis: center coordinate +/- radius
	float pos = signf * data.mSphereW[sphereIdx] + data.mSphereX[sphereIdx + gClothData.mNumSpheres * axisIdx];

	// use overflow so we can test for non-positive
	uint32_t index = signi - uint32_t(floorf(pos * mGridScale[axisIdx] + mGridBias[axisIdx]));

	// min-side results go into the second bank of 3*sGridSize entries
	axisIdx += (uint32_t(signi) >> 31) * 3;
	Pointer<Shared, uint32_t> dst = mShapeGrid + sGridSize * axisIdx;
	// #pragma unroll
	for(int32_t i = 0; i < sGridSize; ++i, ++index)
		dst[i] |= __ballot(int32_t(index) <= 0);
}

// generate cone masks from sphere masks
// A cone occupies a cell if either of its spheres does (mCapsuleMasks[2j+1]
// covers both); degenerate cones (radius == 0) are skipped. Cone masks live
// 6*sGridSize entries after the corresponding sphere masks.
__device__ void CuCollision::buildConeAcceleration()
{
	if(threadIdx.x >= 192)
		return;

	int32_t coneIdx = threadIdx.x & 31;

	uint32_t sphereMask =
	    mCurData.mConeRadius[coneIdx] && coneIdx < gClothData.mNumCapsules ? mCapsuleMasks[2 * coneIdx + 1] : 0;

	int32_t offset = threadIdx.x / 32 * sGridSize;
	Pointer<Shared, uint32_t> src = mShapeGrid + offset;
	Pointer<Shared, uint32_t> dst = src + 6 * sGridSize;

	// #pragma unroll
	for(int32_t i = 0; i < sGridSize; ++i)
		dst[i] |= __ballot(src[i] & sphereMask);
}

// convert right/left mask arrays into single overlap array
// AND the max-side mask with the min-side mask (sGridSize*3 further on) so each
// cell holds the set of shapes overlapping it; skipped under ccd, which keeps
// both sides separate. Finally clear bits beyond the actual shape counts.
__device__ void CuCollision::mergeAcceleration()
{
	if(threadIdx.x < sGridSize * 12)
	{
		Pointer<Shared, uint32_t> dst = mShapeGrid + threadIdx.x;
		if(!(gClothData.mEnableContinuousCollision || threadIdx.x * 43 & 1024))
			*dst &= dst[sGridSize * 3]; // above is same as 'threadIdx.x/24 & 1'

		// mask garbage bits from build*Acceleration
		int32_t shapeIdx = threadIdx.x >= sGridSize * 6; // spheres=0, cones=1
		*dst &= (1 << (&gClothData.mNumSpheres)[shapeIdx]) - 1;
	}
}

namespace
{
#if __CUDA_ARCH__ >= 300
// warp-wide max reduction of the 32 values starting at 'buffer' (shuffle path)
__device__ float mergeBounds(Pointer<Shared, float> buffer)
{
	float value = *buffer;
	value = max(value, __shfl_down(value, 1));
	value = max(value, __shfl_down(value, 2));
	value = max(value, __shfl_down(value, 4));
	value = max(value, __shfl_down(value, 8));
	return max(value, __shfl_down(value, 16));
}
#else
// warp-wide max reduction through shared memory (pre-Kepler fallback)
__device__ float mergeBounds(Pointer<Shared, float> buffer)
{
	// ensure that writes to buffer are visible to all threads
	__threadfence_block();

	volatile float* ptr = generic(buffer);
	*ptr = max(*ptr, ptr[16]);
	*ptr = max(*ptr, ptr[8]);
	*ptr = max(*ptr, ptr[4]);
	*ptr = max(*ptr, ptr[2]);
	return max(*ptr, ptr[1]);
}
#endif
// computes maxX, -minX, maxY, ... with a stride of 32, threadIdx.x must be < 192
__device__ float computeSphereBounds(const CuCollision::CollisionData& data, Pointer<Shared, float> buffer)
{
	assert(threadIdx.x < 192);

	int32_t sphereIdx = min(threadIdx.x & 31, gClothData.mNumSpheres - 1); // sphere index
	int32_t axisIdx = threadIdx.x >> 6;                                    // coordinate index (x, y, or z)
	int32_t signi = threadIdx.x << 26;                                     // sign bit (min or max)
	float signf = copysignf(1.0f, reinterpret_cast<const float&>(signi));

	// signed extent: radius + (+/-)coordinate, so the max over the warp is
	// the axis max (positive sign) or the negated axis min (negative sign)
	*buffer = data.mSphereW[sphereIdx] + signf * data.mSphereX[sphereIdx + gClothData.mNumSpheres * axisIdx];

	return mergeBounds(buffer);
}

#if __CUDA_ARCH__ >= 300
// Particle AABB: each third of the block strides over one coordinate array,
// reduces min/max within warps via shuffles, then warp leaders stage partials
// in 'buffer' for a final cross-warp max. Returns maxX,-minX,maxY,... per
// 32-thread group for threadIdx.x < 192, else 0.
template <typename CurrentT>
__device__ float computeParticleBounds(const CurrentT& current, Pointer<Shared, float> buffer)
{
	int32_t numThreadsPerAxis = blockDim.x * 342 >> 10 & ~31; // same as / 3
	int32_t axis = (threadIdx.x >= numThreadsPerAxis) + (threadIdx.x >= 2 * numThreadsPerAxis);
	int32_t threadIdxInAxis = threadIdx.x - axis * numThreadsPerAxis;
	int laneIdx = threadIdx.x & 31;

	if(threadIdxInAxis < numThreadsPerAxis)
	{
		typename CurrentT::ConstPointerType posIt = current[axis];
		int32_t i = min(threadIdxInAxis, gClothData.mNumParticles - 1);
		float minX = posIt[i], maxX = minX;
		while(i += numThreadsPerAxis, i < gClothData.mNumParticles)
		{
			float posX = posIt[i];
			minX = min(minX, posX);
			maxX = max(maxX, posX);
		}

		// intra-warp min/max reduction
		minX = min(minX, __shfl_down(minX, 1));
		maxX = max(maxX, __shfl_down(maxX, 1));
		minX = min(minX, __shfl_down(minX, 2));
		maxX = max(maxX, __shfl_down(maxX, 2));
		minX = min(minX, __shfl_down(minX, 4));
		maxX = max(maxX, __shfl_down(maxX, 4));
		minX = min(minX, __shfl_down(minX, 8));
		maxX = max(maxX, __shfl_down(maxX, 8));
		minX = min(minX, __shfl_down(minX, 16));
		maxX = max(maxX, __shfl_down(maxX, 16));

		if(!laneIdx)
		{
			// stage per-warp partials: max at +0, negated min at +32
			Pointer<Shared, float> dst = buffer - threadIdx.x + (threadIdxInAxis >> 5) + (axis << 6);
			dst[0] = maxX;
			dst[32] = -minX;
		}
	}

	__syncthreads();

	if(threadIdx.x >= 192)
		return 0.0f;

	float value = *buffer;
	if(laneIdx >= (numThreadsPerAxis >> 5))
		value = -FLT_MAX; // lanes beyond the number of partials don't contribute

	// blockDim.x <= 3*512, increase to 3*1024 by adding a shfl by 16
	assert(numThreadsPerAxis <= 16 * 32);

	value = max(value, __shfl_down(value, 1));
	value = max(value, __shfl_down(value, 2));
	value = max(value, __shfl_down(value, 4));
	return max(value, __shfl_down(value, 8));
}
#else
// Particle AABB, pre-Kepler: 192 threads (32 lanes x 3 axes x 2 signs) stride
// the coordinate array with the sign folded in, then reduce via mergeBounds.
template <typename CurrentT>
__device__ float computeParticleBounds(const CurrentT& current, Pointer<Shared, float> buffer)
{
	if(threadIdx.x >= 192)
		return 0.0f;

	int32_t axisIdx = threadIdx.x >> 6; // x, y, or z
	int32_t signi = threadIdx.x << 26;  // sign bit (min or max)
	float signf = copysignf(1.0f, reinterpret_cast<const float&>(signi));

	typename CurrentT::ConstPointerType pIt = current[axisIdx];
	typename CurrentT::ConstPointerType pEnd = pIt + gClothData.mNumParticles;
	pIt += min(threadIdx.x & 31, gClothData.mNumParticles - 1);

	*buffer = *pIt * signf;
	while(pIt += 32, pIt < pEnd)
		*buffer = max(*buffer, *pIt * signf);

	return mergeBounds(buffer);
}
#endif
}

// build mask of spheres/cones touching a regular grid along each axis
// Returns false (skip capsule collision) when there are no spheres or the
// particle and shape bounds don't intersect; always publishes the particle
// bounds to gFrameData and regenerates cone data for the next iteration's ccd.
template <typename CurrentT>
__device__ bool CuCollision::buildAcceleration(const CurrentT& current, float alpha)
{
	// use still unused cone data as buffer for bounds computation
	Pointer<Shared, float> buffer = mCurData.mConeCenterX + threadIdx.x;
	float curParticleBounds = computeParticleBounds(current, buffer);
	int32_t warpIdx = threadIdx.x >> 5;

	if(!gClothData.mNumSpheres)
	{
		if(threadIdx.x < 192 && !(threadIdx.x & 31))
			gFrameData.mParticleBounds[warpIdx] = curParticleBounds;
		return false;
	}

	generateSpheres(mCurData, alpha);

	if(threadIdx.x < 192)
	{
		float sphereBounds = computeSphereBounds(mCurData, buffer);
		float particleBounds = curParticleBounds;
		if(gClothData.mEnableContinuousCollision)
		{
			// ccd must cover the previous shapes and particle positions as well
			sphereBounds = max(sphereBounds, computeSphereBounds(mPrevData, buffer));
			float prevParticleBounds = gFrameData.mParticleBounds[warpIdx];
			particleBounds = max(particleBounds, prevParticleBounds);
		}

		// intersection of the two AABBs (values are max / negated min, so min() intersects)
		float bounds = min(sphereBounds, particleBounds);
		float expandedBounds = bounds + abs(bounds) * 1e-4f;

		// store bounds data in shared memory
		if(!(threadIdx.x & 31))
		{
			mGridScale[warpIdx] = expandedBounds;
			gFrameData.mParticleBounds[warpIdx] = curParticleBounds;
		}
	}

	__syncthreads(); // mGridScale raw hazard

	if(threadIdx.x < 3)
	{
		// turn (max, -min) pairs into a scale/bias mapping positions to [0, sGridSize)
		float negativeLower = mGridScale[threadIdx.x * 2 + 1];
		float edgeLength = mGridScale[threadIdx.x * 2] + negativeLower;
		float divisor = max(edgeLength, FLT_EPSILON);
		mGridScale[threadIdx.x] = __fdividef(sGridSize - 1e-3, divisor);
		mGridBias[threadIdx.x] = negativeLower * mGridScale[threadIdx.x];
		if(edgeLength < 0.0f)
			mGridScale[0] = 0.0f; // mark empty intersection
	}

	// initialize sphere *and* cone grid to 0
	if(threadIdx.x < 2 * 6 * sGridSize)
		mShapeGrid[threadIdx.x] = 0;

	__syncthreads(); // mGridScale raw hazard

	// generate cones even if test below fails because
	// continuous collision might need it in next iteration
	generateCones(mCurData, mCapsuleIndices + 2 * threadIdx.x);

	if(mGridScale[0] == 0.0f)
		return false; // early out for empty intersection

	if(gClothData.mEnableContinuousCollision)
		buildSphereAcceleration(mPrevData);
	buildSphereAcceleration(mCurData);
	__syncthreads(); // mCurData raw hazard

	buildConeAcceleration();
	__syncthreads(); // mShapeGrid raw hazard

	mergeAcceleration();
	__syncthreads(); // mShapeGrid raw hazard

	return true;
}

// Fetch the sphere and cone masks for the grid cell containing 'position'
// (already in grid coordinates). Out-of-range cells (index >= sGridSize after
// the unsigned compare, which also catches negative indices) yield empty masks.
__device__ CuCollision::ShapeMask CuCollision::readShapeMask(const float& position,
                                                             Pointer<Shared, const uint32_t> sphereGrid)
{
	ShapeMask result;
	int32_t index = int32_t(floorf(position));
	uint32_t outMask = (index < sGridSize) - 1;

	Pointer<Shared, const uint32_t> gridPtr = sphereGrid + (index & sGridSize - 1);
	result.mSpheres = gridPtr[0] & ~outMask;
	result.mCones = gridPtr[sGridSize * 6] & ~outMask;

	return result;
}

// lookup acceleration structure and return mask of potential intersectors
template <typename CurPos>
__device__ CuCollision::ShapeMask CuCollision::getShapeMask(const CurPos& positions) const
{
	ShapeMask result;

	// intersect the per-axis cell masks (x, y, z banks are 8 entries apart)
	result = readShapeMask(positions.x * mGridScale[0] + mGridBias[0], mShapeGrid);
	result &= readShapeMask(positions.y * mGridScale[1] + mGridBias[1], mShapeGrid + 8);
	result &= readShapeMask(positions.z * mGridScale[2] + mGridBias[2], mShapeGrid + 16);

	return result;
}

// ccd variant: intersect masks over the whole prev->cur segment, using the
// separate max-side (first 3 banks) and min-side (last 3 banks) grids that
// mergeAcceleration leaves unmerged when continuous collision is enabled.
template <typename PrevPos, typename CurPos>
__device__ CuCollision::ShapeMask CuCollision::getShapeMask(const PrevPos& prevPos, const CurPos& curPos) const
{
	ShapeMask result;

	float prevX = prevPos.x * mGridScale[0] + mGridBias[0];
	float prevY = prevPos.y * mGridScale[1] + mGridBias[1];
	float prevZ = prevPos.z * mGridScale[2] + mGridBias[2];

	float curX = curPos.x * mGridScale[0] + mGridBias[0];
	float curY = curPos.y * mGridScale[1] + mGridBias[1];
	float curZ = curPos.z * mGridScale[2] + mGridBias[2];

	float maxX = min(max(prevX, curX), 7.0f);
	float maxY = min(max(prevY, curY), 7.0f);
	float maxZ = min(max(prevZ, curZ), 7.0f);

	result = readShapeMask(maxX, mShapeGrid);
	result &= readShapeMask(maxY, mShapeGrid + 8);
	result &= readShapeMask(maxZ, mShapeGrid + 16);

	float minX = max(min(prevX, curX), 0.0f);
	float minY = max(min(prevY, curY), 0.0f);
	float minZ = max(min(prevZ, curZ), 0.0f);

	result &= readShapeMask(minX, mShapeGrid + 24);
	result &= readShapeMask(minY, mShapeGrid + 32);
	result &= readShapeMask(minZ, mShapeGrid + 40);

	return result;
}

// Discrete capsule collision for one particle position. Tests candidate cones
// first (culling their end spheres from the sphere mask where the cone test
// already decides), then remaining spheres. Accumulates the un-normalized
// push-out vector in 'delta' and, if friction is on, the shape velocity at
// the contact in 'velocity'. Returns the number of contacts.
template <typename CurPos>
__device__ int32_t CuCollision::collideCapsules(const CurPos& positions, float3& delta, float3& velocity) const
{
	ShapeMask shapeMask = getShapeMask(positions);

	delta.x = delta.y = delta.z = 0.0f;
	velocity.x = velocity.y = velocity.z = 0.0f;

	int32_t numCollisions = 0;

	bool frictionEnabled = gClothData.mFrictionScale > 0.0f;

	// cone collision
	for(; shapeMask.mCones; shapeMask.mCones &= shapeMask.mCones - 1)
	{
		int32_t j = __ffs(shapeMask.mCones) - 1; // lowest set bit = cone index

		float deltaX = positions.x - mCurData.mConeCenterX[j];
		float deltaY = positions.y - mCurData.mConeCenterY[j];
		float deltaZ = positions.z - mCurData.mConeCenterZ[j];

		float axisX = mCurData.mConeAxisX[j];
		float axisY = mCurData.mConeAxisY[j];
		float axisZ = mCurData.mConeAxisZ[j];
		float slope = mCurData.mConeSlope[j];

		float dot = deltaX * axisX + deltaY * axisY + deltaZ * axisZ;
		float radius = max(dot * slope + mCurData.mConeRadius[j], 0.0f);
		float sqrDistance = deltaX * deltaX + deltaY * deltaY + deltaZ * deltaZ - dot * dot;

		Pointer<Shared, const uint32_t> mIt = mCapsuleMasks + 2 * j;
		uint32_t bothMask = mIt[1];

		if(sqrDistance > radius * radius)
		{
			// outside the infinite cone: its end spheres can't collide either
			shapeMask.mSpheres &= ~bothMask;
			continue;
		}

		sqrDistance = max(sqrDistance, FLT_EPSILON);
		float invDistance = rsqrtf(sqrDistance);

		float base = dot + slope * sqrDistance * invDistance;

		float halfLength = mCurData.mConeHalfLength[j];
		uint32_t leftMask = base < -halfLength;
		uint32_t rightMask = base > halfLength;

		// past an end cap: leave that end's sphere in the mask, cull the other
		uint32_t firstMask = mIt[0];
		uint32_t secondMask = firstMask ^ bothMask;

		shapeMask.mSpheres &= ~(firstMask & leftMask - 1);
		shapeMask.mSpheres &= ~(secondMask & rightMask - 1);

		if(!leftMask && !rightMask)
		{
			// inside the cone's lateral surface: push out along the surface normal
			deltaX = deltaX - base * axisX;
			deltaY = deltaY - base * axisY;
			deltaZ = deltaZ - base * axisZ;

			float sqrCosine = mCurData.mConeSqrCosine[j];
			float scale = radius * invDistance * sqrCosine - sqrCosine;

			delta.x = delta.x + deltaX * scale;
			delta.y = delta.y + deltaY * scale;
			delta.z = delta.z + deltaZ * scale;

			if(frictionEnabled)
			{
				int32_t s0 = mCapsuleIndices[2 * j];
				int32_t s1 = mCapsuleIndices[2 * j + 1];

				// load previous sphere pos
				float s0vx = mCurData.mSphereX[s0] - mPrevData.mSphereX[s0];
				float s0vy = mCurData.mSphereY[s0] - mPrevData.mSphereY[s0];
				float s0vz = mCurData.mSphereZ[s0] - mPrevData.mSphereZ[s0];

				float s1vx = mCurData.mSphereX[s1] - mPrevData.mSphereX[s1];
				float s1vy = mCurData.mSphereY[s1] - mPrevData.mSphereY[s1];
				float s1vz = mCurData.mSphereZ[s1] - mPrevData.mSphereZ[s1];

				// interpolate velocity between the two spheres
				float t = dot * 0.5f + 0.5f;

				velocity.x += s0vx + t * (s1vx - s0vx);
				velocity.y += s0vy + t * (s1vy - s0vy);
				velocity.z += s0vz + t * (s1vz - s0vz);
			}

			++numCollisions;
		}
	}

	// sphere collision
	for(; shapeMask.mSpheres; shapeMask.mSpheres &= shapeMask.mSpheres - 1)
	{
		int32_t j = __ffs(shapeMask.mSpheres) - 1;

		float deltaX = positions.x - mCurData.mSphereX[j];
		float deltaY = positions.y - mCurData.mSphereY[j];
		float deltaZ = positions.z - mCurData.mSphereZ[j];

		float sqrDistance = FLT_EPSILON + deltaX * deltaX + deltaY * deltaY + deltaZ * deltaZ;
		float relDistance = rsqrtf(sqrDistance) * mCurData.mSphereW[j];

		if(relDistance > 1.0f)
		{
			// inside the sphere: push out radially
			float scale = relDistance - 1.0f;

			delta.x = delta.x + deltaX * scale;
			delta.y = delta.y + deltaY * scale;
			delta.z = delta.z + deltaZ * scale;

			if(frictionEnabled)
			{
				velocity.x += mCurData.mSphereX[j] - mPrevData.mSphereX[j];
				velocity.y += mCurData.mSphereY[j] - mPrevData.mSphereY[j];
				velocity.z += mCurData.mSphereZ[j] - mPrevData.mSphereZ[j];
			}

			++numCollisions;
		}
	}

	return numCollisions;
}

// skin-depth threshold used by ccd to ignore grazing trajectories; equals
// (1 - 0.2)^2 - 1, i.e. a 20% skin of the shape radius
static const __device__ float gSkeletonWidth = (1 - 0.2f) * (1 - 0.2f) - 1;

// Continuous capsule collision: sweeps the particle from prevPos to curPos
// against the shapes swept from mPrevData to mCurData, solving a quadratic in
// normalized time for the first impact, then applies a reduced impulse and
// finishes with the discrete test at the (possibly moved) current position.
// May modify curPos. Returns the number of discrete contacts.
template <typename PrevPos, typename CurPos>
__device__ int32_t
CuCollision::collideCapsules(const PrevPos& prevPos, CurPos& curPos, float3& delta, float3& velocity) const
{
	ShapeMask shapeMask = getShapeMask(prevPos, curPos);

	delta.x = delta.y = delta.z = 0.0f;
	velocity.x = velocity.y = velocity.z = 0.0f;

	int32_t numCollisions = 0;
	bool frictionEnabled = gClothData.mFrictionScale > 0.0f;

	// cone collision
	for(; shapeMask.mCones; shapeMask.mCones &= shapeMask.mCones - 1)
	{
		int32_t j = __ffs(shapeMask.mCones) - 1;

		float prevAxisX = mPrevData.mConeAxisX[j];
		float prevAxisY = mPrevData.mConeAxisY[j];
		float prevAxisZ = mPrevData.mConeAxisZ[j];
		float prevSlope = mPrevData.mConeSlope[j];

		// particle position relative to the previous cone, and its cross
		// product with the axis (distance-from-axis components)
		float prevX = prevPos.x - mPrevData.mConeCenterX[j];
		float prevY = prevPos.y - mPrevData.mConeCenterY[j];
		float prevZ = prevPos.z - mPrevData.mConeCenterZ[j];
		float prevT = prevY * prevAxisZ - prevZ * prevAxisY;
		float prevU = prevZ * prevAxisX - prevX * prevAxisZ;
		float prevV = prevX * prevAxisY - prevY * prevAxisX;
		float prevDot = prevX * prevAxisX + prevY * prevAxisY + prevZ * prevAxisZ;
		// NOTE(review): reads mCurData.mConeRadius (not mPrevData) — looks
		// intentional to share the current radius, but confirm upstream.
		float prevRadius = max(prevDot * prevSlope + mCurData.mConeRadius[j], 0.0f);

		float curAxisX = mCurData.mConeAxisX[j];
		float curAxisY = mCurData.mConeAxisY[j];
		float curAxisZ = mCurData.mConeAxisZ[j];
		float curSlope = mCurData.mConeSlope[j];

		float curX = curPos.x - mCurData.mConeCenterX[j];
		float curY = curPos.y - mCurData.mConeCenterY[j];
		float curZ = curPos.z - mCurData.mConeCenterZ[j];
		float curT = curY * curAxisZ - curZ * curAxisY;
		float curU = curZ * curAxisX - curX * curAxisZ;
		float curV = curX * curAxisY - curY * curAxisX;
		float curDot = curX * curAxisX + curY * curAxisY + curZ * curAxisZ;
		float curRadius = max(curDot * curSlope + mCurData.mConeRadius[j], 0.0f);

		float curSqrDistance = FLT_EPSILON + curT * curT + curU * curU + curV * curV;

		// quadratic coefficients of |distance(t)|^2 - radius(t)^2 in normalized time
		float dotPrevPrev = prevT * prevT + prevU * prevU + prevV * prevV - prevRadius * prevRadius;
		float dotPrevCur = prevT * curT + prevU * curU + prevV * curV - prevRadius * curRadius;
		float dotCurCur = curSqrDistance - curRadius * curRadius;

		float discriminant = dotPrevCur * dotPrevCur - dotCurCur * dotPrevPrev;
		float sqrtD = sqrtf(discriminant);
		float halfB = dotPrevCur - dotPrevPrev;
		float minusA = dotPrevCur - dotCurCur + halfB;

		// time of impact or 0 if prevPos inside cone
		float toi = __fdividef(min(0.0f, halfB + sqrtD), minusA);
		bool hasCollision = toi < 1.0f && halfB < sqrtD;

		// skip continuous collision if the (un-clamped) particle
		// trajectory only touches the outer skin of the cone.
		float rMin = prevRadius + halfB * minusA * (curRadius - prevRadius);
		hasCollision = hasCollision && (discriminant > minusA * rMin * rMin * gSkeletonWidth);

		// a is negative when one cone is contained in the other,
		// which is already handled by discrete collision.
		hasCollision = hasCollision && minusA < -FLT_EPSILON;

		if(hasCollision)
		{
			float deltaX = prevX - curX;
			float deltaY = prevY - curY;
			float deltaZ = prevZ - curZ;

			// interpolate delta at toi
			float posX = prevX - deltaX * toi;
			float posY = prevY - deltaY * toi;
			float posZ = prevZ - deltaZ * toi;

			float curHalfLength = mCurData.mConeHalfLength[j];
			float curScaledAxisX = curAxisX * curHalfLength;
			float curScaledAxisY = curAxisY * curHalfLength;
			float curScaledAxisZ = curAxisZ * curHalfLength;

			float prevHalfLength = mPrevData.mConeHalfLength[j];
			float deltaScaledAxisX = curScaledAxisX - prevAxisX * prevHalfLength;
			float deltaScaledAxisY = curScaledAxisY - prevAxisY * prevHalfLength;
			float deltaScaledAxisZ = curScaledAxisZ - prevAxisZ * prevHalfLength;

			float oneMinusToi = 1.0f - toi;

			// interpolate axis at toi
			float axisX = curScaledAxisX - deltaScaledAxisX * oneMinusToi;
			float axisY = curScaledAxisY - deltaScaledAxisY * oneMinusToi;
			float axisZ = curScaledAxisZ - deltaScaledAxisZ * oneMinusToi;
			float slope = prevSlope * oneMinusToi + curSlope * toi;

			float sqrHalfLength = axisX * axisX + axisY * axisY + axisZ * axisZ;
			float invHalfLength = rsqrtf(sqrHalfLength);
			float dot = (posX * axisX + posY * axisY + posZ * axisZ) * invHalfLength;

			float sqrDistance = posX * posX + posY * posY + posZ * posZ - dot * dot;
			float invDistance = sqrDistance > 0.0f ? rsqrtf(sqrDistance) : 0.0f;

			float base = dot + slope * sqrDistance * invDistance;
			float scale = base * invHalfLength;

			// only react if the impact point lies within the cone's extent
			if(abs(scale) < 1.0f)
			{
				deltaX = deltaX + deltaScaledAxisX * scale;
				deltaY = deltaY + deltaScaledAxisY * scale;
				deltaZ = deltaZ + deltaScaledAxisZ * scale;

				// reduce ccd impulse if (clamped) particle trajectory stays in cone skin,
				// i.e. scale by exp2(-k) or 1/(1+k) with k = (tmin - toi) / (1 - toi)
				float minusK = __fdividef(sqrtD, minusA * oneMinusToi);
				oneMinusToi = __fdividef(oneMinusToi, 1 - minusK);

				curX = curX + deltaX * oneMinusToi;
				curY = curY + deltaY * oneMinusToi;
				curZ = curZ + deltaZ * oneMinusToi;

				// recompute discrete-test inputs at the moved position
				curDot = curX * curAxisX + curY * curAxisY + curZ * curAxisZ;
				curRadius = max(curDot * curSlope + mCurData.mConeRadius[j], 0.0f);
				curSqrDistance = curX * curX + curY * curY + curZ * curZ - curDot * curDot;

				curPos.x = mCurData.mConeCenterX[j] + curX;
				curPos.y = mCurData.mConeCenterY[j] + curY;
				curPos.z = mCurData.mConeCenterZ[j] + curZ;
			}
		}

		// curPos inside cone (discrete collision)
		bool hasContact = curRadius * curRadius > curSqrDistance;

		Pointer<Shared, const uint32_t> mIt = mCapsuleMasks + 2 * j;
		uint32_t bothMask = mIt[1];

		uint32_t cullMask = bothMask & (hasCollision | hasContact) - 1;
		shapeMask.mSpheres &= ~cullMask;

		if(!hasContact)
			continue;

		float invDistance = curSqrDistance > 0.0f ? rsqrtf(curSqrDistance) : 0.0f;
		float base = curDot + curSlope * curSqrDistance * invDistance;

		float halfLength = mCurData.mConeHalfLength[j];
		uint32_t leftMask = base < -halfLength;
		uint32_t rightMask = base > halfLength;

		// can only skip continuous sphere collision if post-ccd position
		// is on cone side *and* particle had cone-ccd collision.
		uint32_t firstMask = mIt[0];
		uint32_t secondMask = firstMask ^ bothMask;
		cullMask = (firstMask & leftMask - 1) | (secondMask & rightMask - 1);
		shapeMask.mSpheres &= ~cullMask | hasCollision - 1;

		if(!leftMask && !rightMask)
		{
			float deltaX = curX - base * curAxisX;
			float deltaY = curY - base * curAxisY;
			float deltaZ = curZ - base * curAxisZ;

			float sqrCosine = mCurData.mConeSqrCosine[j];
			float scale = curRadius * invDistance * sqrCosine - sqrCosine;

			delta.x = delta.x + deltaX * scale;
			delta.y = delta.y + deltaY * scale;
			delta.z = delta.z + deltaZ * scale;

			if(frictionEnabled)
			{
				int32_t s0 = mCapsuleIndices[2 * j];
				int32_t s1 = mCapsuleIndices[2 * j + 1];

				// load previous sphere pos
				float s0vx = mCurData.mSphereX[s0] - mPrevData.mSphereX[s0];
				float s0vy = mCurData.mSphereY[s0] - mPrevData.mSphereY[s0];
				float s0vz = mCurData.mSphereZ[s0] - mPrevData.mSphereZ[s0];

				float s1vx = mCurData.mSphereX[s1] - mPrevData.mSphereX[s1];
				float s1vy = mCurData.mSphereY[s1] - mPrevData.mSphereY[s1];
				float s1vz = mCurData.mSphereZ[s1] - mPrevData.mSphereZ[s1];

				// interpolate velocity between the two spheres
				float t = curDot * 0.5f + 0.5f;

				velocity.x += s0vx + t * (s1vx - s0vx);
				velocity.y += s0vy + t * (s1vy - s0vy);
				velocity.z += s0vz + t * (s1vz - s0vz);
			}

			++numCollisions;
		}
	}

	// sphere collision
	for(; shapeMask.mSpheres; shapeMask.mSpheres &= shapeMask.mSpheres - 1)
	{
		int32_t j = __ffs(shapeMask.mSpheres) - 1;

		float prevX = prevPos.x - mPrevData.mSphereX[j];
		float prevY = prevPos.y - mPrevData.mSphereY[j];
		float prevZ = prevPos.z - mPrevData.mSphereZ[j];
		float prevRadius = mPrevData.mSphereW[j];

		float curX = curPos.x - mCurData.mSphereX[j];
		float curY = curPos.y - mCurData.mSphereY[j];
		float curZ = curPos.z - mCurData.mSphereZ[j];
		float curRadius = mCurData.mSphereW[j];

		float sqrDistance = FLT_EPSILON + curX * curX + curY * curY + curZ * curZ;

		// same quadratic toi setup as the cone case, against a swept sphere
		float dotPrevPrev = prevX * prevX + prevY * prevY + prevZ * prevZ - prevRadius * prevRadius;
		float dotPrevCur = prevX * curX + prevY * curY + prevZ * curZ - prevRadius * curRadius;
		float dotCurCur = sqrDistance - curRadius * curRadius;

		float discriminant = dotPrevCur * dotPrevCur - dotCurCur * dotPrevPrev;
		float sqrtD = sqrtf(discriminant);
		float halfB = dotPrevCur - dotPrevPrev;
		float minusA = dotPrevCur - dotCurCur + halfB;

		// time of impact or 0 if prevPos inside sphere
		float toi = __fdividef(min(0.0f, halfB + sqrtD), minusA);
		bool hasCollision = toi < 1.0f && halfB < sqrtD;

		// skip continuous collision if the (un-clamped) particle
		// trajectory only touches the outer skin of the cone.
		float rMin = prevRadius + halfB * minusA * (curRadius - prevRadius);
		hasCollision = hasCollision && (discriminant > minusA * rMin * rMin * gSkeletonWidth);

		// a is negative when one cone is contained in the other,
		// which is already handled by discrete collision.
		hasCollision = hasCollision && minusA < -FLT_EPSILON;

		if(hasCollision)
		{
			float deltaX = prevX - curX;
			float deltaY = prevY - curY;
			float deltaZ = prevZ - curZ;

			float oneMinusToi = 1.0f - toi;

			// reduce ccd impulse if (clamped) particle trajectory stays in cone skin,
			// i.e. scale by exp2(-k) or 1/(1+k) with k = (tmin - toi) / (1 - toi)
			float minusK = __fdividef(sqrtD, minusA * oneMinusToi);
			oneMinusToi = __fdividef(oneMinusToi, 1 - minusK);

			curX = curX + deltaX * oneMinusToi;
			curY = curY + deltaY * oneMinusToi;
			curZ = curZ + deltaZ * oneMinusToi;

			curPos.x = mCurData.mSphereX[j] + curX;
			curPos.y = mCurData.mSphereY[j] + curY;
			curPos.z = mCurData.mSphereZ[j] + curZ;

			sqrDistance = FLT_EPSILON + curX * curX + curY * curY + curZ * curZ;
		}

		float relDistance = rsqrtf(sqrDistance) * curRadius;

		if(relDistance > 1.0f)
		{
			float scale = relDistance - 1.0f;

			delta.x = delta.x + curX * scale;
			delta.y = delta.y + curY * scale;
			delta.z = delta.z + curZ * scale;

			if(frictionEnabled)
			{
				velocity.x += mCurData.mSphereX[j] - mPrevData.mSphereX[j];
				velocity.y += mCurData.mSphereY[j] - mPrevData.mSphereY[j];
				velocity.z += mCurData.mSphereZ[j] - mPrevData.mSphereZ[j];
			}

			++numCollisions;
		}
	}

	return numCollisions;
}

namespace
{
// Coulomb-style friction impulse: project the particle velocity relative to
// the shape onto the tangent plane of the collision impulse and oppose it,
// capped at the full tangential velocity (j >= -1). 'scale' is 1/numCollisions
// so accumulated delta/velocity are averaged.
template <typename PrevPos, typename CurPos>
__device__ inline float3 calcFrictionImpulse(const PrevPos& prevPos, const CurPos& curPos, const float3& shapeVelocity,
                                             float scale, const float3& collisionImpulse)
{
	const float frictionScale = gClothData.mFrictionScale;

	// calculate collision normal
	float deltaSq = collisionImpulse.x * collisionImpulse.x + collisionImpulse.y * collisionImpulse.y +
	                collisionImpulse.z * collisionImpulse.z;

	float rcpDelta = rsqrtf(deltaSq + FLT_EPSILON);

	float nx = collisionImpulse.x * rcpDelta;
	float ny = collisionImpulse.y * rcpDelta;
	float nz = collisionImpulse.z * rcpDelta;

	// calculate relative velocity scaled by number of collision
	float rvx = curPos.x - prevPos.x - shapeVelocity.x * scale;
	float rvy = curPos.y - prevPos.y - shapeVelocity.y * scale;
	float rvz = curPos.z - prevPos.z - shapeVelocity.z * scale;

	// calculate magnitude of relative normal velocity
	float rvn = rvx * nx + rvy * ny + rvz * nz;

	// calculate relative tangential velocity
	float rvtx = rvx - rvn * nx;
	float rvty = rvy - rvn * ny;
	float rvtz = rvz - rvn * nz;

	// calculate magnitude of vt
	float rcpVt = rsqrtf(rvtx * rvtx + rvty * rvty + rvtz * rvtz + FLT_EPSILON);

	// magnitude of friction impulse (cannot be larger than -|vt|)
	float j = max(-frictionScale * deltaSq * rcpDelta * scale * rcpVt, -1.0f);

	return make_float3(rvtx * j, rvty * j, rvtz * j);
}
}

// Discrete capsule collision over all particles: applies the averaged push-out
// delta to the current position, friction to the previous position (changing
// the implied velocity), and optionally scales down inverse mass (w) by the
// squared correction to stiffen colliding particles.
template <typename CurrentT, typename PreviousT>
__device__ void CuCollision::collideCapsules(CurrentT& current, PreviousT& previous) const
{
	bool frictionEnabled = gClothData.mFrictionScale > 0.0f;
	bool massScaleEnabled = gClothData.mCollisionMassScale > 0.0f;

	for(int32_t i = threadIdx.x; i < gClothData.mNumParticles; i += blockDim.x)
	{
		typename CurrentT::VectorType curPos = current(i);

		float3 delta, velocity;
		if(int32_t numCollisions = collideCapsules(curPos, delta, velocity))
		{
			float scale = __fdividef(1.0f, numCollisions);

			if(frictionEnabled)
			{
				typename PreviousT::VectorType prevPos = previous(i);
				float3 frictionImpulse = calcFrictionImpulse(prevPos, curPos, velocity, scale, delta);

				prevPos.x -= frictionImpulse.x;
				prevPos.y -= frictionImpulse.y;
				prevPos.z -= frictionImpulse.z;

				previous(i) = prevPos;
			}

			curPos.x += delta.x * scale;
			curPos.y += delta.y * scale;
			curPos.z += delta.z * scale;

			current(i) = curPos;

			if(massScaleEnabled)
			{
				float deltaLengthSq = delta.x * delta.x + delta.y * delta.y + delta.z * delta.z;
				float massScale = 1.0f + gClothData.mCollisionMassScale * deltaLengthSq;
				current(i, 3) = __fdividef(current(i, 3), massScale);
			}
		}
	}
}

namespace
{
// barycentric interpolation of one coordinate over three particles
template <typename PointerT>
__device__ float lerp(PointerT pos, const int4& indices, const float4& weights)
{
	return pos[indices.x] * weights.x + pos[indices.y] * weights.y + pos[indices.z] * weights.z;
}

// distribute a scalar correction back to the three particles, weighted
template <typename PointerT>
__device__ void apply(PointerT pos, const int4& indices, const float4& weights, float delta)
{
	pos[indices.x] += delta * weights.x;
	pos[indices.y] += delta * weights.y;
	pos[indices.z] += delta * weights.z;
}
}

// Collide virtual particles (barycentric points on triangles, indices.w picks
// the weight set) against capsules and distribute corrections back to the
// three real particles. Sets are processed one at a time with a barrier
// between them; presumably particles within a set don't share indices so the
// unsynchronized += updates inside a set are safe — confirm against the setup
// code that builds the sets.
template <typename CurrentT, typename PreviousT>
__device__ void CuCollision::collideVirtualCapsules(CurrentT& current, PreviousT& previous) const
{
	const uint32_t* __restrict setSizeIt = gClothData.mVirtualParticleSetSizesBegin;

	if(!setSizeIt)
		return;

	if(gClothData.mEnableContinuousCollision)
	{
		// copied from mergeAcceleration: ccd left the min/max grids unmerged,
		// but the discrete test used here needs the merged overlap masks
		Pointer<Shared, uint32_t> dst = mShapeGrid + threadIdx.x;
		if(!(threadIdx.x * 43 & 1024) && threadIdx.x < sGridSize * 12)
			*dst &= dst[sGridSize * 3];
		__syncthreads(); // mShapeGrid raw hazard
	}

	const uint32_t* __restrict setSizeEnd = gClothData.mVirtualParticleSetSizesEnd;
	const uint16_t* __restrict indicesEnd = gClothData.mVirtualParticleIndices;
	const float4* __restrict weightsIt = reinterpret_cast<const float4*>(gClothData.mVirtualParticleWeights);

	bool frictionEnabled = gClothData.mFrictionScale > 0.0f;
	bool massScaleEnabled = gClothData.mCollisionMassScale > 0.0f;

	for(; setSizeIt != setSizeEnd; ++setSizeIt)
	{
		__syncthreads(); // finish previous set before mutating shared particles

		const uint16_t* __restrict indicesIt = indicesEnd + threadIdx.x * 4;
		for(indicesEnd += *setSizeIt * 4; indicesIt < indicesEnd; indicesIt += blockDim.x * 4)
		{
			int4 indices = make_int4(indicesIt[0], indicesIt[1], indicesIt[2], indicesIt[3]);

			float4 weights = weightsIt[indices.w];

			float3 curPos;
			curPos.x = lerp(current[0], indices, weights);
			curPos.y = lerp(current[1], indices, weights);
			curPos.z = lerp(current[2], indices, weights);

			float3 delta, velocity;
			if(int32_t numCollisions = collideCapsules(curPos, delta, velocity))
			{
				float scale = __fdividef(1.0f, numCollisions);
				float wscale = weights.w * scale;

				apply(current[0], indices, weights, delta.x * wscale);
				apply(current[1], indices, weights, delta.y * wscale);
				apply(current[2], indices, weights, delta.z * wscale);

				if(frictionEnabled)
				{
					float3 prevPos;
					prevPos.x = lerp(previous[0], indices, weights);
					prevPos.y = lerp(previous[1], indices, weights);
					prevPos.z = lerp(previous[2], indices, weights);

					float3 frictionImpulse = calcFrictionImpulse(prevPos, curPos, velocity, scale, delta);

					apply(previous[0], indices, weights, frictionImpulse.x * -weights.w);
					apply(previous[1], indices, weights, frictionImpulse.y * -weights.w);
					apply(previous[2], indices, weights, frictionImpulse.z * -weights.w);
				}

				if(massScaleEnabled)
				{
					float deltaLengthSq = (delta.x * delta.x + delta.y * delta.y + delta.z * delta.z) * scale * scale;
					float invMassScale = __fdividef(1.0f, 1.0f + gClothData.mCollisionMassScale * deltaLengthSq);

					// not multiplying by weights[3] here because unlike applying velocity
					// deltas where we want the interpolated position to obtain a particular
					// value, we instead just require that the total change is equal to invMassScale
					invMassScale = invMassScale - 1.0f;
					current(indices.x, 3) *= 1.0f + weights.x * invMassScale;
					current(indices.y, 3) *= 1.0f + weights.y * invMassScale;
					current(indices.z, 3) *= 1.0f + weights.z * invMassScale;
				}
			}
		}
	}
}

// Continuous capsule collision over all particles (ccd variant of
// collideCapsules(current, previous) above).
template <typename CurrentT, typename PreviousT>
__device__ void CuCollision::collideContinuousCapsules(CurrentT& current, PreviousT& previous) const
{
	bool frictionEnabled = gClothData.mFrictionScale > 0.0f;
	bool massScaleEnabled = gClothData.mCollisionMassScale > 0.0f;

	for(int32_t i = threadIdx.x; i < gClothData.mNumParticles; i += blockDim.x)
	{
		typename PreviousT::VectorType prevPos = previous(i);
		typename CurrentT::VectorType curPos = current(i);

		float3 delta, velocity;
		if(int32_t numCollisions = collideCapsules(prevPos, curPos, delta, velocity))
		{
			float scale = __fdividef(1.0f, numCollisions);

			if(frictionEnabled)
			{
				float3 frictionImpulse = calcFrictionImpulse(prevPos, curPos, velocity,
scale, delta); + + prevPos.x -= frictionImpulse.x; + prevPos.y -= frictionImpulse.y; + prevPos.z -= frictionImpulse.z; + + previous(i) = prevPos; + } + + curPos.x += delta.x * scale; + curPos.y += delta.y * scale; + curPos.z += delta.z * scale; + + current(i) = curPos; + + if(massScaleEnabled) + { + float deltaLengthSq = delta.x * delta.x + delta.y * delta.y + delta.z * delta.z; + float massScale = 1.0f + gClothData.mCollisionMassScale * deltaLengthSq; + current(i, 3) = __fdividef(current(i, 3), massScale); + } + } + } +} + +template <typename CurPos> +__device__ int32_t CuCollision::collideConvexes(const CurPos& positions, float3& delta) const +{ + delta.x = delta.y = delta.z = 0.0f; + + Pointer<Shared, const float> planeX = mCurData.mSphereX; + Pointer<Shared, const float> planeY = planeX + gClothData.mNumPlanes; + Pointer<Shared, const float> planeZ = planeY + gClothData.mNumPlanes; + Pointer<Shared, const float> planeW = planeZ + gClothData.mNumPlanes; + + int32_t numCollisions = 0; + Pointer<Shared, const uint32_t> cIt = mConvexMasks; + Pointer<Shared, const uint32_t> cEnd = cIt + gClothData.mNumConvexes; + for(; cIt != cEnd; ++cIt) + { + uint32_t mask = *cIt; + + int32_t maxIndex = __ffs(mask) - 1; + float maxDist = planeW[maxIndex] + positions.z * planeZ[maxIndex] + positions.y * planeY[maxIndex] + + positions.x * planeX[maxIndex]; + + while((maxDist < 0.0f) && (mask &= mask - 1)) + { + int32_t i = __ffs(mask) - 1; + float dist = planeW[i] + positions.z * planeZ[i] + positions.y * planeY[i] + positions.x * planeX[i]; + if(dist > maxDist) + maxDist = dist, maxIndex = i; + } + + if(maxDist < 0.0f) + { + delta.x -= planeX[maxIndex] * maxDist; + delta.y -= planeY[maxIndex] * maxDist; + delta.z -= planeZ[maxIndex] * maxDist; + + ++numCollisions; + } + } + + return numCollisions; +} + +template <typename CurrentT, typename PreviousT> +__device__ void CuCollision::collideConvexes(CurrentT& current, PreviousT& previous, float alpha) +{ + if(!gClothData.mNumConvexes) 
+ return; + + // interpolate planes and transpose + if(threadIdx.x < gClothData.mNumPlanes * 4) + { + float start = gFrameData.mStartCollisionPlanes[threadIdx.x]; + float target = gFrameData.mTargetCollisionPlanes[threadIdx.x]; + int32_t j = threadIdx.x % 4 * gClothData.mNumPlanes + threadIdx.x / 4; + mCurData.mSphereX[j] = start + (target - start) * alpha; + } + + __syncthreads(); + + bool frictionEnabled = gClothData.mFrictionScale > 0.0f; + + for(int32_t i = threadIdx.x; i < gClothData.mNumParticles; i += blockDim.x) + { + typename CurrentT::VectorType curPos = current(i); + + float3 delta; + if(int32_t numCollisions = collideConvexes(curPos, delta)) + { + float scale = __fdividef(1.0f, numCollisions); + + if(frictionEnabled) + { + typename PreviousT::VectorType prevPos = previous(i); + + float3 frictionImpulse = + calcFrictionImpulse(prevPos, curPos, make_float3(0.0f, 0.0f, 0.0f), scale, delta); + + prevPos.x -= frictionImpulse.x; + prevPos.y -= frictionImpulse.y; + prevPos.z -= frictionImpulse.z; + + previous(i) = prevPos; + } + + curPos.x += delta.x * scale; + curPos.y += delta.y * scale; + curPos.z += delta.z * scale; + + current(i) = curPos; + } + } + + __syncthreads(); +} + +namespace +{ +struct TriangleData +{ + float baseX, baseY, baseZ; + float edge0X, edge0Y, edge0Z; + float edge1X, edge1Y, edge1Z; + float normalX, normalY, normalZ; + + float edge0DotEdge1; + float edge0SqrLength; + float edge1SqrLength; + + float det; + float denom; + + float edge0InvSqrLength; + float edge1InvSqrLength; + + // initialize struct after vertices have been stored in first 9 members + __device__ void initialize() + { + edge0X -= baseX, edge0Y -= baseY, edge0Z -= baseZ; + edge1X -= baseX, edge1Y -= baseY, edge1Z -= baseZ; + + normalX = edge0Y * edge1Z - edge0Z * edge1Y; + normalY = edge0Z * edge1X - edge0X * edge1Z; + normalZ = edge0X * edge1Y - edge0Y * edge1X; + + float normalInvLength = rsqrtf(normalX * normalX + normalY * normalY + normalZ * normalZ); + normalX *= 
normalInvLength; + normalY *= normalInvLength; + normalZ *= normalInvLength; + + edge0DotEdge1 = edge0X * edge1X + edge0Y * edge1Y + edge0Z * edge1Z; + edge0SqrLength = edge0X * edge0X + edge0Y * edge0Y + edge0Z * edge0Z; + edge1SqrLength = edge1X * edge1X + edge1Y * edge1Y + edge1Z * edge1Z; + + det = __fdividef(1.0f, edge0SqrLength * edge1SqrLength - edge0DotEdge1 * edge0DotEdge1); + denom = __fdividef(1.0f, edge0SqrLength + edge1SqrLength - edge0DotEdge1 - edge0DotEdge1); + + edge0InvSqrLength = __fdividef(1.0f, edge0SqrLength); + edge1InvSqrLength = __fdividef(1.0f, edge1SqrLength); + } +}; +} + +template <typename CurrentT> +__device__ void CuCollision::collideTriangles(CurrentT& current, int32_t i) +{ + float posX = current(i, 0); + float posY = current(i, 1); + float posZ = current(i, 2); + + const TriangleData* __restrict tIt = reinterpret_cast<const TriangleData*>(generic(mCurData.mSphereX)); + const TriangleData* __restrict tEnd = tIt + gClothData.mNumCollisionTriangles; + + float normalX, normalY, normalZ, normalD = 0.0f; + float minSqrLength = FLT_MAX; + + for(; tIt != tEnd; ++tIt) + { + float dx = posX - tIt->baseX; + float dy = posY - tIt->baseY; + float dz = posZ - tIt->baseZ; + + float deltaDotEdge0 = dx * tIt->edge0X + dy * tIt->edge0Y + dz * tIt->edge0Z; + float deltaDotEdge1 = dx * tIt->edge1X + dy * tIt->edge1Y + dz * tIt->edge1Z; + float deltaDotNormal = dx * tIt->normalX + dy * tIt->normalY + dz * tIt->normalZ; + + float s = tIt->edge1SqrLength * deltaDotEdge0 - tIt->edge0DotEdge1 * deltaDotEdge1; + float t = tIt->edge0SqrLength * deltaDotEdge1 - tIt->edge0DotEdge1 * deltaDotEdge0; + + s = t > 0.0f ? s * tIt->det : deltaDotEdge0 * tIt->edge0InvSqrLength; + t = s > 0.0f ? 
t * tIt->det : deltaDotEdge1 * tIt->edge1InvSqrLength; + + if(s + t > 1.0f) + { + s = (tIt->edge1SqrLength - tIt->edge0DotEdge1 + deltaDotEdge0 - deltaDotEdge1) * tIt->denom; + } + + s = fmaxf(0.0f, fminf(1.0f, s)); + t = fmaxf(0.0f, fminf(1.0f - s, t)); + + dx = dx - tIt->edge0X * s - tIt->edge1X * t; + dy = dy - tIt->edge0Y * s - tIt->edge1Y * t; + dz = dz - tIt->edge0Z * s - tIt->edge1Z * t; + + float sqrLength = dx * dx + dy * dy + dz * dz; + + if(0.0f > deltaDotNormal) + sqrLength *= 1.0001f; + + if(sqrLength < minSqrLength) + { + normalX = tIt->normalX; + normalY = tIt->normalY; + normalZ = tIt->normalZ; + normalD = deltaDotNormal; + minSqrLength = sqrLength; + } + } + + if(normalD < 0.0f) + { + current(i, 0) = posX - normalX * normalD; + current(i, 1) = posY - normalY * normalD; + current(i, 2) = posZ - normalZ * normalD; + } +} + +namespace +{ +static const int32_t sTrianglePadding = sizeof(TriangleData) / sizeof(float) - 9; +} + +template <typename CurrentT> +__device__ void CuCollision::collideTriangles(CurrentT& current, float alpha) +{ + if(!gClothData.mNumCollisionTriangles) + return; + + // interpolate triangle vertices and store in shared memory + for(int32_t i = threadIdx.x, n = gClothData.mNumCollisionTriangles * 9; i < n; i += blockDim.x) + { + float start = gFrameData.mStartCollisionTriangles[i]; + float target = gFrameData.mTargetCollisionTriangles[i]; + int32_t idx = i * 7282 >> 16; // same as i/9 + int32_t offset = i + idx * sTrianglePadding; + mCurData.mSphereX[offset] = start + (target - start) * alpha; + } + + __syncthreads(); + + for(int32_t i = threadIdx.x; i < gClothData.mNumCollisionTriangles; i += blockDim.x) + { + reinterpret_cast<TriangleData*>(generic(mCurData.mSphereX))[i].initialize(); + } + + __syncthreads(); + + for(int32_t i = threadIdx.x; i < gClothData.mNumParticles; i += blockDim.x) + collideTriangles(current, i); + + __syncthreads(); +} diff --git a/PhysX_3.4/Source/LowLevelCloth/src/windows/CuContextLock.cpp 
b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuContextLock.cpp new file mode 100644 index 00000000..2ccc3db9 --- /dev/null +++ b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuContextLock.cpp @@ -0,0 +1,54 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 

#include "cudamanager/PxCudaContextManager.h"
#include "CuContextLock.h"
#include "CuFactory.h"

using namespace physx;

// RAII: constructing a CuContextLock acquires the factory's CUDA context,
// destroying it releases the context again.
cloth::CuContextLock::CuContextLock(const CuFactory& factory) : mFactory(factory)
{
	acquire();
}

cloth::CuContextLock::~CuContextLock()
{
	release();
}

// Delegates to PxCudaContextManager::acquireContext(). Exposed so derived
// classes can re-acquire after an explicit release() (see CuFabric).
void cloth::CuContextLock::acquire()
{
	mFactory.mContextManager->acquireContext();
}

// Delegates to PxCudaContextManager::releaseContext(). Callers pairing this
// with acquire() must keep the two balanced.
void cloth::CuContextLock::release()
{
	mFactory.mContextManager->releaseContext();
}
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/windows/CuContextLock.h b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuContextLock.h new file mode 100644 index 00000000..50e48b49 --- /dev/null +++ b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuContextLock.h @@ -0,0 +1,57 @@
// This code contains NVIDIA Confidential Information and is disclosed to you
// under a form of NVIDIA software license agreement provided separately to you.
//
// Notice
// NVIDIA Corporation and its licensors retain all intellectual property and
// proprietary rights in and to this software and related documentation and
// any modifications thereto. Any use, reproduction, disclosure, or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA Corporation is strictly prohibited.
//
// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
//
// Information and code furnished is believed to be accurate and reliable.
// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
// information or for any infringement of patents or other rights of third parties that may
// result from its use.
No license is granted by implication or otherwise under any patent
// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
// This code supersedes and replaces all information previously supplied.
// NVIDIA Corporation products are not authorized for use as critical
// components in life support devices or systems without express written approval of
// NVIDIA Corporation.
//
// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.

#pragma once

namespace physx
{

namespace cloth
{

class CuFactory;

// acquires cuda context for the lifetime of the instance
class CuContextLock
{
  protected:
	// declared but intentionally not defined: pre-C++11 idiom making the
	// lock non-copyable (copying would unbalance acquire/release)
	CuContextLock(const CuContextLock&);
	CuContextLock& operator=(const CuContextLock&);

  public:
	CuContextLock(const CuFactory&);
	~CuContextLock();

	// manual re-acquire / early-release; must stay balanced with the
	// implicit acquire in the constructor and release in the destructor
	void acquire();
	void release();

	// factory whose PxCudaContextManager supplies the CUDA context;
	// public const reference so derived classes and helpers can reach it
	const CuFactory& mFactory;
};
}
}
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/windows/CuDevicePointer.h b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuDevicePointer.h new file mode 100644 index 00000000..cb37b39d --- /dev/null +++ b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuDevicePointer.h @@ -0,0 +1,216 @@
// This code contains NVIDIA Confidential Information and is disclosed to you
// under a form of NVIDIA software license agreement provided separately to you.
//
// Notice
// NVIDIA Corporation and its licensors retain all intellectual property and
// proprietary rights in and to this software and related documentation and
// any modifications thereto. Any use, reproduction, disclosure, or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA Corporation is strictly prohibited.
//
// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.".
NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 

#pragma once

#include <cuda.h>
#include "CuCheckSuccess.h"

namespace physx
{

namespace cloth
{

// strip a top-level const qualifier (pre-C++11 stand-in for std::remove_const)
template <typename T>
struct RemoveConst
{
	typedef T Type;
};
template <typename T>
struct RemoveConst<const T>
{
	typedef T Type;
};

template <typename>
class CuDeviceReference; // forward declare

// pointer to POD type in CUDA device memory
// Behaves like a raw pointer (arithmetic, comparison) on the host side;
// dereferencing yields a CuDeviceReference proxy that performs the actual
// device memory transfer.
template <typename T>
class CuDevicePointer
{
	template <typename>
	friend class CuDevicePointer;

	typedef typename RemoveConst<T>::Type ValueType;

  public:
	// c'tors
	CuDevicePointer() : mPointer(0)
	{
	}
	template <class U>
	explicit CuDevicePointer(U* ptr)
	: mPointer(ptr)
	{
	}
	// allow constructing a pointer-to-const from a pointer-to-mutable
	CuDevicePointer(const CuDevicePointer<ValueType>& ptr) : mPointer(ptr.get())
	{
	}

	// conversion
	template <typename U>
	operator CuDevicePointer<U>(void) const
	{
		return CuDevicePointer<U>(static_cast<U*>(mPointer));
	}
	T* get() const
	{
		return mPointer;
	}
	// raw device address for CUDA driver API calls
	// NOTE(review): reinterprets the stored pointer value as a CUdeviceptr —
	// presumably relies on unified/identical addressing; confirm against the
	// allocator that produces these pointers.
	CUdeviceptr dev() const
	{
		return reinterpret_cast<CUdeviceptr>(mPointer);
	}

	// operators (standard pointer arithmetic on the host-side value)
	CuDevicePointer operator+(const ptrdiff_t& rhs) const
	{
		return CuDevicePointer(mPointer + rhs);
	}
	CuDevicePointer operator-(const ptrdiff_t& rhs) const
	{
		return CuDevicePointer(mPointer - rhs);
	}
	CuDevicePointer& operator++(void)
	{
		++mPointer;
		return *this;
	}
	CuDevicePointer operator++(int)
	{
		CuDevicePointer copy(*this);
		++(*this);
		return copy;
	}
	CuDevicePointer& operator--(void)
	{
		--mPointer;
		return *this;
	}
	CuDevicePointer operator--(int)
	{
		CuDevicePointer copy(*this);
		--(*this);
		return copy;
	}
	CuDevicePointer& operator+=(ptrdiff_t rhs)
	{
		mPointer += rhs;
		return *this;
	}
	CuDevicePointer& operator-=(ptrdiff_t rhs)
	{
		mPointer -= rhs;
		return *this;
	}
	ptrdiff_t operator-(const CuDevicePointer& rhs) const
	{
		return mPointer - rhs.mPointer;
	}

	template <typename U>
	bool operator==(const CuDevicePointer<U>& other) const
	{
		return mPointer == other.mPointer;
	}
	template <typename U>
	bool operator!=(const CuDevicePointer<U>& other) const
	{
		return mPointer != other.mPointer;
	}

	// dereference
	CuDeviceReference<T> operator[](const ptrdiff_t&) const; // (implemented below)
	CuDeviceReference<T> operator*(void) const
	{
		return operator[](0);
	}

  private:
	T* mPointer;
};

// proxy returned when dereferencing a CuDevicePointer: assignment copies the
// value host->device (or device->device), reading copies device->host.
// Each access is a synchronous driver-API memcpy of sizeof(T) bytes.
template <typename T>
class CuDeviceReference
{
	template <typename>
	friend class CuDeviceReference;
	template <typename>
	friend class CuDevicePointer;

	typedef typename RemoveConst<T>::Type ValueType;

	template <typename U>
	CuDeviceReference(CuDevicePointer<U> pointer)
	: mPointer(static_cast<T*>(pointer.get()))
	{
	}

  public:
	template <typename U>
	CuDeviceReference(CuDeviceReference<U> reference)
	: mPointer(static_cast<T*>(reference.mPointer))
	{
	}

	CuDevicePointer<T> operator&() const
	{
		return CuDevicePointer<T>(mPointer);
	}

	// write: host value -> device
	CuDeviceReference& operator=(const T& v)
	{
		checkSuccess(cuMemcpyHtoD(CUdeviceptr(mPointer), &v, sizeof(T)));
		return *this;
	}
	// copy: device -> device (no host round trip)
	CuDeviceReference& operator=(const CuDeviceReference& ref)
	{
		checkSuccess(cuMemcpyDtoD(CUdeviceptr(mPointer), CUdeviceptr(ref.mPointer), sizeof(T)));
		return *this;
	}
	// read: device -> host
	operator ValueType() const
	{
		ValueType result;
		checkSuccess(cuMemcpyDtoH(&result, CUdeviceptr(mPointer), sizeof(T)));
		return result;
	}

  private:
	T* mPointer;
};
}

template <typename T>
cloth::CuDeviceReference<T> cloth::CuDevicePointer<T>::operator[](const ptrdiff_t& i) const
{
	return CuDeviceReference<T>(*this + i);
}
}
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/windows/CuDeviceVector.h b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuDeviceVector.h new file mode 100644 index 00000000..e3997d26 --- /dev/null +++ b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuDeviceVector.h @@ -0,0 +1,258 @@
// This code contains NVIDIA Confidential Information and is disclosed to you
// under a form of NVIDIA software
license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 
+ +#pragma once + +#include "foundation/PxMath.h" // for swap +#include "cudamanager/PxCudaMemoryManager.h" +#include "cudamanager/PxCudaContextManager.h" +#include "CuDevicePointer.h" +#include "PsArray.h" +#include "PsUtilities.h" + +namespace physx +{ +#if PX_VC +#pragma warning(push) +#pragma warning(disable : 4365) // 'action' : conversion from 'type_1' to 'type_2', signed/unsigned mismatch +#endif + +namespace cloth +{ + +// STL-style vector that holds POD types in CUDA device memory. The interface +// is not complete, add whatever you need from the std::vector interface. +template <typename T> +class CuDeviceVector +{ + public: + typedef CuDevicePointer<T> iterator; + typedef CuDevicePointer<const T> const_iterator; + + CuDeviceVector(physx::PxCudaContextManager* ctx) : mManager(0) + { + PX_ASSERT(ctx); + + if(ctx) + mManager = ctx->getMemoryManager(); + } + + CuDeviceVector(const CuDeviceVector& other) : mManager(other.getMemoryManager()) + { + PX_ASSERT(mManager); + + operator=(other); + } + + CuDeviceVector(physx::PxCudaContextManager* ctx, const T* first, const T* last) : mManager(0) + { + PX_ASSERT(ctx); + + if(ctx) + { + mManager = ctx->getMemoryManager(); + assign(first, last); + } + } + + template <typename Alloc> + CuDeviceVector(const shdfnd::Array<T, Alloc>& other) + { + operator=(other); + } + + ~CuDeviceVector() + { + PX_ASSERT(mManager); + + mManager->free(physx::PxCudaBufferMemorySpace::T_GPU, mFirst.dev()); + } + + CuDeviceVector& operator=(const CuDeviceVector& other) + { + resize(other.size()); + checkSuccess(cuMemcpyDtoD(mFirst.dev(), other.mFirst.dev(), other.size() * sizeof(T))); + return *this; + } + + template <typename Alloc> + CuDeviceVector& operator=(const shdfnd::Array<T, Alloc>& other) + { + const T* first = other.empty() ? 
0 : &other.front(); + assign(first, first + other.size()); + return *this; + } + + bool empty() const + { + return mLast == mFirst; + } + size_t size() const + { + return size_t(mLast - mFirst); + } + size_t capacity() const + { + return mEnd - mFirst; + } + + iterator begin() + { + return mFirst; + } + iterator end() + { + return mLast; + } + const_iterator begin() const + { + return mFirst; + } + const_iterator end() const + { + return mLast; + } + + void push_back(const T& v) + { + if(mLast == mEnd) + reserve(PxMax<size_t>(1, capacity() * 2)); + + *mLast++ = v; + } + + void push_back(const T* first, const T* last) + { + if(mEnd - mLast < last - first) + reserve(PxMax<size_t>(2 * capacity(), mLast - mFirst + last - first)); + + if(first != last) + checkSuccess(cuMemcpyHtoD(mLast.dev(), first, sizeof(T) * (last - first))); + + mLast += last - first; + } + + void erase(iterator it) + { + size_t byteSize = (mLast - it - 1) * sizeof(T); + if(byteSize) + { + CUdeviceptr tmp = 0, dst = it.dev(); + + PX_ASSERT(mManager); + + tmp = mManager->alloc(physx::PxCudaBufferMemorySpace::T_GPU, byteSize, + PX_ALLOC_INFO("cloth::CuDeviceVector::T_GPU", CLOTH)); + checkSuccess(cuMemcpyDtoD(tmp, dst + sizeof(T), byteSize)); + checkSuccess(cuMemcpyDtoD(dst, tmp, byteSize)); + mManager->free(physx::PxCudaBufferMemorySpace::T_GPU, tmp); + } + --mLast; + } + + void reserve(size_t n) + { + if(n <= capacity()) + return; + + CUdeviceptr newFirst = 0, oldFirst = mFirst.dev(); + + PX_ASSERT(mManager); + + newFirst = mManager->alloc(physx::PxCudaBufferMemorySpace::T_GPU, sizeof(T) * n, + PX_ALLOC_INFO("cloth::CuDeviceVector::T_GPU", CLOTH)); + checkSuccess(cuMemcpyDtoD(newFirst, oldFirst, sizeof(T) * size())); + mManager->free(physx::PxCudaBufferMemorySpace::T_GPU, oldFirst); + + iterator first(reinterpret_cast<T*>(newFirst)); + mEnd = first + n; + mLast = first + size(); + mFirst = first; + } + + void resize(size_t n) + { + if(capacity() < n) + reserve(PxMax(n, capacity() * 2)); + + mLast = 
mFirst + n; + } + + void assign(const T* first, const T* last) + { + size_t n = last - first; + resize(n); + checkSuccess(cuMemcpyHtoD(mFirst.dev(), first, n * sizeof(T))); + } + + void swap(CuDeviceVector& other) + { + shdfnd::swap(mFirst, other.mFirst); + shdfnd::swap(mLast, other.mLast); + shdfnd::swap(mEnd, other.mEnd); + } + + // match PxArray interface + void remove(size_t i) + { + erase(begin() + i); + } + void pushBack(const T& v) + { + push_back(v); + } + + physx::PxCudaMemoryManager* getMemoryManager() const + { + return mManager; + } + + private: + iterator mFirst, mLast, mEnd; + physx::PxCudaMemoryManager* mManager; +}; + +} // namespace cloth +} // namespace physx + +#if PX_VC +#pragma warning(pop) +#endif + +namespace physx +{ +namespace shdfnd +{ +template <typename T> +void swap(physx::cloth::CuDeviceVector<T>& first, physx::cloth::CuDeviceVector<T>& second) +{ + first.swap(second); +} +} +} diff --git a/PhysX_3.4/Source/LowLevelCloth/src/windows/CuFabric.cpp b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuFabric.cpp new file mode 100644 index 00000000..7f8326fe --- /dev/null +++ b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuFabric.cpp @@ -0,0 +1,197 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". 
NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 

#include "CuFabric.h"
#include "CuContextLock.h"
#include "CuFactory.h"

#if PX_VC
#pragma warning(disable : 4365) // 'action' : conversion from 'type_1' to 'type_2', signed/unsigned mismatch
#endif

using namespace physx;
using namespace shdfnd;

cloth::CuTether::CuTether(uint16_t anchor, uint16_t length) : mAnchor(anchor), mLength(length)
{
}

// Build a device-resident fabric: uploads phases/sets/restvalues, converts
// 32-bit index data to 16-bit, quantizes tether lengths to uint16 with a
// recoverable scale, and registers itself with the factory.
// The CuContextLock base acquires the CUDA context on entry; it is released
// explicitly at the end of the body once all device uploads are done.
cloth::CuFabric::CuFabric(CuFactory& factory, uint32_t numParticles, Range<const uint32_t> phases,
                          Range<const uint32_t> sets, Range<const float> restvalues, Range<const uint32_t> indices,
                          Range<const uint32_t> anchors, Range<const float> tetherLengths,
                          Range<const uint32_t> triangles, uint32_t id)
: CuContextLock(factory)
, mFactory(factory)
, mNumParticles(numParticles)
, mPhases(mFactory.mContextManager, phases.begin(), phases.end())
, mSets(mFactory.mContextManager)
, mRestvalues(mFactory.mContextManager, restvalues.begin(), restvalues.end())
, mIndices(mFactory.mContextManager)
, mTethers(mFactory.mContextManager)
, mTriangles(mFactory.mContextManager)
, mId(id)
{
	// should no longer be prefixed with 0
	PX_ASSERT(sets.front() != 0);

	PX_ASSERT(sets.back() == restvalues.size());
	PX_ASSERT(restvalues.size() * 2 == indices.size());
	PX_ASSERT(mNumParticles > *maxElement(indices.begin(), indices.end()));

	// copy to device, add leading zero
	mSets.reserve(sets.size() + 1);
	mSets.push_back(0);
	mSets.push_back(sets.begin(), sets.end());

	// manually convert uint32_t indices to uint16_t in temp memory
	Vector<uint16_t>::Type hostIndices;
	hostIndices.resizeUninitialized(indices.size());
	Vector<uint16_t>::Type::Iterator dIt = hostIndices.begin();

	const uint32_t* it = indices.begin();
	const uint32_t* end = indices.end();
	for(; it != end; ++it, ++dIt)
		*dIt = uint16_t(*it);

	// copy to device vector in one go
	mIndices.assign(hostIndices.begin(), hostIndices.end());

	// gather data per phase: each phase records its constraint count and
	// device pointers into the restvalue/index buffers for its set
	mNumConstraintsInPhase.reserve(phases.size());
	CuDevicePointer<const float> devRestvalues = mRestvalues.begin();
	CuDevicePointer<const uint16_t> devIndices = mIndices.begin();
	for(const uint32_t* pIt = phases.begin(); pIt != phases.end(); ++pIt)
	{
		uint32_t setIndex = *pIt;
		uint32_t firstIndex = setIndex ? sets[setIndex - 1] : 0;
		uint32_t lastIndex = sets[setIndex];
		mNumConstraintsInPhase.pushBack(lastIndex - firstIndex);
		mRestvaluesInPhase.pushBack(devRestvalues + firstIndex);
		mIndicesInPhase.pushBack(devIndices + 2 * firstIndex);
	}

	// tethers: lengths are quantized to uint16 against the maximum length;
	// mTetherLengthScale converts back to float at solve time
	PX_ASSERT(anchors.size() == tetherLengths.size());
	mTetherLengthScale =
	    tetherLengths.empty() ? 1.0f : *maxElement(tetherLengths.begin(), tetherLengths.end()) / USHRT_MAX;
	float inverseScale = 1 / (mTetherLengthScale + FLT_EPSILON);
	Vector<CuTether>::Type tethers;
	tethers.reserve(anchors.size());
	for(; !anchors.empty(); anchors.popFront(), tetherLengths.popFront())
	{
		tethers.pushBack(CuTether(uint16_t(anchors.front()), uint16_t(tetherLengths.front() * inverseScale + 0.5f)));
	}
	mTethers.assign(tethers.begin(), tethers.end());

	// triangles (same uint32 -> uint16 narrowing, reusing the temp buffer)
	hostIndices.resizeUninitialized(triangles.size());
	dIt = hostIndices.begin();

	it = triangles.begin();
	end = triangles.end();
	for(; it != end; ++it, ++dIt)
		*dIt = uint16_t(*it);

	mTriangles.assign(hostIndices.begin(), hostIndices.end());

	// balance the acquire done by the CuContextLock base constructor
	CuContextLock::release();

	// add to factory
	mFactory.mFabrics.pushBack(this);
}

// Re-acquire the context so the device vectors can free their memory; the
// CuContextLock base destructor releases it again afterwards.
cloth::CuFabric::~CuFabric()
{
	CuContextLock::acquire();

	Vector<CuFabric*>::Type::Iterator fIt = mFactory.mFabrics.find(this);

	PX_ASSERT(fIt != mFactory.mFabrics.end());
	mFactory.mFabrics.replaceWithLast(fIt);
}

cloth::Factory& physx::cloth::CuFabric::getFactory() const
{
	return mFactory;
}

uint32_t cloth::CuFabric::getNumPhases() const
{
	return uint32_t(mPhases.size());
}

uint32_t cloth::CuFabric::getNumRestvalues() const
{
	return uint32_t(mRestvalues.size());
}

// minus one for the leading zero added in the constructor
uint32_t cloth::CuFabric::getNumSets() const
{
	return uint32_t(mSets.size() - 1);
}

uint32_t cloth::CuFabric::getNumIndices() const
{
	return uint32_t(mIndices.size());
}

uint32_t cloth::CuFabric::getNumParticles() const
{
	return mNumParticles;
}

uint32_t physx::cloth::CuFabric::getNumTethers() const
{
	return uint32_t(mTethers.size());
}

uint32_t physx::cloth::CuFabric::getNumTriangles() const
{
	return uint32_t(mTriangles.size()) / 3;
}

// Scale all rest values: round-trips the buffer through host memory because
// there is no device-side multiply available here.
void physx::cloth::CuFabric::scaleRestvalues(float scale)
{
	CuContextLock contextLock(mFactory);

	Vector<float>::Type restvalues(uint32_t(mRestvalues.size()));
	mFactory.copyToHost(mRestvalues.begin().get(), mRestvalues.end().get(), restvalues.begin());

	Vector<float>::Type::Iterator rIt, rEnd = restvalues.end();
	for(rIt = restvalues.begin(); rIt != rEnd; ++rIt)
		*rIt *= scale;

	mRestvalues = restvalues;
}

void physx::cloth::CuFabric::scaleTetherLengths(float scale)
{
	// cloth instances won't pick this up until CuClothData is dirty!
	mTetherLengthScale *= scale;
}
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/windows/CuFabric.h b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuFabric.h new file mode 100644 index 00000000..93f787f8 --- /dev/null +++ b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuFabric.h @@ -0,0 +1,102 @@
// This code contains NVIDIA Confidential Information and is disclosed to you
// under a form of NVIDIA software license agreement provided separately to you.
//
// Notice
// NVIDIA Corporation and its licensors retain all intellectual property and
// proprietary rights in and to this software and related documentation and
// any modifications thereto. Any use, reproduction, disclosure, or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA Corporation is strictly prohibited.
//
// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.".
NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 

#pragma once

#include "Fabric.h"
#include "Range.h"
#include "Types.h"
#include "Allocator.h"
#include "CuContextLock.h"
#include "CuDeviceVector.h"

namespace physx
{

namespace cloth
{

// Compact device-side tether: anchor particle index plus a quantized length
// (scaled back to float via CuFabric::mTetherLengthScale).
struct CuTether
{
	CuTether(uint16_t, uint16_t);
	uint16_t mAnchor;
	uint16_t mLength;
};

// CUDA implementation of cloth::Fabric: owns the device-resident constraint
// topology (phases, sets, rest values, indices, tethers, triangles) shared by
// all cloths instantiated from it.
class CuFabric : public UserAllocated, private CuContextLock, public Fabric
{
	PX_NOCOPY(CuFabric)
  public:
	CuFabric(CuFactory& factory, uint32_t numParticles, Range<const uint32_t> phases, Range<const uint32_t> sets,
	         Range<const float> restvalues, Range<const uint32_t> indices, Range<const uint32_t> anchors,
	         Range<const float> tetherLengths, Range<const uint32_t> triangles, uint32_t id);

	virtual ~CuFabric();

	virtual Factory& getFactory() const;

	virtual uint32_t getNumPhases() const;
	virtual uint32_t getNumRestvalues() const;

	virtual uint32_t getNumSets() const;
	virtual uint32_t getNumIndices() const;

	virtual uint32_t getNumParticles() const;

	virtual uint32_t getNumTethers() const;

	virtual uint32_t getNumTriangles() const;

	virtual void scaleRestvalues(float);
	virtual void scaleTetherLengths(float);

  public:
	CuFactory& mFactory;

	uint32_t mNumParticles;

	CuDeviceVector<uint32_t> mPhases; // index of set to use
	CuDeviceVector<uint32_t> mSets;   // offset of first restvalue, with 0 prefix

	CuDeviceVector<float> mRestvalues;
	CuDeviceVector<uint16_t> mIndices;

	CuDeviceVector<CuTether> mTethers;
	float mTetherLengthScale; // multiplier turning quantized CuTether::mLength back into world units

	CuDeviceVector<uint16_t> mTriangles;

	// Per-phase views into the device arrays above, precomputed in the ctor.
	Vector<uint32_t>::Type mNumConstraintsInPhase;
	Vector<CuDevicePointer<const float> >::Type mRestvaluesInPhase;
	Vector<CuDevicePointer<const uint16_t> >::Type mIndicesInPhase;

	uint32_t mId;
};
}
}
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/windows/CuFactory.cpp b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuFactory.cpp
new file mode 100644
index 00000000..8847780e
--- /dev/null
+++ 
b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuFactory.cpp @@ -0,0 +1,398 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 

#include "foundation/PxMemory.h"
#include "CuFactory.h"
#include "CuFabric.h"
#include "CuCloth.h"
#include "CuSolver.h"
#include "ClothImpl.h"
#include "CuCheckSuccess.h"
#include "CuContextLock.h"
#include "PsAllocator.h"
#include "Array.h"
#include "PsFoundation.h"
#include <cuda.h>

#if PX_VC
#pragma warning(disable : 4061 4062) // enumerator 'identifier' in switch of enum 'enumeration' is not handled
#endif

using namespace physx;
using namespace shdfnd;

namespace physx
{
namespace cloth
{
// defined in Factory.cpp
uint32_t getNextFabricId();

typedef Vec4T<uint32_t> Vec4u;
}
}

// Translates a CUDA driver API error code into its symbolic name and reports
// it through the foundation error callback. No-op on CUDA_SUCCESS.
void cloth::checkSuccessImpl(CUresult err, const char* file, const int line)
{
	if(err != CUDA_SUCCESS)
	{
		const char* code = "Unknown";
		switch(err)
		{
// Stringize each known error code via the enumerator's own name.
#define ADD_CASE(X)                                                                                                    \
	case X:                                                                                                            \
		code = #X;                                                                                                     \
		break
			ADD_CASE(CUDA_ERROR_INVALID_VALUE);
			ADD_CASE(CUDA_ERROR_OUT_OF_MEMORY);
			ADD_CASE(CUDA_ERROR_NOT_INITIALIZED);
			ADD_CASE(CUDA_ERROR_DEINITIALIZED);
			ADD_CASE(CUDA_ERROR_NO_DEVICE);
			ADD_CASE(CUDA_ERROR_INVALID_DEVICE);
			ADD_CASE(CUDA_ERROR_INVALID_IMAGE);
			ADD_CASE(CUDA_ERROR_INVALID_CONTEXT);
			ADD_CASE(CUDA_ERROR_MAP_FAILED);
			ADD_CASE(CUDA_ERROR_UNMAP_FAILED);
			ADD_CASE(CUDA_ERROR_ARRAY_IS_MAPPED);
			ADD_CASE(CUDA_ERROR_ALREADY_MAPPED);
			ADD_CASE(CUDA_ERROR_NO_BINARY_FOR_GPU);
			ADD_CASE(CUDA_ERROR_ALREADY_ACQUIRED);
			ADD_CASE(CUDA_ERROR_NOT_MAPPED);
			ADD_CASE(CUDA_ERROR_NOT_MAPPED_AS_ARRAY);
			ADD_CASE(CUDA_ERROR_NOT_MAPPED_AS_POINTER);
			ADD_CASE(CUDA_ERROR_ECC_UNCORRECTABLE);
			ADD_CASE(CUDA_ERROR_UNSUPPORTED_LIMIT);
			ADD_CASE(CUDA_ERROR_INVALID_SOURCE);
			ADD_CASE(CUDA_ERROR_FILE_NOT_FOUND);
			ADD_CASE(CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND);
			ADD_CASE(CUDA_ERROR_SHARED_OBJECT_INIT_FAILED);
			ADD_CASE(CUDA_ERROR_OPERATING_SYSTEM);
			ADD_CASE(CUDA_ERROR_INVALID_HANDLE);
			ADD_CASE(CUDA_ERROR_NOT_FOUND);
			ADD_CASE(CUDA_ERROR_NOT_READY);
			ADD_CASE(CUDA_ERROR_LAUNCH_FAILED);
			ADD_CASE(CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES);
			ADD_CASE(CUDA_ERROR_LAUNCH_TIMEOUT);
			ADD_CASE(CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING);
		// Unrecognized codes fall through to the UNKNOWN label.
		default:
			ADD_CASE(CUDA_ERROR_UNKNOWN);
#undef ADD_CASE
		}
		shdfnd::getFoundation().error(PxErrorCode::eINTERNAL_ERROR, file, line, "CUDA error: %s", code);
	}
}

namespace
{
// returns max threads as specified by launch bounds in CuSolverKernel.cu
uint32_t getMaxThreadsPerBlock(const physx::PxCudaContextManager& contextManager)
{
	if(contextManager.supportsArchSM30())
		return 1024;

	if(contextManager.supportsArchSM20())
		return 512;

	return 192;
}
}

// Factory for CUDA-backed cloth objects. The thread-block size is fixed at
// construction from the device's architecture.
cloth::CuFactory::CuFactory(physx::PxCudaContextManager* contextManager)
: Factory(CUDA)
, mContextManager(contextManager)
, mNumThreadsPerBlock(getMaxThreadsPerBlock(*contextManager))
, mMaxThreadsPerBlock(mNumThreadsPerBlock)
{
}

cloth::CuFactory::~CuFactory()
{
}

cloth::Fabric* cloth::CuFactory::createFabric(uint32_t numParticles, Range<const uint32_t> phases,
                                              Range<const uint32_t> sets, Range<const float> restvalues,
                                              Range<const uint32_t> indices, Range<const uint32_t> anchors,
                                              Range<const float> tetherLengths, Range<const uint32_t> triangles)
{
	return new CuFabric(*this, numParticles, phases, sets, restvalues, indices, anchors, tetherLengths, triangles,
	                    getNextFabricId());
}

cloth::Cloth* cloth::CuFactory::createCloth(Range<const PxVec4> particles, Fabric& fabric)
{
	return new CuClothImpl(*this, fabric, particles);
}

// Returns NULL if the solver failed to initialize (caller must check).
cloth::Solver* cloth::CuFactory::createSolver(physx::PxTaskManager*)
{
	CuSolver* solver = new CuSolver(*this);

	if(solver->hasError())
	{
		delete solver;
		return NULL;
	}

	return solver;
}

// CuFactory::clone() implemented in CuClothClone.cpp

// Synchronous device-to-host copy of the byte range [srcIt, srcEnd).
void cloth::CuFactory::copyToHost(const void* srcIt, const void* srcEnd, void* dstIt) const
{
	CuContextLock contextLock(*this);

	checkSuccess(cuMemcpyDtoH(dstIt, CUdeviceptr(srcIt), size_t(intptr_t(srcEnd) - intptr_t(srcIt))));
}

void
cloth::CuFactory::extractFabricData(const Fabric& fabric, Range<uint32_t> phases, Range<uint32_t> sets,
                                    Range<float> restvalues, Range<uint32_t> indices, Range<uint32_t> anchors,
                                    Range<float> tetherLengths, Range<uint32_t> triangles) const
{
	// Downloads the requested parts of the fabric from the device; each output
	// range may be empty, in which case that part is skipped. Non-empty ranges
	// must be sized exactly (asserted below).
	CuContextLock contextLock(*this);

	const CuFabric& cuFabric = static_cast<const CuFabric&>(fabric);

	if(!phases.empty())
	{
		PX_ASSERT(phases.size() == cuFabric.mPhases.size());
		const uint32_t* devicePhases = cuFabric.mPhases.begin().get();
		copyToHost(devicePhases, devicePhases + cuFabric.mPhases.size(), phases.begin());
	}

	if(!restvalues.empty())
	{
		PX_ASSERT(restvalues.size() == cuFabric.mRestvalues.size());
		const float* deviceRestvalues = cuFabric.mRestvalues.begin().get();
		copyToHost(deviceRestvalues, deviceRestvalues + cuFabric.mRestvalues.size(), restvalues.begin());
	}

	if(!sets.empty())
	{
		// mSets carries a leading 0-prefix entry that callers don't see; skip it.
		PX_ASSERT(sets.size() == cuFabric.mSets.size() - 1);
		const uint32_t* deviceSets = cuFabric.mSets.begin().get();
		copyToHost(deviceSets + 1, deviceSets + cuFabric.mSets.size(), sets.begin());
	}

	if(!indices.empty())
	{
		PX_ASSERT(indices.size() == cuFabric.mIndices.size());
		const uint16_t* deviceIndices = cuFabric.mIndices.begin().get();
		// Download the 16-bit indices into the front half of the caller's
		// 32-bit buffer, then widen in place.
		uint16_t* hostIndices = reinterpret_cast<uint16_t*>(indices.begin());
		copyToHost(deviceIndices, deviceIndices + cuFabric.mIndices.size(), hostIndices);

		// convert from 16bit to 32bit indices
		// (iterate backwards so unread 16-bit values aren't overwritten)
		for(uint32_t i = indices.size(); 0 < i--;)
			indices[i] = hostIndices[i];
	}

	if(!anchors.empty() || !tetherLengths.empty())
	{
		uint32_t numTethers = uint32_t(cuFabric.mTethers.size());
		Vector<CuTether>::Type tethers(numTethers, CuTether(0, 0));
		const CuTether* deviceTethers = cuFabric.mTethers.begin().get();
		copyToHost(deviceTethers, deviceTethers + numTethers, tethers.begin());

		PX_ASSERT(anchors.empty() || anchors.size() == tethers.size());
		for(uint32_t i = 0; !anchors.empty(); ++i, anchors.popFront())
			anchors.front() = tethers[i].mAnchor;

		// De-quantize lengths via the fabric's shared scale factor.
		PX_ASSERT(tetherLengths.empty() || tetherLengths.size() == tethers.size());
		for(uint32_t i = 0; !tetherLengths.empty(); ++i, tetherLengths.popFront())
			tetherLengths.front() = tethers[i].mLength * cuFabric.mTetherLengthScale;
	}

	if(!triangles.empty())
	{
		// todo triangles
	}
}

// Copies the cloth's collision shape data into the caller's buffers. All
// source arrays live in pinned host memory, so plain memcpy suffices.
void cloth::CuFactory::extractCollisionData(const Cloth& cloth, Range<PxVec4> spheres, Range<uint32_t> capsules,
                                            Range<PxVec4> planes, Range<uint32_t> convexes, Range<PxVec3> triangles) const
{
	PX_ASSERT(&cloth.getFactory() == this);

	const CuCloth& cuCloth = static_cast<const CuClothImpl&>(cloth).mCloth;

	PX_ASSERT(spheres.empty() || spheres.size() == cuCloth.mStartCollisionSpheres.size());
	PX_ASSERT(capsules.empty() || capsules.size() == cuCloth.mCapsuleIndices.size() * 2);
	PX_ASSERT(planes.empty() || planes.size() == cuCloth.mStartCollisionPlanes.size());
	PX_ASSERT(convexes.empty() || convexes.size() == cuCloth.mConvexMasks.size());
	PX_ASSERT(triangles.empty() || triangles.size() == cuCloth.mStartCollisionTriangles.size());

	// collision spheres are in pinned memory, so memcpy directly
	if(!cuCloth.mStartCollisionSpheres.empty() && !spheres.empty())
		memcpy(spheres.begin(), &cuCloth.mStartCollisionSpheres.front(),
		       cuCloth.mStartCollisionSpheres.size() * sizeof(PxVec4));

	if(!cuCloth.mCapsuleIndices.empty() && !capsules.empty())
		memcpy(capsules.begin(), &cuCloth.mCapsuleIndices.front(), cuCloth.mCapsuleIndices.size() * sizeof(IndexPair));

	if(!cuCloth.mStartCollisionPlanes.empty() && !planes.empty())
		memcpy(planes.begin(), &cuCloth.mStartCollisionPlanes.front(),
		       cuCloth.mStartCollisionPlanes.size() * sizeof(PxVec4));

	if(!cuCloth.mConvexMasks.empty() && !convexes.empty())
		memcpy(convexes.begin(), &cuCloth.mConvexMasks.front(), cuCloth.mConvexMasks.size() * sizeof(uint32_t));

	if(!cuCloth.mStartCollisionTriangles.empty() && !triangles.empty())
		memcpy(triangles.begin(),
		       &cuCloth.mStartCollisionTriangles.front(),
		       cuCloth.mStartCollisionTriangles.size() * sizeof(PxVec3));
}

// Copies the current motion constraints to the host. Prefers the host-side
// mirror when present; otherwise downloads the device copy (target if set,
// else start).
void cloth::CuFactory::extractMotionConstraints(const Cloth& cloth, Range<PxVec4> destConstraints) const
{
	PX_ASSERT(&cloth.getFactory() == this);

	const CuCloth& cuCloth = static_cast<const CuClothImpl&>(cloth).mCloth;

	if(cuCloth.mMotionConstraints.mHostCopy.size())
	{
		PX_ASSERT(destConstraints.size() == cuCloth.mMotionConstraints.mHostCopy.size());

		PxMemCopy(destConstraints.begin(), cuCloth.mMotionConstraints.mHostCopy.begin(),
		          sizeof(PxVec4) * cuCloth.mMotionConstraints.mHostCopy.size());
	}
	else
	{
		CuContextLock contextLock(*this);

		CuDeviceVector<PxVec4> const& srcConstraints = !cuCloth.mMotionConstraints.mTarget.empty()
		                                                   ? cuCloth.mMotionConstraints.mTarget
		                                                   : cuCloth.mMotionConstraints.mStart;

		PX_ASSERT(destConstraints.size() == srcConstraints.size());

		copyToHost(srcConstraints.begin().get(), srcConstraints.end().get(), destConstraints.begin());
	}
}

// Same host-mirror-first strategy as extractMotionConstraints, for the
// separation constraints.
void cloth::CuFactory::extractSeparationConstraints(const Cloth& cloth, Range<PxVec4> destConstraints) const
{
	PX_ASSERT(&cloth.getFactory() == this);

	const CuCloth& cuCloth = static_cast<const CuClothImpl&>(cloth).mCloth;

	if(cuCloth.mSeparationConstraints.mHostCopy.size())
	{
		PX_ASSERT(destConstraints.size() == cuCloth.mSeparationConstraints.mHostCopy.size());

		PxMemCopy(destConstraints.begin(), cuCloth.mSeparationConstraints.mHostCopy.begin(),
		          sizeof(PxVec4) * cuCloth.mSeparationConstraints.mHostCopy.size());
	}
	else
	{
		CuContextLock contextLock(*this);

		CuDeviceVector<PxVec4> const& srcConstraints = !cuCloth.mSeparationConstraints.mTarget.empty()
		                                                   ? cuCloth.mSeparationConstraints.mTarget
		                                                   : cuCloth.mSeparationConstraints.mStart;

		PX_ASSERT(destConstraints.size() == srcConstraints.size());

		copyToHost(srcConstraints.begin().get(), srcConstraints.end().get(), destConstraints.begin());
	}
}

// Copies particle accelerations from the host mirror only; if no host copy
// exists, the destination is intentionally left untouched.
void cloth::CuFactory::extractParticleAccelerations(const Cloth& cloth, Range<PxVec4> destAccelerations) const
{
	PX_ASSERT(&cloth.getFactory() == this);

	const CuCloth& cuCloth = static_cast<const CuClothImpl&>(cloth).mCloth;

	if(cuCloth.mParticleAccelerationsHostCopy.size())
	{
		PX_ASSERT(destAccelerations.size() == cuCloth.mParticleAccelerationsHostCopy.size());

		PxMemCopy(destAccelerations.begin(), cuCloth.mParticleAccelerationsHostCopy.begin(),
		          sizeof(PxVec4) * cuCloth.mParticleAccelerationsHostCopy.size());
	}
}

// Downloads virtual-particle weights (PxVec4 -> PxVec3) and indices
// (Vec4us -> 32-bit) with on-the-fly conversion.
void cloth::CuFactory::extractVirtualParticles(const Cloth& cloth, Range<uint32_t[4]> destIndices,
                                               Range<PxVec3> destWeights) const
{
	PX_ASSERT(&cloth.getFactory() == this);

	CuContextLock contextLock(*this);

	const CuCloth& cuCloth = static_cast<const CuClothImpl&>(cloth).mCloth;

	if(destWeights.size() > 0)
	{
		uint32_t numWeights = cloth.getNumVirtualParticleWeights();

		Vector<PxVec4>::Type hostWeights(numWeights, PxVec4(0.0f));
		copyToHost(cuCloth.mVirtualParticleWeights.begin().get(), cuCloth.mVirtualParticleWeights.end().get(),
		           &hostWeights.front());

		// convert weights to Vec3f
		PxVec3* destIt = reinterpret_cast<PxVec3*>(destWeights.begin());
		Vector<PxVec4>::Type::ConstIterator srcIt = hostWeights.begin();
		Vector<PxVec4>::Type::ConstIterator srcEnd = srcIt + numWeights;
		for(; srcIt != srcEnd; ++srcIt, ++destIt)
			*destIt = reinterpret_cast<const PxVec3&>(*srcIt);

		PX_ASSERT(destIt <= destWeights.end());
	}

	if(destIndices.size() > 0)
	{
		uint32_t numIndices = cloth.getNumVirtualParticles();

		Vector<Vec4us>::Type hostIndices(numIndices);
		copyToHost(cuCloth.mVirtualParticleIndices.begin().get(), cuCloth.mVirtualParticleIndices.end().get(),
		           &hostIndices.front());

		// convert indices to 32 bit
		Vec4u* destIt = reinterpret_cast<Vec4u*>(destIndices.begin());
		Vector<Vec4us>::Type::ConstIterator srcIt = hostIndices.begin();
		Vector<Vec4us>::Type::ConstIterator srcEnd = srcIt + numIndices;
		for(; srcIt != srcEnd; ++srcIt, ++destIt)
			*destIt = Vec4u(*srcIt);

		PX_ASSERT(&array(*destIt) <= destIndices.end());
	}
}

// Straight device download of the self-collision index array.
void cloth::CuFactory::extractSelfCollisionIndices(const Cloth& cloth, Range<uint32_t> destIndices) const
{
	const CuCloth& cuCloth = static_cast<const CuClothImpl&>(cloth).mCloth;
	PX_ASSERT(destIndices.size() == cuCloth.mSelfCollisionIndices.size());
	copyToHost(cuCloth.mSelfCollisionIndices.begin().get(), cuCloth.mSelfCollisionIndices.end().get(),
	           destIndices.begin());
}

// Straight device download of the rest-position array.
void cloth::CuFactory::extractRestPositions(const Cloth& cloth, Range<PxVec4> destRestPositions) const
{
	const CuCloth& cuCloth = static_cast<const CuClothImpl&>(cloth).mCloth;
	PX_ASSERT(destRestPositions.size() == cuCloth.mRestPositions.size());
	copyToHost(cuCloth.mRestPositions.begin().get(), cuCloth.mRestPositions.end().get(), destRestPositions.begin());
}
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/windows/CuFactory.h b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuFactory.h
new file mode 100644
index 00000000..e868034f
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuFactory.h
@@ -0,0 +1,107 @@
// This code contains NVIDIA Confidential Information and is disclosed to you
// under a form of NVIDIA software license agreement provided separately to you.
//
// Notice
// NVIDIA Corporation and its licensors retain all intellectual property and
// proprietary rights in and to this software and related documentation and
// any modifications thereto.
Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 

#pragma once

#include "Factory.h"
#include "Allocator.h"

namespace physx
{
class PxCudaContextManager;
}

namespace physx
{

namespace cloth
{

class CuFabric;
class CuCloth;
template <typename>
class ClothImpl;

// CUDA implementation of cloth::Factory: creates CUDA-backed fabrics, cloths
// and solvers, and provides device-to-host extraction of their data.
class CuFactory : public UserAllocated, public Factory
{
  protected:
	CuFactory& operator=(const CuFactory&);

  public:
	typedef CuFabric FabricType;
	typedef ClothImpl<CuCloth> ImplType;

	CuFactory(physx::PxCudaContextManager*);
	virtual ~CuFactory();

	virtual Fabric* createFabric(uint32_t numParticles, Range<const uint32_t> phases, Range<const uint32_t> sets,
	                             Range<const float> restvalues, Range<const uint32_t> indices,
	                             Range<const uint32_t> anchors, Range<const float> tetherLengths,
	                             Range<const uint32_t> triangles);

	virtual Cloth* createCloth(Range<const PxVec4> particles, Fabric& fabric);

	virtual Solver* createSolver(physx::PxTaskManager* taskMgr);

	virtual Cloth* clone(const Cloth& cloth);

	virtual void extractFabricData(const Fabric& fabric, Range<uint32_t> phases, Range<uint32_t> sets,
	                               Range<float> restvalues, Range<uint32_t> indices, Range<uint32_t> anchors,
	                               Range<float> tetherLengths, Range<uint32_t> triangles) const;

	virtual void extractCollisionData(const Cloth& cloth, Range<PxVec4> spheres, Range<uint32_t> capsules,
	                                  Range<PxVec4> planes, Range<uint32_t> convexes, Range<PxVec3> triangles) const;

	virtual void extractMotionConstraints(const Cloth& cloth, Range<PxVec4> destConstraints) const;

	virtual void extractSeparationConstraints(const Cloth& cloth, Range<PxVec4> destConstraints) const;

	virtual void extractParticleAccelerations(const Cloth& cloth, Range<PxVec4> destAccelerations) const;

	virtual void extractVirtualParticles(const Cloth& cloth, Range<uint32_t[4]> destIndices,
	                                     Range<PxVec3> destWeights) const;

	virtual void extractSelfCollisionIndices(const Cloth& cloth, Range<uint32_t> destIndices) const;

	virtual void extractRestPositions(const Cloth& cloth, Range<PxVec4> destRestPositions) const;

  public:
	// Synchronous device-to-host copy of the byte range [srcIt, srcEnd).
	void copyToHost(const void* srcIt, const void* srcEnd, void* dstIt) const;

  public:
	Vector<CuFabric*>::Type mFabrics; // all live fabrics created by this factory

	physx::PxCudaContextManager* mContextManager;

	uint32_t mNumThreadsPerBlock;

	const uint32_t mMaxThreadsPerBlock; // architecture limit fixed at construction
};
}
}
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/windows/CuPhaseConfig.h b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuPhaseConfig.h
new file mode 100644
index 00000000..74470bde
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuPhaseConfig.h
@@ -0,0 +1,51 @@
// This code contains NVIDIA Confidential Information and is disclosed to you
// under a form of NVIDIA software license agreement provided separately to you.
//
// Notice
// NVIDIA Corporation and its licensors retain all intellectual property and
// proprietary rights in and to this software and related documentation and
// any modifications thereto. Any use, reproduction, disclosure, or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA Corporation is strictly prohibited.
//
// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
//
// Information and code furnished is believed to be accurate and reliable.
// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
// information or for any infringement of patents or other rights of third parties that may
// result from its use. No license is granted by implication or otherwise under any patent
// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
// This code supersedes and replaces all information previously supplied.
// NVIDIA Corporation products are not authorized for use as critical
// components in life support devices or systems without express written approval of
// NVIDIA Corporation.
//
// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.

#pragma once

#include "Types.h"

namespace physx
{
namespace cloth
{

// Per-phase solver parameters plus raw device pointers into the fabric's
// constraint data; consumed by the CUDA solver kernel.
struct CuPhaseConfig
{
	float mStiffness;
	float mStiffnessMultiplier;
	float mCompressionLimit;
	float mStretchLimit;

	uint32_t mNumConstraints;
	const float* mRestvalues;   // one rest value per constraint
	const uint16_t* mIndices;   // two particle indices per constraint
};
}
}
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/windows/CuPinnedAllocator.h b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuPinnedAllocator.h
new file mode 100644
index 00000000..57dd6731
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuPinnedAllocator.h
@@ -0,0 +1,132 @@
// This code contains NVIDIA Confidential Information and is disclosed to you
// under a form of NVIDIA software license agreement provided separately to you.
//
// Notice
// NVIDIA Corporation and its licensors retain all intellectual property and
// proprietary rights in and to this software and related documentation and
// any modifications thereto. Any use, reproduction, disclosure, or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA Corporation is strictly prohibited.
//
// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
//
// Information and code furnished is believed to be accurate and reliable.
// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
// information or for any infringement of patents or other rights of third parties that may
// result from its use. No license is granted by implication or otherwise under any patent
// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
// This code supersedes and replaces all information previously supplied.
// NVIDIA Corporation products are not authorized for use as critical
// components in life support devices or systems without express written approval of
// NVIDIA Corporation.
//
// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.

#pragma once

#include "cudamanager/PxCudaContextManager.h"
#include "cudamanager/PxCudaMemoryManager.h"
#include "Allocator.h"
#include "CuCheckSuccess.h"
#include <cuda.h>

namespace physx
{

namespace cloth
{

// STL-style allocator that obtains host memory from the PxCudaMemoryManager.
// The memory space is selected by cudaHostAlloc* flags; for mapped memory the
// matching device pointer of the most recent allocation is cached.
struct CuHostAllocator
{
	CuHostAllocator(physx::PxCudaContextManager* ctx = NULL, unsigned int flags = cudaHostAllocDefault)
	: mDevicePtr(0), mFlags(flags), mManager(0)
	{
		PX_ASSERT(ctx);

		if(ctx)
			mManager = ctx->getMemoryManager();
	}

	// Allocates n bytes from the memory space implied by mFlags. The unused
	// parameters mirror the shdfnd allocator signature (file/line tags).
	void* allocate(size_t n, const char*, int)
	{
		physx::PxCudaBufferPtr bufferPtr;

		PX_ASSERT(mManager);

		// write-combined takes precedence over mapped when both flags are set
		if(mFlags & cudaHostAllocWriteCombined)
			bufferPtr = mManager->alloc(physx::PxCudaBufferMemorySpace::T_WRITE_COMBINED, n,
			                            PX_ALLOC_INFO("cloth::CuHostAllocator::T_WRITE_COMBINED", CLOTH));
		else if(mFlags & cudaHostAllocMapped)
			bufferPtr = mManager->alloc(physx::PxCudaBufferMemorySpace::T_PINNED_HOST, n,
			                            PX_ALLOC_INFO("cloth::CuHostAllocator::T_PINNED_HOST", CLOTH));
		else
			bufferPtr = mManager->alloc(physx::PxCudaBufferMemorySpace::T_HOST, n,
			                            PX_ALLOC_INFO("cloth::CuHostAllocator::T_HOST", CLOTH));

		// cache the device-side alias of this mapped allocation
		if(mFlags & cudaHostAllocMapped)
			checkSuccess(cuMemHostGetDevicePointer(&mDevicePtr, reinterpret_cast<void*>(bufferPtr), 0));

		return reinterpret_cast<void*>(bufferPtr);
	}

	// Returns p to the same memory space it was allocated from.
	void deallocate(void* p)
	{
		PX_ASSERT(mManager);

		if(mFlags & cudaHostAllocWriteCombined)
			mManager->free(physx::PxCudaBufferMemorySpace::T_WRITE_COMBINED, physx::PxCudaBufferPtr(p));
		else if(mFlags & cudaHostAllocMapped)
			mManager->free(physx::PxCudaBufferMemorySpace::T_PINNED_HOST, physx::PxCudaBufferPtr(p));
		else
			mManager->free(physx::PxCudaBufferMemorySpace::T_HOST, physx::PxCudaBufferPtr(p));

		// don't reset mDevicePtr because Array::recreate deallocates last
	}

	CUdeviceptr mDevicePtr; // device pointer of last allocation
	unsigned int mFlags;
	physx::PxCudaMemoryManager* mManager;
};

// Convenience factory for a mapped + write-combined allocator.
// NOTE(review): the template parameter T is unused here; it appears to exist
// only so call sites can name the element type -- confirm before removing.
template <typename T>
CuHostAllocator getMappedAllocator(physx::PxCudaContextManager* ctx)
{
	return CuHostAllocator(ctx, cudaHostAllocMapped | cudaHostAllocWriteCombined);
}

template <typename T>
struct CuPinnedVector
{
	// note: always use shdfnd::swap() instead of Array::swap()
	// in order to keep cached device pointer consistent
	typedef shdfnd::Array<T, typename physx::cloth::CuHostAllocator> Type;
};

// Returns the device-side alias of a pinned vector's storage (NULL if empty).
template <typename T>
T* getDevicePointer(shdfnd::Array<T, typename physx::cloth::CuHostAllocator>& vector)
{
	// cached device pointer only valid if non-empty
	return vector.empty() ? 0 : reinterpret_cast<T*>(vector.getAllocator().mDevicePtr);
}

} // namespace cloth

} // namespace physx

namespace physx
{
namespace shdfnd
{
// swap overload that also swaps the allocators so each vector keeps the
// device pointer cached for its own storage.
template <typename T>
void swap(Array<T, typename physx::cloth::CuHostAllocator>& left, Array<T, typename physx::cloth::CuHostAllocator>& right)
{
	swap(left.getAllocator(), right.getAllocator());
	left.swap(right);
}
}
}
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/windows/CuSelfCollision.h b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuSelfCollision.h
new file mode 100644
index 00000000..fb0fd7af
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuSelfCollision.h
@@ -0,0 +1,472 @@
// This code contains NVIDIA Confidential Information and is disclosed to you
// under a form of NVIDIA software license agreement provided separately to you.
//
// Notice
// NVIDIA Corporation and its licensors retain all intellectual property and
// proprietary rights in and to this software and related documentation and
// any modifications thereto. Any use, reproduction, disclosure, or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA Corporation is strictly prohibited.
//
// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
//
// Information and code furnished is believed to be accurate and reliable.
// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
// information or for any infringement of patents or other rights of third parties that may
// result from its use. No license is granted by implication or otherwise under any patent
// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#pragma once + +#ifndef CU_SOLVER_KERNEL_CU +#error include CuSelfCollision.h only from CuSolverKernel.cu +#endif + +#ifndef UINT16_MAX +#define UINT16_MAX 0xffff +#endif + +namespace +{ +#if __CUDA_ARCH__ >= 300 +template <int> +__device__ void scanWarp(Pointer<Shared, int32_t> counts) +{ + asm volatile("{" + " .reg .s32 tmp;" + " .reg .pred p;" + " shfl.up.b32 tmp|p, %0, 0x01, 0x0;" + "@p add.s32 %0, tmp, %0;" + " shfl.up.b32 tmp|p, %0, 0x02, 0x0;" + "@p add.s32 %0, tmp, %0;" + " shfl.up.b32 tmp|p, %0, 0x04, 0x0;" + "@p add.s32 %0, tmp, %0;" + " shfl.up.b32 tmp|p, %0, 0x08, 0x0;" + "@p add.s32 %0, tmp, %0;" + " shfl.up.b32 tmp|p, %0, 0x10, 0x0;" + "@p add.s32 %0, tmp, %0;" + "}" + : "+r"(*generic(counts)) + :); +} +#else +template <int stride> +__device__ void scanWarp(Pointer<Shared, int32_t> counts) +{ + volatile int32_t* ptr = generic(counts); + const int32_t laneIdx = threadIdx.x & warpSize - 1; + if(laneIdx >= 1) + *ptr += ptr[-stride]; + if(laneIdx >= 2) + *ptr += ptr[-2 * stride]; + if(laneIdx >= 4) + *ptr += ptr[-4 * stride]; + if(laneIdx >= 8) + *ptr += ptr[-8 * stride]; + if(laneIdx >= 16) + *ptr += ptr[-16 * stride]; +} +#endif + +// sorts array by upper 16bits +// [keys] must be at least 2*n in length, in/out in first n elements +// [histogram] must be at least 34*16 = 544 in length +__device__ void radixSort(int32_t* keys, int32_t n, Pointer<Shared, int32_t> histogram) +{ + const int32_t numWarps = blockDim.x >> 5; + const int32_t warpIdx = threadIdx.x >> 5; + const int32_t laneIdx = 
threadIdx.x & warpSize - 1; + + const uint32_t laneMask = (1u << laneIdx) - 1; + const uint32_t mask1 = (threadIdx.x & 1) - 1; + const uint32_t mask2 = !!(threadIdx.x & 2) - 1; + const uint32_t mask4 = !!(threadIdx.x & 4) - 1; + const uint32_t mask8 = !!(threadIdx.x & 8) - 1; + + const int32_t tn = (n + blockDim.x - 1) / blockDim.x; + const int32_t startIndex = tn * (threadIdx.x - laneIdx) + laneIdx; + const int32_t endIndex = min(startIndex + tn * warpSize, n + 31 & ~31); // full warps for ballot + + int32_t* srcKeys = keys; + int32_t* dstKeys = keys + n; + + Pointer<Shared, int32_t> hIt = histogram + 16 * warpIdx; + Pointer<Shared, int32_t> pIt = histogram + 16 * laneIdx + 16; + Pointer<Shared, int32_t> tIt = histogram + 16 * numWarps + laneIdx; + + for(int32_t p = 16; p < 32; p += 4) // radix passes (4 bits each) + { + // gather bucket histograms per warp + int32_t warpCount = 0; + for(int32_t i = startIndex; i < endIndex; i += 32) + { + int32_t key = i < n ? srcKeys[i] >> p : 15; + uint32_t ballot1 = __ballot(key & 1); + uint32_t ballot2 = __ballot(key & 2); + uint32_t ballot4 = __ballot(key & 4); + uint32_t ballot8 = __ballot(key & 8); + warpCount += __popc((mask1 ^ ballot1) & (mask2 ^ ballot2) & (mask4 ^ ballot4) & (mask8 ^ ballot8)); + } + + if(laneIdx >= 16) + hIt[laneIdx] = warpCount; + + __syncthreads(); + + // prefix sum of histogram buckets + for(int32_t i = warpIdx; i < 16; i += numWarps) + scanWarp<16>(pIt + i); + + __syncthreads(); + + // prefix sum of bucket totals (exclusive) + if(threadIdx.x < 16) + { + *tIt = tIt[-1] & !threadIdx.x - 1; + scanWarp<1>(tIt); + hIt[threadIdx.x] = 0; + } + + __syncthreads(); + + if(laneIdx < 16) + hIt[laneIdx] += *tIt; + + // split indices + for(int32_t i = startIndex; i < endIndex; i += 32) + { + int32_t key = i < n ? 
srcKeys[i] >> p : 15; + uint32_t ballot1 = __ballot(key & 1); + uint32_t ballot2 = __ballot(key & 2); + uint32_t ballot4 = __ballot(key & 4); + uint32_t ballot8 = __ballot(key & 8); + uint32_t bits = ((key & 1) - 1 ^ ballot1) & (!!(key & 2) - 1 ^ ballot2) & (!!(key & 4) - 1 ^ ballot4) & + (!!(key & 8) - 1 ^ ballot8); + int32_t index = hIt[key & 15] + __popc(bits & laneMask); + + if(i < n) + dstKeys[index] = srcKeys[i]; + + if(laneIdx < 16) + hIt[laneIdx] += __popc((mask1 ^ ballot1) & (mask2 ^ ballot2) & (mask4 ^ ballot4) & (mask8 ^ ballot8)); + } + + __syncthreads(); + + ::swap(srcKeys, dstKeys); + } + +#ifndef NDEBUG + for(int32_t i = threadIdx.x; i < n; i += blockDim.x) + assert(!i || keys[i - 1] >> 16 <= keys[i] >> 16); +#endif +} +} + +namespace +{ +struct CuSelfCollision +{ + template <typename CurrentT> + __device__ void operator()(CurrentT& current); + + private: + template <typename CurrentT> + __device__ void buildAcceleration(const CurrentT& current); + template <bool useRestPositions, typename CurrentT> + __device__ void collideParticles(CurrentT& current) const; + + public: + float mPosBias[3]; + float mPosScale[3]; + const float* mPosPtr[3]; +}; +} + +__shared__ uninitialized<CuSelfCollision> gSelfCollideParticles; + +template <typename CurrentT> +__device__ void CuSelfCollision::operator()(CurrentT& current) +{ + if(min(gClothData.mSelfCollisionDistance, gFrameData.mSelfCollisionStiffness) <= 0.0f) + return; + + if(threadIdx.x < 3) + { + float upper = gFrameData.mParticleBounds[threadIdx.x * 2]; + float negativeLower = gFrameData.mParticleBounds[threadIdx.x * 2 + 1]; + + // expand bounds + float eps = (upper + negativeLower) * 1e-4f; + float expandedUpper = upper + eps; + float expandedNegativeLower = negativeLower + eps; + float expandedEdgeLength = expandedUpper + expandedNegativeLower; + + float* edgeLength = mPosBias; // use as temp + edgeLength[threadIdx.x] = expandedEdgeLength; + + __threadfence_block(); + + // calculate shortest axis + int32_t 
shortestAxis = edgeLength[0] > edgeLength[1]; + if(edgeLength[shortestAxis] > edgeLength[2]) + shortestAxis = 2; + + uint32_t writeAxis = threadIdx.x - shortestAxis; + writeAxis += writeAxis >> 30; + + float maxInvCellSize = __fdividef(127.0f, expandedEdgeLength); + float invCollisionDistance = __fdividef(1.0f, gClothData.mSelfCollisionDistance); + float invCellSize = min(maxInvCellSize, invCollisionDistance); + + mPosScale[writeAxis] = invCellSize; + mPosBias[writeAxis] = invCellSize * expandedNegativeLower; + mPosPtr[writeAxis] = generic(current[threadIdx.x]); + } + + __syncthreads(); + + buildAcceleration(current); + + if(gFrameData.mRestPositions) + collideParticles<true>(current); + else + collideParticles<false>(current); +} + +template <typename CurrentT> +__device__ void CuSelfCollision::buildAcceleration(const CurrentT& current) +{ + int32_t numIndices = gClothData.mNumSelfCollisionIndices; + const int32_t* indices = reinterpret_cast<const int32_t*>(gClothData.mSelfCollisionIndices); + int32_t* sortedKeys = reinterpret_cast<int32_t*>(gClothData.mSelfCollisionKeys); + int16_t* cellStart = reinterpret_cast<int16_t*>(gClothData.mSelfCollisionCellStart); + + typedef typename CurrentT::ConstPointerType ConstPointerType; + ConstPointerType rowPtr = ConstPointerType(mPosPtr[1]); + ConstPointerType colPtr = ConstPointerType(mPosPtr[2]); + + float rowScale = mPosScale[1], rowBias = mPosBias[1]; + float colScale = mPosScale[2], colBias = mPosBias[2]; + + // calculate keys + for(int32_t i = threadIdx.x; i < numIndices; i += blockDim.x) + { + int32_t index = indices ? 
indices[i] : i; + assert(index < gClothData.mNumParticles); + + int32_t rowIndex = int32_t(max(0.0f, min(rowPtr[index] * rowScale + rowBias, 127.5f))); + int32_t colIndex = int32_t(max(0.0f, min(colPtr[index] * colScale + colBias, 127.5f))); + assert(rowIndex >= 0 && rowIndex < 128 && colIndex >= 0 && colIndex < 128); + + int32_t key = (colIndex << 7 | rowIndex) + 129; // + row and column sentinel + assert(key <= 0x4080); + + sortedKeys[i] = key << 16 | index; // (key, index) pair in a single int32_t + } + __syncthreads(); + + // get scratch shared mem buffer used for radix sort(histogram) + Pointer<Shared, int32_t> buffer = + reinterpret_cast<Pointer<Shared, int32_t> const&>(gCollideParticles.get().mCurData.mSphereX); + + // sort keys (__synchthreads inside radix sort) + radixSort(sortedKeys, numIndices, buffer); + + // mark cell start if keys are different between neighboring threads + for(int32_t i = threadIdx.x; i < numIndices; i += blockDim.x) + { + int32_t key = sortedKeys[i] >> 16; + int32_t prevKey = i ? 
sortedKeys[i - 1] >> 16 : key - 1; + if(key != prevKey) + { + cellStart[key] = i; + cellStart[prevKey + 1] = i; + } + } + __syncthreads(); +} + +template <bool useRestPositions, typename CurrentT> +__device__ void CuSelfCollision::collideParticles(CurrentT& current) const +{ + const int32_t* sortedKeys = reinterpret_cast<const int32_t*>(gClothData.mSelfCollisionKeys); + float* sortedParticles = gClothData.mSelfCollisionParticles; + int16_t* cellStart = reinterpret_cast<int16_t*>(gClothData.mSelfCollisionCellStart); + + const float cdist = gClothData.mSelfCollisionDistance; + const float cdistSq = cdist * cdist; + + const int32_t numIndices = gClothData.mNumSelfCollisionIndices; + const int32_t numParticles = gClothData.mNumParticles; + + // point to particle copied in device memory that is being updated + float* xPtr = sortedParticles; + float* yPtr = sortedParticles + numParticles; + float* zPtr = sortedParticles + 2 * numParticles; + float* wPtr = sortedParticles + 3 * numParticles; + + // copy current particles to temporary array + for(int32_t i = threadIdx.x; i < numParticles; i += blockDim.x) + { + xPtr[i] = current(i, 0); + yPtr[i] = current(i, 1); + zPtr[i] = current(i, 2); + wPtr[i] = current(i, 3); + } + __syncthreads(); + + // copy only sorted (indexed) particles to shared mem + for(int32_t i = threadIdx.x; i < numIndices; i += blockDim.x) + { + int32_t index = sortedKeys[i] & UINT16_MAX; + current(i, 0) = xPtr[index]; + current(i, 1) = yPtr[index]; + current(i, 2) = zPtr[index]; + current(i, 3) = wPtr[index]; + } + __syncthreads(); + + typedef typename CurrentT::ConstPointerType ConstPointerType; + ConstPointerType rowPtr = ConstPointerType(mPosPtr[1]); + ConstPointerType colPtr = ConstPointerType(mPosPtr[2]); + + float rowScale = mPosScale[1], rowBias = mPosBias[1]; + float colScale = mPosScale[2], colBias = mPosBias[2]; + + for(int32_t i = threadIdx.x; i < numIndices; i += blockDim.x) + { + const int32_t index = sortedKeys[i] & UINT16_MAX; + 
assert(index < gClothData.mNumParticles); + + float restX, restY, restZ; + if(useRestPositions) + { + const float* restIt = gFrameData.mRestPositions + index * 4; + restX = restIt[0]; + restY = restIt[1]; + restZ = restIt[2]; + } + + float posX = current(i, 0); + float posY = current(i, 1); + float posZ = current(i, 2); + float posW = current(i, 3); + + float deltaX = 0.0f; + float deltaY = 0.0f; + float deltaZ = 0.0f; + float deltaW = FLT_EPSILON; + + // get cell index for this particle + int32_t rowIndex = int32_t(max(0.0f, min(rowPtr[i] * rowScale + rowBias, 127.5f))); + int32_t colIndex = int32_t(max(0.0f, min(colPtr[i] * colScale + colBias, 127.5f))); + assert(rowIndex >= 0 && rowIndex < 128 && colIndex >= 0 && colIndex < 128); + + int32_t key = colIndex << 7 | rowIndex; + assert(key <= 0x4080); + + // check cells in 3 columns + for(int32_t keyEnd = key + 256; key <= keyEnd; key += 128) + { + // cellStart keys of unoccupied cells have a value of -1 + uint32_t startIndex; // min<unsigned>(cellStart[key+0..2]) + uint32_t endIndex; // max<signed>(0, cellStart[key+1..3]) + + asm volatile("{\n\t" + " .reg .u32 start1, start2;\n\t" + " ld.global.s16 %1, [%2+6];\n\t" + " ld.global.s16 %0, [%2+0];\n\t" + " ld.global.s16 start1, [%2+2];\n\t" + " ld.global.s16 start2, [%2+4];\n\t" + " max.s32 %1, %1, 0;\n\t" + " min.u32 %0, %0, start1;\n\t" + " max.s32 %1, %1, start1;\n\t" + " min.u32 %0, %0, start2;\n\t" + " max.s32 %1, %1, start2;\n\t" + "}\n\t" + : "=r"(startIndex), "=r"(endIndex) + : POINTER_CONSTRAINT(cellStart + key)); + + // comparison must be unsigned to skip cells with negative startIndex + for(uint32_t j = startIndex; j < endIndex; ++j) + { + if(j != i) // avoid same particle + { + float dx = posX - current(j, 0); + float dy = posY - current(j, 1); + float dz = posZ - current(j, 2); + + float distSqr = dx * dx + dy * dy + dz * dz; + if(distSqr > cdistSq) + continue; + + if(useRestPositions) + { + const int32_t jndex = sortedKeys[j] & UINT16_MAX; + assert(jndex 
< gClothData.mNumParticles); + + // calculate distance in rest configuration + const float* restJt = gFrameData.mRestPositions + jndex * 4; + float rx = restX - restJt[0]; + float ry = restY - restJt[1]; + float rz = restZ - restJt[2]; + + if(rx * rx + ry * ry + rz * rz <= cdistSq) + continue; + } + + // premultiply ratio for weighted average + float ratio = fmaxf(0.0f, cdist * rsqrtf(FLT_EPSILON + distSqr) - 1.0f); + float scale = __fdividef(ratio * ratio, FLT_EPSILON + posW + current(j, 3)); + + deltaX += scale * dx; + deltaY += scale * dy; + deltaZ += scale * dz; + deltaW += ratio; + } + } + } + + const float stiffness = gFrameData.mSelfCollisionStiffness * posW; + float scale = __fdividef(stiffness, deltaW); + + // apply collision impulse + xPtr[index] += deltaX * scale; + yPtr[index] += deltaY * scale; + zPtr[index] += deltaZ * scale; + + assert(!isnan(xPtr[index] + yPtr[index] + zPtr[index])); + } + __syncthreads(); + + // copy temporary particle array back to shared mem + // (need to copy whole array) + for(int32_t i = threadIdx.x; i < numParticles; i += blockDim.x) + { + current(i, 0) = xPtr[i]; + current(i, 1) = yPtr[i]; + current(i, 2) = zPtr[i]; + current(i, 3) = wPtr[i]; + } + + // unmark occupied cells to empty again (faster than clearing all the cells) + for(int32_t i = threadIdx.x; i < numIndices; i += blockDim.x) + { + int32_t key = sortedKeys[i] >> 16; + cellStart[key] = 0xffff; + cellStart[key + 1] = 0xffff; + } + __syncthreads(); +} diff --git a/PhysX_3.4/Source/LowLevelCloth/src/windows/CuSolver.cpp b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuSolver.cpp new file mode 100644 index 00000000..68238664 --- /dev/null +++ b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuSolver.cpp @@ -0,0 +1,556 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. 
+// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 
+ +#include "foundation/PxProfiler.h" +#include "CuSolver.h" +#include "CuCloth.h" +#include "ClothImpl.h" +#include "CuFabric.h" +#include "CuFactory.h" +#include "CuSolverKernel.h" +#include "CuContextLock.h" +#include "CuCheckSuccess.h" +#include "IterationState.h" +#include "CudaKernelWrangler.h" +#include "PsUtilities.h" +#include "PsSort.h" +#include "PsFoundation.h" + +#if PX_NVTX +#include "nvToolsExt.h" +#endif + +//#define ENABLE_CUDA_PRINTF PX_DEBUG // warning: not thread safe +#define ENABLE_CUDA_PRINTF 0 + +#if ENABLE_CUDA_PRINTF +extern "C" cudaError_t cudaPrintfInit(CUmodule hmod, size_t bufferLen = 1048576); +extern "C" void cudaPrintfEnd(); +extern "C" cudaError_t cudaPrintfDisplay(CUmodule hmod, void* outputFP = NULL, bool showThreadID = false); +#endif + +using namespace physx; + +namespace +{ +//for KernelWrangler interface +const char* gKernelName = cloth::getKernelFunctionName(); +} + +namespace +{ +template <typename T> +struct CuDeviceAllocator +{ + CuDeviceAllocator(physx::PxCudaContextManager* ctx) : mManager(ctx->getMemoryManager()) + { + } + + T* allocate(size_t n) + { + return reinterpret_cast<T*>(mManager->alloc(physx::PxCudaBufferMemorySpace::T_GPU, n * sizeof(T))); + } + + void deallocate(T* ptr) + { + mManager->free(physx::PxCudaBufferMemorySpace::T_GPU, reinterpret_cast<physx::PxCudaBufferPtr>(ptr)); + } + + physx::PxCudaMemoryManager* mManager; +}; +} + +cloth::CuSolver::CuSolver(CuFactory& factory) +: CuContextLock(factory) +, mFactory(factory) +, mClothData(mFactory.mContextManager) +, mClothDataHostCopy(CuHostAllocator(mFactory.mContextManager, cudaHostAllocWriteCombined)) +, mClothDataDirty(false) +, mFrameData(getMappedAllocator<CuFrameData>(mFactory.mContextManager)) +, mIterationData(getMappedAllocator<CuIterationData>(mFactory.mContextManager)) +, mIterationDataBegin(0) +, mFrameDt(0.0f) +, mSharedMemorySize(0) +, mSharedMemoryLimit(0) +, mStartSimulationTask(&CuSolver::beginFrame, "cloth.CuSolver.startSimulation") +, 
mKernelSimulationTask(&CuSolver::executeKernel, "cloth.CuSolver.kernelSimulation") +, mEndSimulationTask(&CuSolver::endFrame, "cloth.CuSolver.endSimulation") +, mStream(0) +, mKernelModule(0) +, mKernelFunction(0) +, mKernelSharedMemorySize(0) +, mClothIndex(CuDeviceAllocator<uint32_t>(mFactory.mContextManager).allocate(1)) +, mInterCollisionDistance(0.0f) +, mInterCollisionStiffness(1.0f) +, mInterCollisionIterations(1) +, mInterCollisionScratchMem(NULL) +, mInterCollisionScratchMemSize(0) +, mKernelWrangler(getDispatcher(), physx::shdfnd::getFoundation().getErrorCallback(), &gKernelName, 1) +, mSimulateNvtxRangeId(0) +, mCudaError(mKernelWrangler.hadError()) +{ + if(mCudaError) + { + CuContextLock::release(); + return; + } + + mStartSimulationTask.mSolver = this; + mKernelSimulationTask.mSolver = this; + mEndSimulationTask.mSolver = this; + + if(mFactory.mContextManager->getUsingConcurrentStreams()) + checkSuccess(cuStreamCreate(&mStream, 0)); + + if(1) + { + mKernelModule = mKernelWrangler.getCuModule(0); + mKernelFunction = mKernelWrangler.getCuFunction(0); + } + else + { + // load from ptx instead of embedded SASS, for iterating without recompile + checkSuccess(cuModuleLoad(&mKernelModule, "CuSolverKernel.ptx")); + checkSuccess(cuModuleGetFunction(&mKernelFunction, mKernelModule, getKernelFunctionName())); + shdfnd::getFoundation().error(PX_INFO, "Cloth kernel code loaded from CuSolverKernel.ptx"); + } + + // get amount of statically allocated shared memory + checkSuccess(cuFuncGetAttribute(&mKernelSharedMemorySize, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, mKernelFunction)); + + // extract CuKernelData device pointer + size_t size = 0; + CUdeviceptr ptr = 0; + checkSuccess(cuModuleGetGlobal(&ptr, &size, mKernelModule, getKernelDataName())); + mKernelData = CuDevicePointer<CuKernelData>(reinterpret_cast<CuKernelData*>(ptr)); + + // initialize cloth index + checkSuccess(cuMemsetD32(mClothIndex.dev(), 0, 1)); + + CuContextLock::release(); +} + 
+cloth::CuSolver::~CuSolver() +{ + PX_ASSERT(mCloths.empty()); + + CuContextLock::acquire(); + + CuKernelData kernelData = {}; + *mKernelData = kernelData; + + CuDeviceAllocator<uint32_t>(mFactory.mContextManager).deallocate(mClothIndex.get()); + + if(mStream) + checkSuccess(cuStreamDestroy(mStream)); + + if(mInterCollisionScratchMem) + PX_FREE(mInterCollisionScratchMem); +} + +void cloth::CuSolver::updateKernelData() +{ + CuKernelData kernelData; + + kernelData.mClothIndex = mClothIndex.get(); + kernelData.mClothData = mClothData.begin().get(); + kernelData.mFrameData = getDevicePointer(mFrameData); + + *mKernelData = kernelData; +} + +physx::PxGpuDispatcher& cloth::CuSolver::getDispatcher() const +{ + return *mFactory.mContextManager->getGpuDispatcher(); +} + +namespace +{ +struct ClothSimCostGreater +{ + bool operator()(const cloth::CuCloth* left, const cloth::CuCloth* right) const + { + return left->mNumParticles * left->mSolverFrequency > right->mNumParticles * right->mSolverFrequency; + } +}; +} + +void cloth::CuSolver::addCloth(Cloth* cloth) +{ + CuCloth& cuCloth = static_cast<CuClothImpl&>(*cloth).mCloth; + + PX_ASSERT(mCloths.find(&cuCloth) == mCloths.end()); + + mCloths.pushBack(&cuCloth); + // trigger update of mClothData array + cuCloth.notifyChanged(); + + // sort cloth instances by size + shdfnd::sort(mCloths.begin(), mCloths.size(), ClothSimCostGreater()); + + CuContextLock contextLock(mFactory); + + // resize containers and update kernel data + mClothDataHostCopy.resize(mCloths.size()); + mClothData.resize(mCloths.size()); + mFrameData.resize(mCloths.size()); + updateKernelData(); +} + +void cloth::CuSolver::removeCloth(Cloth* cloth) +{ + CuCloth& cuCloth = static_cast<CuClothImpl&>(*cloth).mCloth; + + ClothVector::Iterator begin = mCloths.begin(), end = mCloths.end(); + ClothVector::Iterator it = mCloths.find(&cuCloth); + + if(it == end) + return; // not found + + uint32_t index = uint32_t(it - begin); + + mCloths.remove(index); + 
mClothDataHostCopy.remove(index); + mClothData.resize(mCloths.size()); + mClothDataDirty = true; +} + +physx::PxBaseTask& cloth::CuSolver::simulate(float dt, physx::PxBaseTask& continuation) +{ + mFrameDt = dt; + + if(mCloths.empty() || mCudaError) + { + continuation.addReference(); + return continuation; + } + + physx::PxGpuDispatcher& disp = getDispatcher(); + mEndSimulationTask.setContinuation(&continuation); + disp.addPostLaunchDependent(mEndSimulationTask); + mKernelSimulationTask.setContinuation(&disp.getPostLaunchTask()); + disp.getPostLaunchTask().removeReference(); + disp.addPreLaunchDependent(mKernelSimulationTask); + mStartSimulationTask.setContinuation(&disp.getPreLaunchTask()); + disp.getPreLaunchTask().removeReference(); + + mEndSimulationTask.removeReference(); + mKernelSimulationTask.removeReference(); + + return mStartSimulationTask; +} + +void cloth::CuSolver::beginFrame() +{ + CuContextLock contextLock(mFactory); + + PX_PROFILE_START_CROSSTHREAD("cloth.CuSolver.simulate", 0); + + CuIterationData* iterationDataBegin = mIterationData.empty() ? 0 : &mIterationData.front(); + + mFrameData.resize(0); + mIterationData.resize(0); + + // update cloth data + ClothVector::Iterator cIt, cEnd = mCloths.end(); + CuPinnedVector<CuClothData>::Type::Iterator dIt = mClothDataHostCopy.begin(); + for(cIt = mCloths.begin(); cIt != cEnd; ++cIt, ++dIt) + mClothDataDirty |= (*cIt)->updateClothData(*dIt); + + if(mClothDataDirty) + { + /* find optimal number of cloths per SM */ + + // at least 192 threads per block (e.g. 
CuCollision::buildAcceleration) + uint32_t numSMs = (uint32_t)mFactory.mContextManager->getMultiprocessorCount(); + uint32_t maxClothsPerSM = PxMin(mFactory.mMaxThreadsPerBlock / 192, (mCloths.size() + numSMs - 1) / numSMs); + + // tuning parameters: relative performance per numSharedPositions + float weights[3] = { 0.4f, 0.8f, 1.0f }; + + // try all possible number of cloths per SM and estimate performance + float maxWeightSum = 0.0f; + uint32_t numClothsPerSM = 0; + for(uint32_t i = 1; i <= maxClothsPerSM; ++i) + { + uint32_t sharedMemoryLimit = (mFactory.mContextManager->getSharedMemPerBlock() / i) - mKernelSharedMemorySize; + + float weightSum = 0.0f; + for(cIt = mCloths.begin(); cIt != cEnd; ++cIt) + { + uint32_t sharedMemorySize = (*cIt)->mSharedMemorySize; + uint32_t positionsSize = (*cIt)->mNumParticles * sizeof(PxVec4); + + if(sharedMemorySize > sharedMemoryLimit) + break; + + uint32_t numSharedPositions = PxMin(2u, (sharedMemoryLimit - sharedMemorySize) / positionsSize); + + weightSum += weights[numSharedPositions] * positionsSize; + } + // tuning parameter: inverse performance for running i cloths per SM + weightSum *= 2.0f + i; + + if(cIt == cEnd && weightSum > maxWeightSum) + { + maxWeightSum = weightSum; + numClothsPerSM = i; + } + } + PX_ASSERT(numClothsPerSM); + + // update block size + uint32_t numThreadsPerBlock = mFactory.mMaxThreadsPerBlock / numClothsPerSM & ~31; + + // Workaround for nvbug 1709919: theoretically, register usage should allow us to launch at least + // mFactory.mMaxThreadsPerBlock threads, because that value corresponds to __launch_bounds__(maxThreadsPerBlock). 
+ CUdevice device = 0;
+ checkSuccess(cuCtxGetDevice(&device));
+ int registersPerBlock = 0, kernelRegisterCount = 0;
+ checkSuccess(cuDeviceGetAttribute(&registersPerBlock, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, device));
+ checkSuccess(cuFuncGetAttribute(&kernelRegisterCount, CU_FUNC_ATTRIBUTE_NUM_REGS, mKernelFunction));
+ numThreadsPerBlock = PxMin(numThreadsPerBlock, uint32_t(registersPerBlock / kernelRegisterCount));
+ PX_ASSERT(numThreadsPerBlock >= 192);
+
+ if(mFactory.mNumThreadsPerBlock != numThreadsPerBlock)
+ {
+ checkSuccess(
+ cuFuncSetBlockShape(mKernelFunction, int(mFactory.mNumThreadsPerBlock = numThreadsPerBlock), 1, 1));
+ }
+
+ // remember num cloths per SM in terms of max shared memory per block
+ mSharedMemoryLimit =
+ (mFactory.mContextManager->getSharedMemPerBlock() / numClothsPerSM) - mKernelSharedMemorySize;
+ }
+
+ uint32_t maxSharedMemorySize = 0;
+ for(cIt = mCloths.begin(); cIt != cEnd; ++cIt)
+ {
+ CuCloth& cloth = **cIt;
+
+ uint32_t sharedMemorySize = cloth.mSharedMemorySize;
+ uint32_t positionsSize = cloth.mNumParticles * sizeof(PxVec4);
+
+ uint32_t numSharedPositions = PxMin(2u, (mSharedMemoryLimit - sharedMemorySize) / positionsSize);
+
+ maxSharedMemorySize = PxMax(maxSharedMemorySize, sharedMemorySize + numSharedPositions * positionsSize);
+
+ IterationStateFactory factory(cloth, mFrameDt);
+ IterationState<Simd4f> state = factory.create<Simd4f>(cloth);
+
+ mFrameData.pushBack(CuFrameData(cloth, numSharedPositions, state, mIterationDataBegin + mIterationData.size()));
+
+ while(state.mRemainingIterations)
+ {
+ mIterationData.pushBack(CuIterationData(state));
+ state.update();
+ }
+ }
+ mSharedMemorySize = maxSharedMemorySize;
+
+ // add dummy element because we read past the end
+ mIterationData.pushBack(CuIterationData());
+
+ if(&mIterationData.front() != iterationDataBegin)
+ {
+ // mIterationData grew, update pointers
+ iterationDataBegin = getDevicePointer(mIterationData);
+
+ ptrdiff_t diff = 
(char*)iterationDataBegin - (char*)mIterationDataBegin; + CuPinnedVector<CuFrameData>::Type::Iterator fIt = mFrameData.begin(), fEnd; + for(fEnd = mFrameData.end(); fIt != fEnd; ++fIt) + reinterpret_cast<const char*&>(fIt->mIterationData) += diff; + + mIterationDataBegin = iterationDataBegin; + } +} + +void cloth::CuSolver::executeKernel() +{ + CuContextLock contextLock(mFactory); + +#if ENABLE_CUDA_PRINTF + if(cudaError result = cudaPrintfInit(mKernelModule)) + { + shdfnd::getFoundation().error(PxErrorCode::eINTERNAL_ERROR, __FILE__, __LINE__, "cudaPrintfInit() returned %u.", + result); + } +#endif + + if(mClothDataDirty) + { + PX_ASSERT(mClothDataHostCopy.size() == mClothData.size()); + size_t numBytes = mClothData.size() * sizeof(CuClothData); + checkSuccess(cuMemcpyHtoDAsync(mClothData.begin().dev(), mClothDataHostCopy.begin(), numBytes, mStream)); + mClothDataDirty = false; + } + +#if 0 + static int frame = 0; + if(++frame == 100) + record(*this); +#endif + + // launch kernel + CUresult result = cuLaunchKernel(mKernelFunction, mCloths.size(), 1, 1, mFactory.mNumThreadsPerBlock, 1, 1, + mSharedMemorySize, mStream, 0, 0); + +#if ENABLE_CUDA_PRINTF + cudaPrintfDisplay(mKernelModule); + cudaPrintfEnd(); +#endif + +#if PX_DEBUG + // in debug builds check kernel result + checkSuccess(result); + checkSuccess(cuStreamSynchronize(mStream)); +#endif + + // mark the solver as being in an error state + // all cloth instances will be migrated to software + if(result != CUDA_SUCCESS) + mCudaError = true; +} + +void cloth::CuSolver::endFrame() +{ + CuPinnedVector<CuFrameData>::Type::ConstIterator fIt = mFrameData.begin(); + ClothVector::Iterator cIt, cEnd = mCloths.end(); + for(cIt = mCloths.begin(); cIt != cEnd; ++cIt, ++fIt) + { + CuCloth& cloth = **cIt; + + cloth.mHostParticlesDirty = false; + cloth.mDeviceParticlesDirty = false; + + cloth.mMotionConstraints.pop(); + cloth.mMotionConstraints.mHostCopy.resize(0); + + cloth.mSeparationConstraints.pop(); + 
cloth.mSeparationConstraints.mHostCopy.resize(0);
+
+ if(!cloth.mTargetCollisionSpheres.empty())
+ {
+ shdfnd::swap(cloth.mStartCollisionSpheres, cloth.mTargetCollisionSpheres);
+ cloth.mTargetCollisionSpheres.resize(0);
+ }
+
+ if(!cloth.mTargetCollisionPlanes.empty())
+ {
+ shdfnd::swap(cloth.mStartCollisionPlanes, cloth.mTargetCollisionPlanes);
+ cloth.mTargetCollisionPlanes.resize(0);
+ }
+
+ if(!cloth.mTargetCollisionTriangles.empty())
+ {
+ shdfnd::swap(cloth.mStartCollisionTriangles, cloth.mTargetCollisionTriangles);
+ cloth.mTargetCollisionTriangles.resize(0);
+ }
+
+ for(uint32_t i = 0; i < 3; ++i)
+ {
+ float upper = fIt->mParticleBounds[i * 2 + 0];
+ float negativeLower = fIt->mParticleBounds[i * 2 + 1];
+ cloth.mParticleBoundsCenter[i] = (upper - negativeLower) * 0.5f;
+ cloth.mParticleBoundsHalfExtent[i] = (upper + negativeLower) * 0.5f;
+ }
+
+ cloth.mSleepPassCounter = fIt->mSleepPassCounter;
+ cloth.mSleepTestCounter = fIt->mSleepTestCounter;
+ }
+
+ interCollision();
+
+ PX_PROFILE_STOP_CROSSTHREAD("cloth.CuSolver.simulate", 0);
+}
+
+void cloth::CuSolver::interCollision()
+{
+ if(!mInterCollisionIterations || mInterCollisionDistance == 0.0f)
+ return;
+
+ typedef SwInterCollision<Simd4f> SwInterCollision;
+
+ // rebuild cloth instance array
+ mInterCollisionInstances.resize(0);
+ for(uint32_t i = 0, n = mCloths.size(); i < n; ++i)
+ {
+ CuCloth& cloth = *mCloths[i];
+
+ float elasticity = 1.0f / mFrameData[i].mNumIterations;
+ PX_ASSERT(!cloth.mHostParticlesDirty);
+ PxVec4* particles = cloth.mParticlesHostCopy.begin();
+ uint32_t* indices = NULL, numIndices = cloth.mNumParticles;
+ if(!cloth.mSelfCollisionIndices.empty())
+ {
+ indices = cloth.mSelfCollisionIndicesHost.begin();
+ numIndices = uint32_t(cloth.mSelfCollisionIndices.size());
+ }
+
+ mInterCollisionInstances.pushBack(SwInterCollisionData(
+ particles, particles + cloth.mNumParticles, numIndices, indices, cloth.mTargetMotion,
+ cloth.mParticleBoundsCenter, 
cloth.mParticleBoundsHalfExtent, elasticity, cloth.mUserData)); + + cloth.mDeviceParticlesDirty = true; + } + + uint32_t requiredTempMemorySize = uint32_t( + SwInterCollision::estimateTemporaryMemory(&mInterCollisionInstances[0], mInterCollisionInstances.size())); + + // realloc temp memory if necessary + if(mInterCollisionScratchMemSize < requiredTempMemorySize) + { + if(mInterCollisionScratchMem) + PX_FREE(mInterCollisionScratchMem); + + mInterCollisionScratchMem = PX_ALLOC(requiredTempMemorySize, "cloth::SwSolver::mInterCollisionScratchMem"); + mInterCollisionScratchMemSize = requiredTempMemorySize; + } + + SwKernelAllocator allocator(mInterCollisionScratchMem, mInterCollisionScratchMemSize); + + // run inter-collision + SwInterCollision(mInterCollisionInstances.begin(), mInterCollisionInstances.size(), mInterCollisionDistance, + mInterCollisionStiffness, mInterCollisionIterations, mInterCollisionFilter, allocator)(); +} + +cloth::CuSolver::ClothSolverTask::ClothSolverTask(FunctionPtr functionPtr, const char* name) +: mSolver(0), mFunctionPtr(functionPtr), mName(name) +{ +} + +void cloth::CuSolver::ClothSolverTask::runInternal() +{ + (mSolver->*mFunctionPtr)(); +} + +const char* cloth::CuSolver::ClothSolverTask::getName() const +{ + return mName; +} diff --git a/PhysX_3.4/Source/LowLevelCloth/src/windows/CuSolver.h b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuSolver.h new file mode 100644 index 00000000..ff98d975 --- /dev/null +++ b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuSolver.h @@ -0,0 +1,180 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. 
Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 
#pragma once

#include "Solver.h"
#include "CuClothData.h"
#include "CuPinnedAllocator.h"
#include "CuContextLock.h"
#include "CuDeviceVector.h"
#include "CudaKernelWrangler.h"
#include "CmTask.h"

#include "SwInterCollision.h"

namespace physx
{

namespace cloth
{

class CuCloth;
class CuFabric;
struct PhaseConfig;
struct CuKernelData;

// CUDA implementation of the cloth Solver interface. Owns the per-frame
// simulation tasks, CUDA module/kernel/stream handles, and the host- and
// device-side copies of the per-cloth data consumed by the solver kernel.
class CuSolver : public UserAllocated, private CuContextLock, public Solver
{
#if PX_VC
#pragma warning(push)
#pragma warning(disable : 4371) // layout of class may have changed from a previous version of the compiler due to
                                // better packing of member
#endif
	// Cm::Task adapter that invokes a bound CuSolver member function.
	struct ClothSolverTask : public Cm::Task
	{
		typedef void (CuSolver::*FunctionPtr)();

		ClothSolverTask(FunctionPtr, const char*);
		virtual void runInternal();
		virtual const char* getName() const;

		CuSolver* mSolver;        // target solver; set by the solver after construction
		FunctionPtr mFunctionPtr; // member function executed by runInternal()
		const char* mName;        // static task name returned by getName()
	};
#if PX_VC
#pragma warning(pop)
#endif

	PX_NOCOPY(CuSolver)
  public:
	CuSolver(CuFactory&);
	~CuSolver();

	virtual void addCloth(Cloth*);
	virtual void removeCloth(Cloth*);

	virtual physx::PxBaseTask& simulate(float dt, physx::PxBaseTask&);

	// true if a CUDA error was recorded during simulation
	virtual bool hasError() const
	{
		return mCudaError;
	}

	virtual void setInterCollisionDistance(float distance)
	{
		mInterCollisionDistance = distance;
	}
	virtual float getInterCollisionDistance() const
	{
		return mInterCollisionDistance;
	}
	virtual void setInterCollisionStiffness(float stiffness)
	{
		mInterCollisionStiffness = stiffness;
	}
	virtual float getInterCollisionStiffness() const
	{
		return mInterCollisionStiffness;
	}
	virtual void setInterCollisionNbIterations(uint32_t nbIterations)
	{
		mInterCollisionIterations = nbIterations;
	}
	virtual uint32_t getInterCollisionNbIterations() const
	{
		return mInterCollisionIterations;
	}
	virtual void setInterCollisionFilter(InterCollisionFilter filter)
	{
		mInterCollisionFilter = filter;
	}

  private:
	void updateKernelData(); // context needs to be acquired

	// simulate helper functions
	void beginFrame();
	void executeKernel();
	void endFrame();

	void interCollision();

	physx::PxGpuDispatcher& getDispatcher() const;

  private:
	CuFactory& mFactory;

	typedef Vector<CuCloth*>::Type ClothVector;
	ClothVector mCloths; // cloths currently added to this solver

	CuDeviceVector<CuClothData> mClothData;              // device-side per-cloth data
	CuPinnedVector<CuClothData>::Type mClothDataHostCopy; // pinned host mirror
	bool mClothDataDirty;                                // host copy needs upload

	CuPinnedVector<CuFrameData>::Type mFrameData; // per-cloth, per-frame data

	CuPinnedVector<CuIterationData>::Type mIterationData;
	CuIterationData* mIterationDataBegin; // corresponding device ptr

	float mFrameDt; // timestep of the frame currently being simulated

	uint32_t mSharedMemorySize;
	uint32_t mSharedMemoryLimit;

	// tasks run in sequence: begin -> kernel -> end
	ClothSolverTask mStartSimulationTask;
	ClothSolverTask mKernelSimulationTask;
	ClothSolverTask mEndSimulationTask;

	// CUDA driver API handles for the solver kernel
	CUstream mStream;
	CUmodule mKernelModule;
	CUfunction mKernelFunction;
	int mKernelSharedMemorySize;
	CuDevicePointer<CuKernelData> mKernelData;
	CuDevicePointer<uint32_t> mClothIndex;

	// inter-cloth collision configuration and scratch state
	float mInterCollisionDistance;
	float mInterCollisionStiffness;
	uint32_t mInterCollisionIterations;
	InterCollisionFilter mInterCollisionFilter;
	void* mInterCollisionScratchMem;       // grow-only temp buffer
	uint32_t mInterCollisionScratchMemSize;
	shdfnd::Array<SwInterCollisionData> mInterCollisionInstances;

	physx::KernelWrangler mKernelWrangler;

	uint64_t mSimulateNvtxRangeId; // NVTX range id for profiling simulate()

	bool mCudaError; // sticky error flag reported via hasError()

	friend void record(const CuSolver&);
};
}
}
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/windows/CuSolverKernel.h b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuSolverKernel.h
new file mode 100644
index 00000000..d6ca350f
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuSolverKernel.h
@@ -0,0 +1,57 @@
// This code contains NVIDIA Confidential Information and is disclosed to you
// under a form of NVIDIA software license agreement provided separately to you.
+// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 
#pragma once

#include "Types.h"

namespace physx
{
namespace cloth
{
struct CuClothData;
struct CuFrameData;

// data of all cloth instances, one block per instance
// (layout shared between host setup code and the CUDA solver kernel)
struct CuKernelData
{
	// pointer to atomic variable
	uint32_t* mClothIndex;

	// array of cloths (length determined by grid dim)
	const CuClothData* mClothData;

	// frame data per cloth
	CuFrameData* mFrameData;
};

// name of the kernel-data symbol looked up in the compiled CUDA module
const char* getKernelDataName();
// name of the solver kernel function looked up in the compiled CUDA module
const char* getKernelFunctionName();
}
}