diff options
| author | git perforce import user <a@b> | 2016-10-25 12:29:14 -0600 |
|---|---|---|
| committer | Sheikh Dawood Abdul Ajees <Sheikh Dawood Abdul Ajees> | 2016-10-25 18:56:37 -0500 |
| commit | 3dfe2108cfab31ba3ee5527e217d0d8e99a51162 (patch) | |
| tree | fa6485c169e50d7415a651bf838f5bcd0fd3bfbd /APEX_1.4/module/clothing/embedded/LowLevelCloth | |
| download | physx-3.4-3dfe2108cfab31ba3ee5527e217d0d8e99a51162.tar.xz physx-3.4-3dfe2108cfab31ba3ee5527e217d0d8e99a51162.zip | |
Initial commit:
PhysX 3.4.0 Update @ 21294896
APEX 1.4.0 Update @ 21275617
[CL 21300167]
Diffstat (limited to 'APEX_1.4/module/clothing/embedded/LowLevelCloth')
63 files changed, 14844 insertions, 0 deletions
diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/include/Cloth.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/include/Cloth.h new file mode 100644 index 00000000..6f24e51f --- /dev/null +++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/include/Cloth.h @@ -0,0 +1,309 @@ +/* + * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. + */ + +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#pragma once + +#include "Range.h" +#include "PhaseConfig.h" + +struct ID3D11Buffer; + +namespace nvidia +{ +#if APEX_UE4 + namespace Cm + { + class Task; + } +#endif + +namespace cloth +{ + +class Factory; +class Fabric; +class Cloth; + +template <typename T> +struct MappedRange : public Range<T> +{ + MappedRange(T* first, T* last, const Cloth& cloth, void (Cloth::*lock)() const, void (Cloth::*unlock)() const) + : Range<T>(first, last), mCloth(cloth), mLock(lock), mUnlock(unlock) + { + } + + MappedRange(const MappedRange& other) + : Range<T>(other), mCloth(other.mCloth), mLock(other.mLock), mUnlock(other.mUnlock) + { + (mCloth.*mLock)(); + } + + ~MappedRange() + { + (mCloth.*mUnlock)(); + } + + private: + MappedRange& operator=(const MappedRange&); + + const Cloth& mCloth; + void (Cloth::*mLock)() const; + void (Cloth::*mUnlock)() const; +}; + +struct GpuParticles +{ + PxVec4* mCurrent; + PxVec4* mPrevious; + ID3D11Buffer* mBuffer; +}; + +// abstract cloth instance +class Cloth +{ + Cloth& operator=(const Cloth&); + + protected: + Cloth() + { + } + Cloth(const Cloth&) + { + } + + public: + virtual 
~Cloth() + { + } + + // same as factory.clone(*this) + virtual Cloth* clone(Factory& factory) const = 0; + + virtual Fabric& getFabric() const = 0; + virtual Factory& getFactory() const = 0; + + /* particle properties */ + + virtual uint32_t getNumParticles() const = 0; + virtual void lockParticles() const = 0; + virtual void unlockParticles() const = 0; + // return particle data for current and previous frame + // setting current invMass to zero locks particle. + virtual MappedRange<PxVec4> getCurrentParticles() = 0; + virtual MappedRange<const PxVec4> getCurrentParticles() const = 0; + virtual MappedRange<PxVec4> getPreviousParticles() = 0; + virtual MappedRange<const PxVec4> getPreviousParticles() const = 0; + virtual GpuParticles getGpuParticles() = 0; + + // set position of cloth after next call to simulate() + virtual void setTranslation(const PxVec3& trans) = 0; + virtual void setRotation(const PxQuat& rot) = 0; + + // get current position of cloth + virtual const PxVec3& getTranslation() const = 0; + virtual const PxQuat& getRotation() const = 0; + + // zero inertia derived from method calls above (once) + virtual void clearInertia() = 0; + + // adjust the position of the cloth without affecting the dynamics (to call after a world origin shift, for example) + virtual void teleport(const PxVec3& delta) = 0; + + /* solver parameters */ + + // return delta time used for previous iteration + virtual float getPreviousIterationDt() const = 0; + + // gravity in global coordinates + virtual void setGravity(const PxVec3&) = 0; + virtual PxVec3 getGravity() const = 0; + + // damping of local particle velocity (1/stiffnessFrequency) + // 0 (default): velocity is unaffected, 1: velocity is zero'ed + virtual void setDamping(const PxVec3&) = 0; + virtual PxVec3 getDamping() const = 0; + + // portion of local frame velocity applied to particles + // 0 (default): particles are unaffected + // same as damping: damp global particle velocity + virtual void setLinearDrag(const 
PxVec3&) = 0; + virtual PxVec3 getLinearDrag() const = 0; + virtual void setAngularDrag(const PxVec3&) = 0; + virtual PxVec3 getAngularDrag() const = 0; + + // portion of local frame accelerations applied to particles + // 0: particles are unaffected, 1 (default): physically correct + virtual void setLinearInertia(const PxVec3&) = 0; + virtual PxVec3 getLinearInertia() const = 0; + virtual void setAngularInertia(const PxVec3&) = 0; + virtual PxVec3 getAngularInertia() const = 0; + virtual void setCentrifugalInertia(const PxVec3&) = 0; + virtual PxVec3 getCentrifugalInertia() const = 0; + + // target solver iterations per second + virtual void setSolverFrequency(float) = 0; + virtual float getSolverFrequency() const = 0; + + // damp, drag, stiffness exponent per second + virtual void setStiffnessFrequency(float) = 0; + virtual float getStiffnessFrequency() const = 0; + + // filter width for averaging dt^2 factor of gravity and + // external acceleration, in numbers of iterations (default=30). 
+ virtual void setAcceleationFilterWidth(uint32_t) = 0; + virtual uint32_t getAccelerationFilterWidth() const = 0; + + // setup edge constraint solver iteration + virtual void setPhaseConfig(Range<const PhaseConfig> configs) = 0; + + /* collision parameters */ + + virtual void setSpheres(Range<const PxVec4>, uint32_t first, uint32_t last) = 0; + virtual uint32_t getNumSpheres() const = 0; + + virtual void setCapsules(Range<const uint32_t>, uint32_t first, uint32_t last) = 0; + virtual uint32_t getNumCapsules() const = 0; + + virtual void setPlanes(Range<const PxVec4>, uint32_t first, uint32_t last) = 0; + virtual uint32_t getNumPlanes() const = 0; + + virtual void setConvexes(Range<const uint32_t>, uint32_t first, uint32_t last) = 0; + virtual uint32_t getNumConvexes() const = 0; + + virtual void setTriangles(Range<const PxVec3>, uint32_t first, uint32_t last) = 0; + virtual void setTriangles(Range<const PxVec3>, Range<const PxVec3>, uint32_t first) = 0; + virtual uint32_t getNumTriangles() const = 0; + + // check if we use ccd or not + virtual bool isContinuousCollisionEnabled() const = 0; + // set if we use ccd or not (disabled by default) + virtual void enableContinuousCollision(bool) = 0; + + // controls how quickly mass is increased during collisions + virtual float getCollisionMassScale() const = 0; + virtual void setCollisionMassScale(float) = 0; + + // friction + virtual void setFriction(float) = 0; + virtual float getFriction() const = 0; + + // set virtual particles for collision handling. + // each indices element consists of 3 particle + // indices and an index into the lerp weights array. 
+ virtual void setVirtualParticles(Range<const uint32_t[4]> indices, Range<const PxVec3> weights) = 0; + virtual uint32_t getNumVirtualParticles() const = 0; + virtual uint32_t getNumVirtualParticleWeights() const = 0; + + /* tether constraint parameters */ + + virtual void setTetherConstraintScale(float scale) = 0; + virtual float getTetherConstraintScale() const = 0; + virtual void setTetherConstraintStiffness(float stiffness) = 0; + virtual float getTetherConstraintStiffness() const = 0; + + /* motion constraint parameters */ + + // return reference to motion constraints (position, radius) + // The entire range must be written after calling this function. + virtual Range<PxVec4> getMotionConstraints() = 0; + virtual void clearMotionConstraints() = 0; + virtual uint32_t getNumMotionConstraints() const = 0; + virtual void setMotionConstraintScaleBias(float scale, float bias) = 0; + virtual float getMotionConstraintScale() const = 0; + virtual float getMotionConstraintBias() const = 0; + virtual void setMotionConstraintStiffness(float stiffness) = 0; + virtual float getMotionConstraintStiffness() const = 0; + + /* separation constraint parameters */ + + // return reference to separation constraints (position, radius) + // The entire range must be written after calling this function. + virtual Range<PxVec4> getSeparationConstraints() = 0; + virtual void clearSeparationConstraints() = 0; + virtual uint32_t getNumSeparationConstraints() const = 0; + + /* clear interpolation */ + + // assign current to previous positions for + // collision spheres, motion, and separation constraints + virtual void clearInterpolation() = 0; + + /* particle acceleration parameters */ + + // return reference to particle accelerations (in local coordinates) + // The entire range must be written after calling this function. 
+ virtual Range<PxVec4> getParticleAccelerations() = 0; + virtual void clearParticleAccelerations() = 0; + virtual uint32_t getNumParticleAccelerations() const = 0; + + /* self collision */ + + virtual void setSelfCollisionDistance(float distance) = 0; + virtual float getSelfCollisionDistance() const = 0; + virtual void setSelfCollisionStiffness(float stiffness) = 0; + virtual float getSelfCollisionStiffness() const = 0; + + virtual void setSelfCollisionIndices(Range<const uint32_t>) = 0; + virtual uint32_t getNumSelfCollisionIndices() const = 0; + + /* rest positions */ + + // set rest particle positions used during self-collision + virtual void setRestPositions(Range<const PxVec4>) = 0; + virtual uint32_t getNumRestPositions() const = 0; + + /* bounding box */ + + // current particle position bounds in local space + virtual const PxVec3& getBoundingBoxCenter() const = 0; + virtual const PxVec3& getBoundingBoxScale() const = 0; + + /* sleeping (disabled by default) */ + + // max particle velocity (per axis) to pass sleep test + virtual void setSleepThreshold(float) = 0; + virtual float getSleepThreshold() const = 0; + // test sleep condition every nth millisecond + virtual void setSleepTestInterval(uint32_t) = 0; + virtual uint32_t getSleepTestInterval() const = 0; + // put cloth to sleep when n consecutive sleep tests pass + virtual void setSleepAfterCount(uint32_t) = 0; + virtual uint32_t getSleepAfterCount() const = 0; + virtual uint32_t getSleepPassCount() const = 0; + virtual bool isAsleep() const = 0; + virtual void putToSleep() = 0; + virtual void wakeUp() = 0; + + virtual void setHalfPrecisionOption(bool isAllowed) = 0; + virtual bool getHalfPrecisionOption() const = 0; + +#if APEX_UE4 + virtual void simulate(float dt) = 0; +#endif + + virtual void setUserData(void*) = 0; + virtual void* getUserData() const = 0; +}; + +// wrappers to prevent non-const overload from marking particles dirty +inline MappedRange<const PxVec4> readCurrentParticles(const Cloth& 
cloth) +{ + return cloth.getCurrentParticles(); +} +inline MappedRange<const PxVec4> readPreviousParticles(const Cloth& cloth) +{ + return cloth.getPreviousParticles(); +} + +} // namespace cloth +} // namespace nvidia diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/include/Fabric.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/include/Fabric.h new file mode 100644 index 00000000..f271b397 --- /dev/null +++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/include/Fabric.h @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. + */ + +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 
+ +#pragma once + +#include "Types.h" +#include "PxAssert.h" +#include "Range.h" + +namespace nvidia +{ +namespace cloth +{ + +class Factory; + +// abstract cloth constraints and triangle indices +class Fabric +{ + protected: + Fabric(const Fabric&); + Fabric& operator=(const Fabric&); + + protected: + Fabric() : mRefCount(0) + { + } + + public: + virtual ~Fabric() + { + PX_ASSERT(!mRefCount); + } + + virtual Factory& getFactory() const = 0; + + virtual uint32_t getNumPhases() const = 0; + virtual uint32_t getNumRestvalues() const = 0; + + virtual uint32_t getNumSets() const = 0; + virtual uint32_t getNumIndices() const = 0; + + virtual uint32_t getNumParticles() const = 0; + + virtual uint32_t getNumTethers() const = 0; + + virtual void scaleRestvalues(float) = 0; + virtual void scaleTetherLengths(float) = 0; + + uint16_t getRefCount() const + { + return mRefCount; + } + void incRefCount() + { + ++mRefCount; + PX_ASSERT(mRefCount > 0); + } + void decRefCount() + { + PX_ASSERT(mRefCount > 0); + --mRefCount; + } + + protected: + uint16_t mRefCount; +}; + +} // namespace cloth +} // namespace nvidia diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/include/Factory.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/include/Factory.h new file mode 100644 index 00000000..651b3b0c --- /dev/null +++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/include/Factory.h @@ -0,0 +1,176 @@ +/* + * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. + */ + +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. 
All rights reserved. + +#pragma once + +#include "Types.h" +#include "Range.h" + +typedef struct CUstream_st* CUstream; + +namespace physx +{ + namespace profile + { + class PxProfileZone; + } + class PxTaskManager; +} + +namespace nvidia +{ +namespace cloth +{ + +class Fabric; +class Cloth; +class Solver; +class Character; + +/// abstract factory to create context-specific simulation components +/// such as cloth, solver, collision, etc. +class Factory +{ + public: + enum Platform + { + CPU, + CUDA, + DirectCompute + }; + + protected: + Factory(Platform platform) : mPlatform(platform) + { + } + Factory(const Factory&); + Factory& operator=(const Factory&); + + public: + static Factory* createFactory(Platform, void* = 0); + + virtual ~Factory() + { + } + + Platform getPlatform() const + { + return mPlatform; + } + + /** + Create fabric data used to setup cloth object. + @param numParticles number of particles, must be larger than any particle index + @param phases map from phase to set index + @param sets inclusive prefix sum of restvalue count per set + @param restvalues array of constraint rest values + @param indices array of particle index pair per constraint + */ + virtual Fabric* createFabric(uint32_t numParticles, Range<const uint32_t> phases, Range<const uint32_t> sets, + Range<const float> restvalues, Range<const uint32_t> indices, + Range<const uint32_t> anchors, Range<const float> tetherLengths) = 0; + + /** + Create cloth object. + @param particles initial particle positions. + @param fabric edge distance constraint structure + */ + virtual Cloth* createCloth(Range<const PxVec4> particles, Fabric& fabric) = 0; + + /** + Create cloth solver object. + @param profiler performance event receiver. + @param taskMgr PxTaskManager used for simulation. 
+ */ + virtual Solver* createSolver(profile::PxProfileZone* profiler, PxTaskManager* taskMgr) = 0; + + /** + Create a copy of a cloth instance + @param cloth the instance to be cloned, need not match the factory type + */ + virtual Cloth* clone(const Cloth& cloth) = 0; + + /** + Extract original data from a fabric object + @param fabric to extract from, must match factory type + @param phases pre-allocated memory range to write phases + @param sets pre-allocated memory range to write sets + @param restvalues pre-allocated memory range to write restvalues + @param indices pre-allocated memory range to write indices + */ + virtual void extractFabricData(const Fabric& fabric, Range<uint32_t> phases, Range<uint32_t> sets, + Range<float> restvalues, Range<uint32_t> indices, Range<uint32_t> anchors, + Range<float> tetherLengths) const = 0; + + /** + Extract current collision spheres and capsules from a cloth object + @param cloth the instance to extract from, must match factory type + @param spheres pre-allocated memory range to write spheres + @param capsules pre-allocated memory range to write capsules + @param planes pre-allocated memory range to write planes + @param convexes pre-allocated memory range to write convexes + @param triangles pre-allocated memory range to write triangles + */ + virtual void extractCollisionData(const Cloth& cloth, Range<PxVec4> spheres, Range<uint32_t> capsules, + Range<PxVec4> planes, Range<uint32_t> convexes, Range<PxVec3> triangles) const = 0; + + /** + Extract current motion constraints from a cloth object + @param cloth the instance to extract from, must match factory type + @param destConstraints pre-allocated memory range to write constraints + */ + virtual void extractMotionConstraints(const Cloth& cloth, Range<PxVec4> destConstraints) const = 0; + + /** + Extract current separation constraints from a cloth object + @param cloth the instance to extract from, must match factory type + @param destConstraints pre-allocated memory 
range to write constraints + */ + virtual void extractSeparationConstraints(const Cloth& cloth, Range<PxVec4> destConstraints) const = 0; + + /** + Extract current particle accelerations from a cloth object + @param cloth the instance to extract from, must match factory type + @param destAccelerations pre-allocated memory range to write accelerations + */ + virtual void extractParticleAccelerations(const Cloth& cloth, Range<PxVec4> destAccelerations) const = 0; + + /** + Extract virtual particles from a cloth object + @param cloth the instance to extract from, must match factory type + @param destIndices pre-allocated memory range to write indices + @param destWeights pre-allocated memory range to write weights + */ + virtual void extractVirtualParticles(const Cloth& cloth, Range<uint32_t[4]> destIndices, + Range<PxVec3> destWeights) const = 0; + + /** + Extract self collision indices from cloth object. + @param cloth the instance to extract from, must match factory type + @param destIndices pre-allocated memory range to write indices + */ + virtual void extractSelfCollisionIndices(const Cloth& cloth, Range<uint32_t> destIndices) const = 0; + + /** + Extract particle rest positions from cloth object. + @param cloth the instance to extract from, must match factory type + @param destRestPositions pre-allocated memory range to write rest positions + */ + virtual void extractRestPositions(const Cloth& cloth, Range<PxVec4> destRestPositions) const = 0; + + protected: + const Platform mPlatform; +}; + +} // namespace cloth +} // namespace nvidia diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/include/PhaseConfig.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/include/PhaseConfig.h new file mode 100644 index 00000000..4edf4802 --- /dev/null +++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/include/PhaseConfig.h @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved. 
+ * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. + */ + +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#pragma once + +#include "Types.h" + +namespace nvidia +{ +namespace cloth +{ + +struct PhaseConfig +{ + PhaseConfig(uint16_t index = uint16_t(-1)); + + uint16_t mPhaseIndex; + uint16_t mPadding; + + // target convergence rate per iteration (1/solverFrequency) + float mStiffness; + + float mStiffnessMultiplier; + + float mCompressionLimit; + float mStretchLimit; +}; + +} // namespace cloth +} // namespace nvidia diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/include/Range.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/include/Range.h new file mode 100644 index 00000000..7d48e195 --- /dev/null +++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/include/Range.h @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. + */ + +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 
+ +#pragma once + +#include "PxAssert.h" +#include "Types.h" + +namespace nvidia +{ +namespace cloth +{ + +template <class T> +struct Range +{ + Range(); + + Range(T* first, T* last); + + template <typename S> + Range(const Range<S>& other); + + uint32_t size() const; + bool empty() const; + + void popFront(); + void popBack(); + + T* begin() const; + T* end() const; + + T& front() const; + T& back() const; + + T& operator[](uint32_t i) const; + + private: + T* mFirst; + T* mLast; // past last element +}; + +template <typename T> +Range<T>::Range() +: mFirst(0), mLast(0) +{ +} + +template <typename T> +Range<T>::Range(T* first, T* last) +: mFirst(first), mLast(last) +{ +} + +template <typename T> +template <typename S> +Range<T>::Range(const Range<S>& other) +: mFirst(other.begin()), mLast(other.end()) +{ +} + +template <typename T> +uint32_t Range<T>::size() const +{ + return uint32_t(mLast - mFirst); +} + +template <typename T> +bool Range<T>::empty() const +{ + return mFirst >= mLast; +} + +template <typename T> +void Range<T>::popFront() +{ + PX_ASSERT(mFirst < mLast); + ++mFirst; +} + +template <typename T> +void Range<T>::popBack() +{ + PX_ASSERT(mFirst < mLast); + --mLast; +} + +template <typename T> +T* Range<T>::begin() const +{ + return mFirst; +} + +template <typename T> +T* Range<T>::end() const +{ + return mLast; +} + +template <typename T> +T& Range<T>::front() const +{ + PX_ASSERT(mFirst < mLast); + return *mFirst; +} + +template <typename T> +T& Range<T>::back() const +{ + PX_ASSERT(mFirst < mLast); + return mLast[-1]; +} + +template <typename T> +T& Range<T>::operator[](uint32_t i) const +{ + PX_ASSERT(mFirst + i < mLast); + return mFirst[i]; +} + +} // namespace cloth +} // namespace nvidia diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/include/Solver.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/include/Solver.h new file mode 100644 index 00000000..585aab63 --- /dev/null +++ 
b/APEX_1.4/module/clothing/embedded/LowLevelCloth/include/Solver.h @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. + */ + +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#pragma once + +#include "Types.h" + +namespace physx +{ + class PxBaseTask; +} + +namespace nvidia +{ +namespace cloth +{ + +class Cloth; + +// called during inter-collision, user0 and user1 are the user data from each cloth +typedef bool (*InterCollisionFilter)(void* user0, void* user1); + +/// base class for solvers +class Solver +{ + protected: + Solver(const Solver&); + Solver& operator=(const Solver&); + + protected: + Solver() + { + } + + public: + virtual ~Solver() + { + } + + /// add cloth object, returns true if successful + virtual void addCloth(Cloth*) = 0; + + /// remove cloth object + virtual void removeCloth(Cloth*) = 0; + + /// simulate one time step + virtual PxBaseTask& simulate(float dt, PxBaseTask&) = 0; + + // inter-collision parameters + virtual void setInterCollisionDistance(float distance) = 0; + virtual float getInterCollisionDistance() const = 0; + virtual void setInterCollisionStiffness(float stiffness) = 0; + virtual float getInterCollisionStiffness() const = 0; + virtual void setInterCollisionNbIterations(uint32_t nbIterations) = 0; + virtual uint32_t getInterCollisionNbIterations() const = 0; + virtual void setInterCollisionFilter(InterCollisionFilter filter) = 0; + +// virtual uint32_t getNumSharedPositions( const Cloth* ) const = 0; + + /// returns true if an 
unrecoverable error has occurred + virtual bool hasError() const = 0; +}; + +} // namespace cloth +} // namespace nvidia diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/include/Types.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/include/Types.h new file mode 100644 index 00000000..e80a3009 --- /dev/null +++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/include/Types.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. + */ + +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#pragma once + +#ifndef __CUDACC__ +#include "ApexUsingNamespace.h" +#include "Px.h" +#include "PxVec3.h" +#include "PxVec4.h" +#include "PxQuat.h" +#endif + +// Factory.cpp gets included in both PhysXGPU and LowLevelCloth projects +// CuFactory can only be created in PhysXGPU project +// DxFactory can only be created in PhysXGPU (win) or LowLevelCloth (xbox1) +#if defined(PX_PHYSX_GPU_EXPORTS) || PX_XBOXONE +#define ENABLE_CUFACTORY ((PX_WINDOWS_FAMILY && (PX_WINRT==0)) || PX_LINUX) + +//TEMPORARY DISABLE DXFACTORY +#define ENABLE_DXFACTORY 0 +//#define ENABLE_DXFACTORY ((PX_WINDOWS_FAMILY && (PX_WINRT==0)) || PX_XBOXONE) +#else +#define ENABLE_CUFACTORY 0 +#define ENABLE_DXFACTORY 0 +#endif + +#ifndef _MSC_VER +#include <stdint.h> +#else +// typedef standard integer types +typedef unsigned __int8 uint8_t; +typedef unsigned __int16 uint16_t; +typedef unsigned __int32 uint32_t; +typedef unsigned __int64 uint64_t; +typedef __int16 int16_t; +typedef __int32 int32_t; +#if _MSC_VER < 
1600 +#define nullptr NULL +#endif +#endif diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/Allocator.cpp b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/Allocator.cpp new file mode 100644 index 00000000..c6c297ca --- /dev/null +++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/Allocator.cpp @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. + */ + +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#include "Allocator.h" +#include "PsAllocator.h" + +namespace nvidia +{ + +void* cloth::allocate(size_t n) +{ + return n ? nvidia::getAllocator().allocate(n, "", __FILE__, __LINE__) : 0; +} + +void cloth::deallocate(void* ptr) +{ + if(ptr) + nvidia::getAllocator().deallocate(ptr); +} +} diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/Allocator.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/Allocator.h new file mode 100644 index 00000000..c0488b43 --- /dev/null +++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/Allocator.h @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. 
+ */ + +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#pragma once + +#include "Types.h" +#include "PsArray.h" +#include "PsAllocator.h" +#include "PsAlignedMalloc.h" + +namespace nvidia +{ +namespace cloth +{ + +void* allocate(size_t); +void deallocate(void*); + +/* templated typedefs for convenience */ + +template <typename T> +struct Vector +{ + typedef nvidia::Array<T, nvidia::NonTrackingAllocator> Type; +}; + +template <typename T, size_t alignment> +struct AlignedVector +{ + typedef nvidia::Array<T, nvidia::AlignedAllocator<alignment> > Type; +}; + +struct UserAllocated +{ + virtual ~UserAllocated() + { + } + static void* operator new(size_t n) + { + return allocate(n); + } + static void operator delete(void* ptr) + { + deallocate(ptr); + } +}; + +} // namespace cloth +} diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/Array.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/Array.h new file mode 100644 index 00000000..e9da59aa --- /dev/null +++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/Array.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. + */ + +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 
+ +#pragma once + +#include "PxVec4.h" +#include "PxQuat.h" +#include "PxVec3.h" +#include "ApexUsingNamespace.h" + +namespace nvidia +{ + +namespace cloth +{ + +inline float (&array(PxVec3& v))[3] +{ + return reinterpret_cast<float(&)[3]>(v); +} +inline const float (&array(const PxVec3& v))[3] +{ + return reinterpret_cast<const float(&)[3]>(v); +} +inline float (&array(PxVec4& v))[4] +{ + return reinterpret_cast<float(&)[4]>(v); +} +inline const float (&array(const PxVec4& v))[4] +{ + return reinterpret_cast<const float(&)[4]>(v); +} +inline float (&array(PxQuat& q))[4] +{ + return reinterpret_cast<float(&)[4]>(q); +} +inline const float (&array(const PxQuat& q))[4] +{ + return reinterpret_cast<const float(&)[4]>(q); +} + +} // namespace cloth + +} // namespace nvidia diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/BoundingBox.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/BoundingBox.h new file mode 100644 index 00000000..339f6f12 --- /dev/null +++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/BoundingBox.h @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. + */ + +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 
+ +#pragma once + +#include "Simd4f.h" +#include <float.h> + +namespace nvidia +{ + +namespace cloth +{ + +template <typename Simd4f> +struct BoundingBox +{ + Simd4f mLower; + Simd4f mUpper; +}; + +template <typename Simd4f> +inline BoundingBox<Simd4f> loadBounds(const float* ptr) +{ + BoundingBox<Simd4f> result; + result.mLower = load(ptr); + result.mUpper = load(ptr + 3); + return result; +} + +template <typename Simd4f> +inline BoundingBox<Simd4f> emptyBounds() +{ + BoundingBox<Simd4f> result; + + result.mLower = simd4f(FLT_MAX); + result.mUpper = -result.mLower; + + return result; +} + +template <typename Simd4f> +inline BoundingBox<Simd4f> expandBounds(const BoundingBox<Simd4f>& bounds, const Simd4f* pIt, const Simd4f* pEnd) +{ + BoundingBox<Simd4f> result = bounds; + for(; pIt != pEnd; ++pIt) + { + result.mLower = min(result.mLower, *pIt); + result.mUpper = max(result.mUpper, *pIt); + } + return result; +} + +template <typename Simd4f> +inline BoundingBox<Simd4f> expandBounds(const BoundingBox<Simd4f>& a, const BoundingBox<Simd4f>& b) +{ + BoundingBox<Simd4f> result; + result.mLower = min(a.mLower, b.mLower); + result.mUpper = max(a.mUpper, b.mUpper); + return result; +} + +template <typename Simd4f> +inline BoundingBox<Simd4f> intersectBounds(const BoundingBox<Simd4f>& a, const BoundingBox<Simd4f>& b) +{ + BoundingBox<Simd4f> result; + result.mLower = max(a.mLower, b.mLower); + result.mUpper = min(a.mUpper, b.mUpper); + return result; +} + +template <typename Simd4f> +inline bool isEmptyBounds(const BoundingBox<Simd4f>& a) +{ + return anyGreater(a.mLower, a.mUpper) != 0; +} +} +} diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/ClothBase.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/ClothBase.h new file mode 100644 index 00000000..641fc70f --- /dev/null +++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/ClothBase.h @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved. 
#pragma once

#include "PsMathUtils.h"

namespace nvidia
{
namespace cloth
{

/* helper functions shared between SwCloth and CuCloth */

// Compute the particle bounding box and reset every shared cloth parameter to
// its default. Stiffness-like quantities are stored in log2 space, so "fully
// stiff" defaults to -FLT_MAX_EXP (safeExp2 maps that back to 0).
// NOTE(review): mIsAllowedHalfPrecisionSolver and mUserData are copied by
// copy() below but not initialized here — presumably set by the cloth
// constructor; confirm.
template <typename Cloth>
void initialize(Cloth& cloth, const PxVec4* pIt, const PxVec4* pEnd)
{
	// initialize particles bounding box
	PxVec4 lower(FLT_MAX), upper = -lower;
	for(; pIt != pEnd; ++pIt)
	{
		lower = lower.minimum(*pIt);
		upper = upper.maximum(*pIt);
	}
	PxVec4 center = (upper + lower) * 0.5f;
	PxVec4 extent = (upper - lower) * 0.5f;
	// only xyz of the PxVec4 center/extent are kept
	cloth.mParticleBoundsCenter = reinterpret_cast<const PxVec3&>(center);
	cloth.mParticleBoundsHalfExtent = reinterpret_cast<const PxVec3&>(extent);

	cloth.mGravity = PxVec3(0.0f);
	cloth.mLogDamping = PxVec3(0.0f);
	cloth.mLinearLogDrag = PxVec3(0.0f);
	cloth.mAngularLogDrag = PxVec3(0.0f);
	cloth.mLinearInertia = PxVec3(1.0f);
	cloth.mAngularInertia = PxVec3(1.0f);
	cloth.mCentrifugalInertia = PxVec3(1.0f);
	cloth.mSolverFrequency = 60.0f;
	cloth.mStiffnessFrequency = 10.0f;
	cloth.mTargetMotion = PxTransform(PxIdentity);
	cloth.mCurrentMotion = PxTransform(PxIdentity);
	cloth.mLinearVelocity = PxVec3(0.0f);
	cloth.mAngularVelocity = PxVec3(0.0f);
	cloth.mPrevIterDt = 0.0f;
	cloth.mIterDtAvg = MovingAverage(30);
	cloth.mTetherConstraintLogStiffness = float(-FLT_MAX_EXP);
	cloth.mTetherConstraintScale = 1.0f;
	cloth.mMotionConstraintScale = 1.0f;
	cloth.mMotionConstraintBias = 0.0f;
	cloth.mMotionConstraintLogStiffness = float(-FLT_MAX_EXP);
	cloth.mEnableContinuousCollision = false;
	cloth.mCollisionMassScale = 0.0f;
	cloth.mFriction = 0.0f;
	cloth.mSelfCollisionDistance = 0.0f;
	cloth.mSelfCollisionLogStiffness = float(-FLT_MAX_EXP);
	// uint32_t(-1) == "never": sleep testing disabled by default
	cloth.mSleepTestInterval = uint32_t(-1);
	cloth.mSleepAfterCount = uint32_t(-1);
	cloth.mSleepThreshold = 0.0f;
	cloth.mSleepPassCounter = 0;
	cloth.mSleepTestCounter = 0;
}

// Field-by-field copy of the shared cloth state; used when cloning a cloth
// (possibly across backends, e.g. Sw -> Cu), hence the two template parameters.
template <typename DstCloth, typename SrcCloth>
void copy(DstCloth& dstCloth, const SrcCloth& srcCloth)
{
	dstCloth.mParticleBoundsCenter = srcCloth.mParticleBoundsCenter;
	dstCloth.mParticleBoundsHalfExtent = srcCloth.mParticleBoundsHalfExtent;
	dstCloth.mGravity = srcCloth.mGravity;
	dstCloth.mLogDamping = srcCloth.mLogDamping;
	dstCloth.mLinearLogDrag = srcCloth.mLinearLogDrag;
	dstCloth.mAngularLogDrag = srcCloth.mAngularLogDrag;
	dstCloth.mLinearInertia = srcCloth.mLinearInertia;
	dstCloth.mAngularInertia = srcCloth.mAngularInertia;
	dstCloth.mCentrifugalInertia = srcCloth.mCentrifugalInertia;
	dstCloth.mSolverFrequency = srcCloth.mSolverFrequency;
	dstCloth.mStiffnessFrequency = srcCloth.mStiffnessFrequency;
	dstCloth.mTargetMotion = srcCloth.mTargetMotion;
	dstCloth.mCurrentMotion = srcCloth.mCurrentMotion;
	dstCloth.mLinearVelocity = srcCloth.mLinearVelocity;
	dstCloth.mAngularVelocity = srcCloth.mAngularVelocity;
	dstCloth.mPrevIterDt = srcCloth.mPrevIterDt;
	dstCloth.mIterDtAvg = srcCloth.mIterDtAvg;
	dstCloth.mTetherConstraintLogStiffness = srcCloth.mTetherConstraintLogStiffness;
	dstCloth.mTetherConstraintScale = srcCloth.mTetherConstraintScale;
	dstCloth.mMotionConstraintScale = srcCloth.mMotionConstraintScale;
	dstCloth.mMotionConstraintBias = srcCloth.mMotionConstraintBias;
	dstCloth.mMotionConstraintLogStiffness = srcCloth.mMotionConstraintLogStiffness;
	dstCloth.mEnableContinuousCollision = srcCloth.mEnableContinuousCollision;
	dstCloth.mCollisionMassScale = srcCloth.mCollisionMassScale;
	dstCloth.mFriction = srcCloth.mFriction;
	dstCloth.mSelfCollisionDistance = srcCloth.mSelfCollisionDistance;
	dstCloth.mSelfCollisionLogStiffness = srcCloth.mSelfCollisionLogStiffness;
	dstCloth.mSleepTestInterval = srcCloth.mSleepTestInterval;
	dstCloth.mSleepAfterCount = srcCloth.mSleepAfterCount;
	dstCloth.mSleepThreshold = srcCloth.mSleepThreshold;
	dstCloth.mSleepPassCounter = srcCloth.mSleepPassCounter;
	dstCloth.mSleepTestCounter = srcCloth.mSleepTestCounter;
	dstCloth.mIsAllowedHalfPrecisionSolver = srcCloth.mIsAllowedHalfPrecisionSolver;
	dstCloth.mUserData = srcCloth.mUserData;
}

} // namespace cloth
} // namespace nvidia
#pragma once

#include "Cloth.h"
#include "Fabric.h"
#include "Allocator.h"
#include "PsMathUtils.h"

namespace nvidia
{
namespace cloth
{

// SwCloth or CuCloth aggregate implementing the Cloth interface
// Member specializations are implemented in Sw/CuCloth.cpp
template <typename T>
class ClothImpl : public UserAllocated, public Cloth
{
	ClothImpl(const ClothImpl&); // copy-construction disallowed; use clone()

  public:
	ClothImpl& operator=(const ClothImpl&);

	typedef T ClothType;
	typedef typename ClothType::FactoryType FactoryType;
	typedef typename ClothType::FabricType FabricType;
	typedef typename ClothType::ContextLockType ContextLockType;

	ClothImpl(Factory&, Fabric&, Range<const PxVec4>);
	ClothImpl(Factory&, const ClothImpl&);

	virtual Cloth* clone(Factory& factory) const;

	virtual Fabric& getFabric() const;
	virtual Factory& getFactory() const;

	// particle access (lock/unlock bracket mapped ranges)
	virtual uint32_t getNumParticles() const;
	virtual void lockParticles() const;
	virtual void unlockParticles() const;
	virtual MappedRange<PxVec4> getCurrentParticles();
	virtual MappedRange<const PxVec4> getCurrentParticles() const;
	virtual MappedRange<PxVec4> getPreviousParticles();
	virtual MappedRange<const PxVec4> getPreviousParticles() const;
	virtual GpuParticles getGpuParticles();

	// frame (target transform interpolated toward over the next simulate)
	virtual void setTranslation(const PxVec3& trans);
	virtual void setRotation(const PxQuat& rot);

	virtual const PxVec3& getTranslation() const;
	virtual const PxQuat& getRotation() const;

	virtual void clearInertia();

	virtual void teleport(const PxVec3& delta);

	// solver parameters
	virtual float getPreviousIterationDt() const;
	virtual void setGravity(const PxVec3& gravity);
	virtual PxVec3 getGravity() const;
	virtual void setDamping(const PxVec3& damping);
	virtual PxVec3 getDamping() const;
	virtual void setLinearDrag(const PxVec3& drag);
	virtual PxVec3 getLinearDrag() const;
	virtual void setAngularDrag(const PxVec3& drag);
	virtual PxVec3 getAngularDrag() const;
	virtual void setLinearInertia(const PxVec3& inertia);
	virtual PxVec3 getLinearInertia() const;
	virtual void setAngularInertia(const PxVec3& inertia);
	virtual PxVec3 getAngularInertia() const;
	virtual void setCentrifugalInertia(const PxVec3& inertia);
	virtual PxVec3 getCentrifugalInertia() const;

	virtual void setSolverFrequency(float frequency);
	virtual float getSolverFrequency() const;

	virtual void setStiffnessFrequency(float frequency);
	virtual float getStiffnessFrequency() const;

	// NOTE(review): "Acceleation" is a typo for "Acceleration", but the name is
	// part of the public Cloth interface and cannot be changed here.
	virtual void setAcceleationFilterWidth(uint32_t);
	virtual uint32_t getAccelerationFilterWidth() const;

	virtual void setPhaseConfig(Range<const PhaseConfig> configs);

	// collision shapes (first/last select the sub-range being replaced)
	virtual void setSpheres(Range<const PxVec4>, uint32_t first, uint32_t last);
	virtual uint32_t getNumSpheres() const;

	virtual void setCapsules(Range<const uint32_t>, uint32_t first, uint32_t last);
	virtual uint32_t getNumCapsules() const;

	virtual void setPlanes(Range<const PxVec4>, uint32_t first, uint32_t last);
	virtual uint32_t getNumPlanes() const;

	virtual void setConvexes(Range<const uint32_t>, uint32_t first, uint32_t last);
	virtual uint32_t getNumConvexes() const;

	virtual void setTriangles(Range<const PxVec3>, uint32_t first, uint32_t last);
	virtual void setTriangles(Range<const PxVec3>, Range<const PxVec3>, uint32_t first);
	virtual uint32_t getNumTriangles() const;

	virtual bool isContinuousCollisionEnabled() const;
	virtual void enableContinuousCollision(bool);

	virtual float getCollisionMassScale() const;
	virtual void setCollisionMassScale(float);
	virtual void setFriction(float friction);
	virtual float getFriction() const;

	virtual void setVirtualParticles(Range<const uint32_t[4]>, Range<const PxVec3>);
	virtual uint32_t getNumVirtualParticles() const;
	virtual uint32_t getNumVirtualParticleWeights() const;

	// constraints
	virtual void setTetherConstraintScale(float scale);
	virtual float getTetherConstraintScale() const;
	virtual void setTetherConstraintStiffness(float stiffness);
	virtual float getTetherConstraintStiffness() const;

	virtual Range<PxVec4> getMotionConstraints();
	virtual void clearMotionConstraints();
	virtual uint32_t getNumMotionConstraints() const;
	virtual void setMotionConstraintScaleBias(float scale, float bias);
	virtual float getMotionConstraintScale() const;
	virtual float getMotionConstraintBias() const;
	virtual void setMotionConstraintStiffness(float stiffness);
	virtual float getMotionConstraintStiffness() const;

	virtual Range<PxVec4> getSeparationConstraints();
	virtual void clearSeparationConstraints();
	virtual uint32_t getNumSeparationConstraints() const;

	virtual void clearInterpolation();

	virtual Range<PxVec4> getParticleAccelerations();
	virtual void clearParticleAccelerations();
	virtual uint32_t getNumParticleAccelerations() const;

	virtual void setSelfCollisionDistance(float);
	virtual float getSelfCollisionDistance() const;
	virtual void setSelfCollisionStiffness(float);
	virtual float getSelfCollisionStiffness() const;

	virtual void setSelfCollisionIndices(Range<const uint32_t>);
	virtual uint32_t getNumSelfCollisionIndices() const;

	virtual void setRestPositions(Range<const PxVec4>);
	virtual uint32_t getNumRestPositions() const;

	virtual const PxVec3& getBoundingBoxCenter() const;
	virtual const PxVec3& getBoundingBoxScale() const;

	// sleeping
	virtual void setSleepThreshold(float);
	virtual float getSleepThreshold() const;
	virtual void setSleepTestInterval(uint32_t);
	virtual uint32_t getSleepTestInterval() const;
	virtual void setSleepAfterCount(uint32_t);
	virtual uint32_t getSleepAfterCount() const;
	virtual uint32_t getSleepPassCount() const;
	virtual bool isAsleep() const;
	virtual void putToSleep();
	virtual void wakeUp();

	virtual void setHalfPrecisionOption(bool isAllowed);
	virtual bool getHalfPrecisionOption() const;

#if APEX_UE4
	virtual void simulate(float dt);
#endif

	virtual void setUserData(void*);
	virtual void* getUserData() const;

	// helper function
	template <typename U>
	MappedRange<U> getMappedParticles(U* data) const;

	ClothType mCloth; // the backend-specific implementation being wrapped
};

class SwCloth;
typedef ClothImpl<SwCloth> SwClothImpl;

class CuCloth;
typedef ClothImpl<CuCloth> CuClothImpl;

class DxCloth;
typedef ClothImpl<DxCloth> DxClothImpl;

// Construct a cloth over 'particles' using the given fabric.
template <typename T>
ClothImpl<T>::ClothImpl(Factory& factory, Fabric& fabric, Range<const PxVec4> particles)
: mCloth(static_cast<FactoryType&>(factory), static_cast<FabricType&>(fabric), particles)
{
	// fabric and cloth need to be created by the same factory
	PX_ASSERT(&fabric.getFactory() == &factory);
}

// Clone constructor; 'factory' may differ from the source cloth's factory.
template <typename T>
ClothImpl<T>::ClothImpl(Factory& factory, const ClothImpl& impl)
: mCloth(static_cast<FactoryType&>(factory), impl.mCloth)
{
}

template <typename T>
inline Fabric& ClothImpl<T>::getFabric() const
{
	return mCloth.mFabric;
}

template <typename T>
inline Factory& ClothImpl<T>::getFactory() const
{
	return mCloth.mFactory;
}

// Set the target frame translation; no-op (no wakeUp) when unchanged.
template <typename T>
inline void ClothImpl<T>::setTranslation(const PxVec3& trans)
{
	PxVec3 t = reinterpret_cast<const PxVec3&>(trans); // no-op cast
	if(t == mCloth.mTargetMotion.p)
		return;

	mCloth.mTargetMotion.p = t;
	mCloth.wakeUp();
}

// Set the target frame rotation; no-op when unchanged (exact quat comparison).
template <typename T>
inline void ClothImpl<T>::setRotation(const PxQuat& q)
{
	if((q - mCloth.mTargetMotion.q).magnitudeSquared() == 0.0f)
		return;

	mCloth.mTargetMotion.q = q;
	mCloth.wakeUp();
}

template <typename T>
inline const PxVec3& ClothImpl<T>::getTranslation() const
{
	return mCloth.mTargetMotion.p;
}

template <typename T>
inline const PxQuat& ClothImpl<T>::getRotation() const
{
	return mCloth.mTargetMotion.q;
}

// Snap the current frame to the target and zero frame velocities, so the next
// simulate step generates no inertia from frame motion.
template <typename T>
inline void ClothImpl<T>::clearInertia()
{
	mCloth.mCurrentMotion = mCloth.mTargetMotion;
	mCloth.mLinearVelocity = PxVec3(0.0f);
	mCloth.mAngularVelocity = PxVec3(0.0f);

	mCloth.wakeUp();
}

// Fixed 4505:local function has been removed
void ClothImpl<T>::teleport(const PxVec3& delta) +{ + mCloth.mCurrentMotion.p += delta; + mCloth.mTargetMotion.p += delta; +} + +template <typename T> +inline float ClothImpl<T>::getPreviousIterationDt() const +{ + return mCloth.mPrevIterDt; +} + +template <typename T> +inline void ClothImpl<T>::setGravity(const PxVec3& gravity) +{ + PxVec3 value = gravity; + if(value == mCloth.mGravity) + return; + + mCloth.mGravity = value; + mCloth.wakeUp(); +} + +template <typename T> +inline PxVec3 ClothImpl<T>::getGravity() const +{ + return mCloth.mGravity; +} + +inline float safeLog2(float x) +{ + return x ? physx::shdfnd::log2(x) : -FLT_MAX_EXP; +} + +inline PxVec3 safeLog2(const PxVec3& v) +{ + return PxVec3(safeLog2(v.x), safeLog2(v.y), safeLog2(v.z)); +} + +inline float safeExp2(float x) +{ + if(x <= -FLT_MAX_EXP) + return 0.0f; + else + return physx::shdfnd::exp2(x); +} + +inline PxVec3 safeExp2(const PxVec3& v) +{ + return PxVec3(safeExp2(v.x), safeExp2(v.y), safeExp2(v.z)); +} + +template <typename T> +inline void ClothImpl<T>::setDamping(const PxVec3& damping) +{ + PxVec3 value = safeLog2(PxVec3(1.f) - damping); + if(value == mCloth.mLogDamping) + return; + + mCloth.mLogDamping = value; + mCloth.wakeUp(); +} + +template <typename T> +inline PxVec3 ClothImpl<T>::getDamping() const +{ + return PxVec3(1.f) - safeExp2(mCloth.mLogDamping); +} + +template <typename T> +inline void ClothImpl<T>::setLinearDrag(const PxVec3& drag) +{ + PxVec3 value = safeLog2(PxVec3(1.f) - drag); + if(value == mCloth.mLinearLogDrag) + return; + + mCloth.mLinearLogDrag = value; + mCloth.wakeUp(); +} + +template <typename T> +inline PxVec3 ClothImpl<T>::getLinearDrag() const +{ + return PxVec3(1.f) - safeExp2(mCloth.mLinearLogDrag); +} + +template <typename T> +inline void ClothImpl<T>::setAngularDrag(const PxVec3& drag) +{ + PxVec3 value = safeLog2(PxVec3(1.f) - drag); + if(value == mCloth.mAngularLogDrag) + return; + + mCloth.mAngularLogDrag = value; + mCloth.wakeUp(); +} + +template 
<typename T> +inline PxVec3 ClothImpl<T>::getAngularDrag() const +{ + return PxVec3(1.f) - safeExp2(mCloth.mAngularLogDrag); +} + +template <typename T> +inline void ClothImpl<T>::setLinearInertia(const PxVec3& inertia) +{ + PxVec3 value = inertia; + if(value == mCloth.mLinearInertia) + return; + + mCloth.mLinearInertia = value; + mCloth.wakeUp(); +} + +template <typename T> +inline PxVec3 ClothImpl<T>::getLinearInertia() const +{ + return mCloth.mLinearInertia; +} + +template <typename T> +inline void ClothImpl<T>::setAngularInertia(const PxVec3& inertia) +{ + PxVec3 value = inertia; + if(value == mCloth.mAngularInertia) + return; + + mCloth.mAngularInertia = value; + mCloth.wakeUp(); +} + +template <typename T> +inline PxVec3 ClothImpl<T>::getAngularInertia() const +{ + return mCloth.mAngularInertia; +} + +template <typename T> +inline void ClothImpl<T>::setCentrifugalInertia(const PxVec3& inertia) +{ + PxVec3 value = inertia; + if(value == mCloth.mCentrifugalInertia) + return; + + mCloth.mCentrifugalInertia = value; + mCloth.wakeUp(); +} + +template <typename T> +inline PxVec3 ClothImpl<T>::getCentrifugalInertia() const +{ + return mCloth.mCentrifugalInertia; +} + +template <typename T> +inline void ClothImpl<T>::setSolverFrequency(float frequency) +{ + if(frequency == mCloth.mSolverFrequency) + return; + + mCloth.mSolverFrequency = frequency; + mCloth.mClothCostDirty = true; + mCloth.mIterDtAvg.reset(); + mCloth.wakeUp(); +} + +template <typename T> +inline float ClothImpl<T>::getSolverFrequency() const +{ + return mCloth.mSolverFrequency; +} + +template <typename T> +inline void ClothImpl<T>::setStiffnessFrequency(float frequency) +{ + if(frequency == mCloth.mStiffnessFrequency) + return; + + mCloth.mStiffnessFrequency = frequency; + mCloth.wakeUp(); +} + +template <typename T> +inline float ClothImpl<T>::getStiffnessFrequency() const +{ + return mCloth.mStiffnessFrequency; +} + +template <typename T> +inline void 
// Resize the dt moving-average window. (Name typo preserved: public interface.)
template <typename T>
inline void ClothImpl<T>::setAcceleationFilterWidth(uint32_t n)
{
	mCloth.mIterDtAvg.resize(n);
}

template <typename T>
inline uint32_t ClothImpl<T>::getAccelerationFilterWidth() const
{
	return mCloth.mIterDtAvg.size();
}

// move a subarray
// memmove-style overlap-safe element move of [first, last) to 'result':
// copies backward when the destination is after the source.
template <typename Iter>
void move(Iter it, uint32_t first, uint32_t last, uint32_t result)
{
	if(result > first)
	{
		result += last - first;
		while(first < last)
			it[--result] = it[--last];
	}
	else
	{
		while(first < last)
			it[result++] = it[first++];
	}
}

// update capsule index
// Shifts 'index' by delta iff index >= first (note the side effect inside the
// short-circuit: index is modified only when the first test passes). Returns
// true when the shifted index lands below 'first', i.e. the referenced sphere
// was removed and the capsule must be dropped.
inline bool updateIndex(uint32_t& index, uint32_t first, int32_t delta)
{
	return index >= first && int32_t(index += delta) < int32_t(first);
}

// Replace the sphere sub-range [first, last) with 'spheres'. Keeps separate
// start/target arrays for interpolation, and fixes up (or drops) capsules whose
// sphere indices are invalidated by insertion/removal.
template <typename T>
inline void ClothImpl<T>::setSpheres(Range<const PxVec4> spheres, uint32_t first, uint32_t last)
{
	uint32_t oldSize = uint32_t(mCloth.mStartCollisionSpheres.size());
	uint32_t newSize = uint32_t(spheres.size()) + oldSize - last + first;

	PX_ASSERT(newSize <= 32); // sphere count limit (fits collision mask width)
	PX_ASSERT(first <= oldSize);
	PX_ASSERT(last <= oldSize);

#if PX_DEBUG
	// sphere radius (w component) must be non-negative
	for(const PxVec4* it = spheres.begin(); it < spheres.end(); ++it)
		PX_ASSERT(it->w >= 0.0f);
#endif

	if(!oldSize && !newSize)
		return;

	if(!oldSize)
	{
		// no previous spheres: plain assignment, no interpolation targets to fix
		ContextLockType contextLock(mCloth.mFactory);
		mCloth.mStartCollisionSpheres.assign(spheres.begin(), spheres.end());
		mCloth.notifyChanged();
	}
	else
	{
		// reserve under the context lock before taking mapped views
		if(PxMax(oldSize, newSize) >
		   PxMin(mCloth.mStartCollisionSpheres.capacity(), mCloth.mTargetCollisionSpheres.capacity()))
		{
			ContextLockType contextLock(mCloth.mFactory);
			mCloth.mStartCollisionSpheres.reserve(newSize);
			mCloth.mTargetCollisionSpheres.reserve(PxMax(oldSize, newSize));
		}

		typename T::MappedVec4fVectorType start = mCloth.mStartCollisionSpheres;
		typename T::MappedVec4fVectorType target = mCloth.mTargetCollisionSpheres;

		// fill target from start
		for(uint32_t i = target.size(); i < oldSize; ++i)
			target.pushBack(start[i]);

		// resize to larger of oldSize and newSize
		start.resize(PxMax(oldSize, newSize), PxVec4(0.0f));
		target.resize(PxMax(oldSize, newSize), PxVec4(0.0f));

		if(int32_t delta = int32_t(newSize - oldSize))
		{
			// move past-range elements to new place
			// (unsigned wrap-around makes 'last + delta' correct for delta < 0)
			move(start.begin(), last, oldSize, last + delta);
			move(target.begin(), last, oldSize, last + delta);

			// fill new elements from spheres (loop is empty when delta < 0)
			for(uint32_t i = last; i < last + delta; ++i)
				start[i] = spheres[i - first];

			// adjust capsule indices
			typename T::MappedIndexVectorType indices = mCloth.mCapsuleIndices;
			Vector<IndexPair>::Type::Iterator cIt, cEnd = indices.end();
			for(cIt = indices.begin(); cIt != cEnd;)
			{
				bool removed = false;
				removed |= updateIndex(cIt->first, last + PxMin(0, delta), int32_t(delta));
				removed |= updateIndex(cIt->second, last + PxMin(0, delta), int32_t(delta));
				if(!removed)
					++cIt;
				else
				{
					// capsule references a removed sphere: drop it
					indices.replaceWithLast(cIt);
					cEnd = indices.end();
				}
			}

			start.resize(newSize);
			target.resize(newSize);

			mCloth.notifyChanged();
		}

		// fill target elements with spheres
		for(uint32_t i = 0; i < spheres.size(); ++i)
			target[first + i] = spheres[i];
	}

	mCloth.wakeUp();
}

template <typename T>
inline uint32_t ClothImpl<T>::getNumSpheres() const
{
	return uint32_t(mCloth.mStartCollisionSpheres.size());
}

// Fixed 4505:local function has been removed
// Replace the capsule sub-range [first, last) with 'capsules' (pairs of sphere
// indices, so capsules.size() is twice the capsule count).
template <typename T>
inline void ClothImpl<T>::setCapsules(Range<const uint32_t> capsules, uint32_t first, uint32_t last)
{
	uint32_t oldSize = mCloth.mCapsuleIndices.size();
	uint32_t newSize = uint32_t(capsules.size() / 2) + oldSize - last + first;

	PX_ASSERT(newSize <= 32);
	PX_ASSERT(first <= oldSize);
	PX_ASSERT(last <= oldSize);

	// view the flat index array as (first, second) pairs
	const IndexPair* srcIndices = reinterpret_cast<const IndexPair*>(capsules.begin());

	if(mCloth.mCapsuleIndices.capacity() < newSize)
	{
		ContextLockType contextLock(mCloth.mFactory);
		mCloth.mCapsuleIndices.reserve(newSize);
	}

	// resize to larger of oldSize and newSize
	mCloth.mCapsuleIndices.resize(PxMax(oldSize, newSize));

	typename T::MappedIndexVectorType dstIndices = mCloth.mCapsuleIndices;

	// unsigned delta: wrap-around keeps 'last + delta' correct when shrinking
	if(uint32_t delta = newSize - oldSize)
	{
		// move past-range elements to new place
		move(dstIndices.begin(), last, oldSize, last + delta);

		// fill new elements from capsules
		for(uint32_t i = last; i < last + delta; ++i)
			dstIndices[i] = srcIndices[i - first];

		dstIndices.resize(newSize);
		mCloth.notifyChanged();
	}

	// fill existing elements from capsules
	for(uint32_t i = first; i < last; ++i)
		dstIndices[i] = srcIndices[i - first];

	mCloth.wakeUp();
}

template <typename T>
inline uint32_t ClothImpl<T>::getNumCapsules() const
{
	return uint32_t(mCloth.mCapsuleIndices.size());
}

// Replace the plane sub-range [first, last) with 'planes', shifting the bit
// positions in the convex masks to track moved planes.
template <typename T>
inline void ClothImpl<T>::setPlanes(Range<const PxVec4> planes, uint32_t first, uint32_t last)
{
	uint32_t oldSize = uint32_t(mCloth.mStartCollisionPlanes.size());
	uint32_t newSize = uint32_t(planes.size()) + oldSize - last + first;

	PX_ASSERT(newSize <= 32); // plane index must fit a 32-bit convex mask
	PX_ASSERT(first <= oldSize);
	PX_ASSERT(last <= oldSize);

	if(!oldSize && !newSize)
		return;

	if(!oldSize)
	{
		ContextLockType contextLock(mCloth.mFactory);
		mCloth.mStartCollisionPlanes.assign(planes.begin(), planes.end());
		mCloth.notifyChanged();
	}
	else
	{
		if(PxMax(oldSize, newSize) >
		   PxMin(mCloth.mStartCollisionPlanes.capacity(), mCloth.mTargetCollisionPlanes.capacity()))
		{
			ContextLockType contextLock(mCloth.mFactory);
			mCloth.mStartCollisionPlanes.reserve(newSize);
			mCloth.mTargetCollisionPlanes.reserve(PxMax(oldSize, newSize));
		}

		// fill target from start
		for(uint32_t i = mCloth.mTargetCollisionPlanes.size(); i < oldSize; ++i)
			mCloth.mTargetCollisionPlanes.pushBack(mCloth.mStartCollisionPlanes[i]);

		// resize to larger of oldSize and newSize
		mCloth.mStartCollisionPlanes.resize(PxMax(oldSize, newSize), PxZero);
		mCloth.mTargetCollisionPlanes.resize(PxMax(oldSize, newSize), PxZero);

		if(int32_t delta = int32_t(newSize - oldSize))
		{
			// move past-range elements to new place
			move(mCloth.mStartCollisionPlanes.begin(), last, oldSize, last + delta);
			move(mCloth.mTargetCollisionPlanes.begin(), last, oldSize, last + delta);

			// fill new elements from planes
			for(uint32_t i = last; i < last + delta; ++i)
				mCloth.mStartCollisionPlanes[i] = planes[i - first];

			// adjust convex indices
			// mask covers the bit positions of planes below the edit point
			uint32_t mask = (uint32_t(1) << (last + PxMin(delta, 0))) - 1;
			Vector<uint32_t>::Type::Iterator cIt, cEnd = mCloth.mConvexMasks.end();
			for(cIt = mCloth.mConvexMasks.begin(); cIt != cEnd;)
			{
				// keep low bits; shift high bits by delta to follow moved planes
				uint32_t convex = (*cIt & mask);
				if(delta < 0)
					convex |= *cIt >> -delta & ~mask;
				else
					convex |= (*cIt & ~mask) << delta;
				if(convex)
					*cIt++ = convex;
				else
				{
					// convex lost all of its planes: drop it
					mCloth.mConvexMasks.replaceWithLast(cIt);
					cEnd = mCloth.mConvexMasks.end();
				}
			}

			mCloth.mStartCollisionPlanes.resize(newSize);
			mCloth.mTargetCollisionPlanes.resize(newSize);

			mCloth.notifyChanged();
		}

		// fill target elements with planes
		for(uint32_t i = 0; i < planes.size(); ++i)
			mCloth.mTargetCollisionPlanes[first + i] = planes[i];
	}

	mCloth.wakeUp();
}

template <typename T>
inline uint32_t ClothImpl<T>::getNumPlanes() const
{
	return uint32_t(mCloth.mStartCollisionPlanes.size());
}

// Replace the convex sub-range [first, last); each convex is a bitmask over
// plane indices.
// NOTE(review): unlike setCapsules, there is no trailing "fill existing
// elements" loop, so replacing an equal-size range rewrites nothing — confirm
// whether that is intended.
template <typename T>
inline void ClothImpl<T>::setConvexes(Range<const uint32_t> convexes, uint32_t first, uint32_t last)
{
	uint32_t oldSize = mCloth.mConvexMasks.size();
	uint32_t newSize = uint32_t(convexes.size()) + oldSize - last + first;

	PX_ASSERT(newSize <= 32);
	PX_ASSERT(first <= oldSize);
	PX_ASSERT(last <= oldSize);

	if(mCloth.mConvexMasks.capacity() < newSize)
	{
		ContextLockType contextLock(mCloth.mFactory);
		mCloth.mConvexMasks.reserve(newSize);
	}

	// resize to larger of oldSize and newSize
	mCloth.mConvexMasks.resize(PxMax(oldSize, newSize));

	if(uint32_t delta = newSize - oldSize)
	{
		// move past-range elements to new place
		move(mCloth.mConvexMasks.begin(), last, oldSize, last + delta);

		// fill new elements from capsules
		for(uint32_t i = last; i < last + delta; ++i)
			mCloth.mConvexMasks[i] = convexes[i - first];

		mCloth.mConvexMasks.resize(newSize);
		mCloth.notifyChanged();
	}

	mCloth.wakeUp();
}

template <typename T>
inline uint32_t ClothImpl<T>::getNumConvexes() const
{
	return uint32_t(mCloth.mConvexMasks.size());
}

// Replace the triangle sub-range [first, last); first/last are triangle
// counts, internally converted to vertex counts (x3).
template <typename T>
inline void ClothImpl<T>::setTriangles(Range<const PxVec3> triangles, uint32_t first, uint32_t last)
{
	// convert from triangle to vertex count
	first *= 3;
	last *= 3;

	triangles = mCloth.clampTriangleCount(triangles, last - first);
	PX_ASSERT(0 == triangles.size() % 3);

	uint32_t oldSize = uint32_t(mCloth.mStartCollisionTriangles.size());
	uint32_t newSize = uint32_t(triangles.size()) + oldSize - last + first;

	PX_ASSERT(first <= oldSize);
	PX_ASSERT(last <= oldSize);

	if(!oldSize && !newSize)
		return;

	if(!oldSize)
	{
		ContextLockType contextLock(mCloth.mFactory);
		mCloth.mStartCollisionTriangles.assign(triangles.begin(), triangles.end());
		mCloth.notifyChanged();
	}
	else
	{
		if(PxMax(oldSize, newSize) >
		   PxMin(mCloth.mStartCollisionTriangles.capacity(), mCloth.mTargetCollisionTriangles.capacity()))
		{
			ContextLockType contextLock(mCloth.mFactory);
			mCloth.mStartCollisionTriangles.reserve(newSize);
			mCloth.mTargetCollisionTriangles.reserve(PxMax(oldSize, newSize));
		}

		// fill target from start
		for(uint32_t i = mCloth.mTargetCollisionTriangles.size(); i < oldSize; ++i)
			mCloth.mTargetCollisionTriangles.pushBack(mCloth.mStartCollisionTriangles[i]);

		// resize to larger of oldSize and newSize
		mCloth.mStartCollisionTriangles.resize(PxMax(oldSize, newSize));
		mCloth.mTargetCollisionTriangles.resize(PxMax(oldSize, newSize));

		if(uint32_t delta = newSize - oldSize)
		{
			// move past-range elements to new place
			move(mCloth.mStartCollisionTriangles.begin(), last, oldSize, last + delta);
			move(mCloth.mTargetCollisionTriangles.begin(), last, oldSize, last + delta);

			// fill new elements from triangles
			for(uint32_t i = last; i < last + delta; ++i)
				mCloth.mStartCollisionTriangles[i] = triangles[i - first];

			mCloth.mStartCollisionTriangles.resize(newSize);
			mCloth.mTargetCollisionTriangles.resize(newSize);

			mCloth.notifyChanged();
		}

		// fill target elements with triangles
		for(uint32_t i = 0; i < triangles.size(); ++i)
			mCloth.mTargetCollisionTriangles[first + i] = triangles[i];
	}

	mCloth.wakeUp();
}

// Replace the tail of the triangle arrays starting at triangle 'first' with
// explicit start and target sets (both must have the same size).
template <typename T>
inline void ClothImpl<T>::setTriangles(Range<const PxVec3> startTriangles, Range<const PxVec3> targetTriangles,
                                       uint32_t first)
{
	PX_ASSERT(startTriangles.size() == targetTriangles.size());

	// convert from triangle to vertex count
	first *= 3;

	uint32_t last = uint32_t(mCloth.mStartCollisionTriangles.size());

	startTriangles = mCloth.clampTriangleCount(startTriangles, last - first);
	targetTriangles = mCloth.clampTriangleCount(targetTriangles, last - first);

	uint32_t oldSize = uint32_t(mCloth.mStartCollisionTriangles.size());
	uint32_t newSize = uint32_t(startTriangles.size()) + oldSize - last + first;

	PX_ASSERT(first <= oldSize);
	PX_ASSERT(last == oldSize); // this path only supports replacing the tail

	if(!oldSize && !newSize)
		return;

	if(newSize > PxMin(mCloth.mStartCollisionTriangles.capacity(), mCloth.mTargetCollisionTriangles.capacity()))
	{
		ContextLockType contextLock(mCloth.mFactory);
		mCloth.mStartCollisionTriangles.reserve(newSize);
		mCloth.mTargetCollisionTriangles.reserve(newSize);
	}

	// truncate to the retained prefix, then append the new tail
	uint32_t retainSize = oldSize - last + first;
	mCloth.mStartCollisionTriangles.resize(retainSize);
	mCloth.mTargetCollisionTriangles.resize(retainSize);

	for(uint32_t i = 0, n = startTriangles.size(); i < n; ++i)
	{
		mCloth.mStartCollisionTriangles.pushBack(startTriangles[i]);
		mCloth.mTargetCollisionTriangles.pushBack(targetTriangles[i]);
	}

	if(newSize - oldSize)
		mCloth.notifyChanged();

	mCloth.wakeUp();
}

// Triangle count (stored as flat vertices, three per triangle).
template <typename T>
inline uint32_t ClothImpl<T>::getNumTriangles() const
{
	return uint32_t(mCloth.mStartCollisionTriangles.size()) / 3;
}

template <typename T>
inline bool ClothImpl<T>::isContinuousCollisionEnabled() const
{
	return mCloth.mEnableContinuousCollision;
}

template <typename T>
inline void ClothImpl<T>::enableContinuousCollision(bool enable)
{
	if(enable == mCloth.mEnableContinuousCollision)
		return;

	mCloth.mEnableContinuousCollision = enable;
	mCloth.notifyChanged();
	mCloth.wakeUp();
}

template <typename T>
inline float ClothImpl<T>::getCollisionMassScale() const
{
	return mCloth.mCollisionMassScale;
}

template <typename T>
inline void ClothImpl<T>::setCollisionMassScale(float scale)
{
	if(scale == mCloth.mCollisionMassScale)
		return;

	mCloth.mCollisionMassScale = scale;
	mCloth.notifyChanged();
	mCloth.wakeUp();
}

// Friction is set unconditionally (no equality early-out, no notifyChanged).
template <typename T>
inline void ClothImpl<T>::setFriction(float friction)
{
	mCloth.mFriction = friction;
	mCloth.wakeUp();
}

template <typename T>
inline float ClothImpl<T>::getFriction() const
{
	return mCloth.mFriction;
}

template <typename T>
inline uint32_t ClothImpl<T>::getNumVirtualParticleWeights() const
{
	return uint32_t(mCloth.mVirtualParticleWeights.size());
}

template <typename T>
inline void ClothImpl<T>::setTetherConstraintScale(float scale)
{
	if(scale == mCloth.mTetherConstraintScale)
		return;

	mCloth.mTetherConstraintScale = scale;
	mCloth.notifyChanged();
	mCloth.wakeUp();
}

template <typename T>
inline float ClothImpl<T>::getTetherConstraintScale() const
{
	return mCloth.mTetherConstraintScale;
}

// Stiffness is stored as log2(1 - stiffness); see safeLog2/safeExp2.
template <typename T>
inline void ClothImpl<T>::setTetherConstraintStiffness(float stiffness)
{
	float value = safeLog2(1 - stiffness);
	if(value == mCloth.mTetherConstraintLogStiffness)
		return;

	mCloth.mTetherConstraintLogStiffness = value;
	mCloth.notifyChanged();
	mCloth.wakeUp();
}

template <typename T>
inline float ClothImpl<T>::getTetherConstraintStiffness() const
{
	return 1 - safeExp2(mCloth.mTetherConstraintLogStiffness);
}

// Writable per-particle motion constraint target buffer.
template <typename T>
inline Range<PxVec4> ClothImpl<T>::getMotionConstraints()
{
	mCloth.wakeUp();
	return mCloth.push(mCloth.mMotionConstraints);
}

template <typename T>
inline void ClothImpl<T>::clearMotionConstraints()
{
	mCloth.clear(mCloth.mMotionConstraints);
	mCloth.wakeUp();
}

template <typename T>
inline uint32_t ClothImpl<T>::getNumMotionConstraints() const
{
	return uint32_t(mCloth.mMotionConstraints.mStart.size());
}

template <typename T>
inline void ClothImpl<T>::setMotionConstraintScaleBias(float scale, float bias)
{
	if(scale == mCloth.mMotionConstraintScale && bias == mCloth.mMotionConstraintBias)
		return;

	mCloth.mMotionConstraintScale = scale;
	mCloth.mMotionConstraintBias = bias;
	mCloth.notifyChanged();
	mCloth.wakeUp();
}

template <typename T>
inline float ClothImpl<T>::getMotionConstraintScale() const
{
	return mCloth.mMotionConstraintScale;
}

template <typename T>
inline float ClothImpl<T>::getMotionConstraintBias() const
{
	return mCloth.mMotionConstraintBias;
}

// Stiffness stored as log2(1 - stiffness), like tether stiffness above.
template <typename T>
inline void ClothImpl<T>::setMotionConstraintStiffness(float stiffness)
{
	float value = safeLog2(1 - stiffness);
	if(value == mCloth.mMotionConstraintLogStiffness)
		return;

	mCloth.mMotionConstraintLogStiffness = value;
	mCloth.notifyChanged();
	mCloth.wakeUp();
}

template <typename T>
inline float ClothImpl<T>::getMotionConstraintStiffness() const
{
	return 1 - safeExp2(mCloth.mMotionConstraintLogStiffness);
}

// Writable per-particle separation constraint target buffer.
template <typename T>
inline Range<PxVec4> ClothImpl<T>::getSeparationConstraints()
{
	mCloth.wakeUp();
	return mCloth.push(mCloth.mSeparationConstraints);
}
<typename T> +inline void ClothImpl<T>::clearSeparationConstraints() +{ + mCloth.clear(mCloth.mSeparationConstraints); + mCloth.wakeUp(); +} + +template <typename T> +inline void ClothImpl<T>::clearInterpolation() +{ + if(!mCloth.mTargetCollisionSpheres.empty()) + { + nvidia::swap(mCloth.mStartCollisionSpheres, mCloth.mTargetCollisionSpheres); + mCloth.mTargetCollisionSpheres.resize(0); + } + mCloth.mMotionConstraints.pop(); + mCloth.mSeparationConstraints.pop(); + mCloth.wakeUp(); +} + +template <typename T> +inline uint32_t ClothImpl<T>::getNumSeparationConstraints() const +{ + return uint32_t(mCloth.mSeparationConstraints.mStart.size()); +} + +template <typename T> +inline uint32_t ClothImpl<T>::getNumParticleAccelerations() const +{ + return uint32_t(mCloth.mParticleAccelerations.size()); +} + +template <typename T> +inline uint32_t ClothImpl<T>::getNumSelfCollisionIndices() const +{ + return uint32_t(mCloth.mSelfCollisionIndices.size()); +} + +// Fixed 4505:local function has been removed +template <typename T> +inline void ClothImpl<T>::setRestPositions(Range<const PxVec4> restPositions) +{ + PX_ASSERT(restPositions.empty() || restPositions.size() == getNumParticles()); + ContextLockType contextLock(mCloth.mFactory); + mCloth.mRestPositions.assign(restPositions.begin(), restPositions.end()); + mCloth.wakeUp(); +} + +template <typename T> +inline uint32_t ClothImpl<T>::getNumRestPositions() const +{ + return uint32_t(mCloth.mRestPositions.size()); +} + +template <typename T> +inline void ClothImpl<T>::setSelfCollisionDistance(float distance) +{ + if(distance == mCloth.mSelfCollisionDistance) + return; + + mCloth.mSelfCollisionDistance = distance; + mCloth.notifyChanged(); + mCloth.wakeUp(); +} + +template <typename T> +inline float ClothImpl<T>::getSelfCollisionDistance() const +{ + return mCloth.mSelfCollisionDistance; +} + +template <typename T> +inline void ClothImpl<T>::setSelfCollisionStiffness(float stiffness) +{ + float value = safeLog2(1 - stiffness); 
+ if(value == mCloth.mSelfCollisionLogStiffness) + return; + + mCloth.mSelfCollisionLogStiffness = value; + mCloth.notifyChanged(); + mCloth.wakeUp(); +} + +template <typename T> +inline float ClothImpl<T>::getSelfCollisionStiffness() const +{ + return 1 - safeExp2(mCloth.mSelfCollisionLogStiffness); +} + +template <typename T> +inline const PxVec3& ClothImpl<T>::getBoundingBoxCenter() const +{ + return mCloth.mParticleBoundsCenter; +} + +template <typename T> +inline const PxVec3& ClothImpl<T>::getBoundingBoxScale() const +{ + return mCloth.mParticleBoundsHalfExtent; +} + +template <typename T> +inline void ClothImpl<T>::setSleepThreshold(float threshold) +{ + if(threshold == mCloth.mSleepThreshold) + return; + + mCloth.mSleepThreshold = threshold; + mCloth.notifyChanged(); + mCloth.wakeUp(); +} + +template <typename T> +inline float ClothImpl<T>::getSleepThreshold() const +{ + return mCloth.mSleepThreshold; +} + +template <typename T> +inline void ClothImpl<T>::setSleepTestInterval(uint32_t interval) +{ + if(interval == mCloth.mSleepTestInterval) + return; + + mCloth.mSleepTestInterval = interval; + mCloth.notifyChanged(); + mCloth.wakeUp(); +} + +template <typename T> +inline uint32_t ClothImpl<T>::getSleepTestInterval() const +{ + return mCloth.mSleepTestInterval; +} + +template <typename T> +inline void ClothImpl<T>::setSleepAfterCount(uint32_t afterCount) +{ + if(afterCount == mCloth.mSleepAfterCount) + return; + + mCloth.mSleepAfterCount = afterCount; + mCloth.notifyChanged(); + mCloth.wakeUp(); +} + +template <typename T> +inline uint32_t ClothImpl<T>::getSleepAfterCount() const +{ + return mCloth.mSleepAfterCount; +} + +template <typename T> +inline uint32_t ClothImpl<T>::getSleepPassCount() const +{ + return mCloth.mSleepPassCounter; +} + +template <typename T> +inline bool ClothImpl<T>::isAsleep() const +{ + return mCloth.isSleeping(); +} + +template <typename T> +inline void ClothImpl<T>::putToSleep() +{ + mCloth.mSleepPassCounter = 
mCloth.mSleepAfterCount; +} + +template <typename T> +inline void ClothImpl<T>::wakeUp() +{ + mCloth.wakeUp(); +} + + +template <typename T> +inline void ClothImpl<T>::setHalfPrecisionOption(bool isAllowed) +{ + mCloth.mIsAllowedHalfPrecisionSolver = isAllowed; +} + +template <typename T> +inline bool ClothImpl<T>::getHalfPrecisionOption() const +{ + return mCloth.mIsAllowedHalfPrecisionSolver; +} + +template <typename T> +inline void ClothImpl<T>::setUserData(void* data) +{ + mCloth.mUserData = data; +} + +template <typename T> +inline void* ClothImpl<T>::getUserData() const +{ + return mCloth.mUserData; +} + +template <typename T> +template <typename U> +inline MappedRange<U> ClothImpl<T>::getMappedParticles(U* data) const +{ + return MappedRange<U>(data, data + getNumParticles(), *this, &Cloth::lockParticles, &Cloth::unlockParticles); +} + +} // namespace cloth + +} // namespace nvidia diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/Factory.cpp b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/Factory.cpp new file mode 100644 index 00000000..6e49c85f --- /dev/null +++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/Factory.cpp @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. + */ + +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 
+ +#include "SwFactory.h" + +#if ENABLE_CUFACTORY +#include "CuFactory.h" +#endif + +#if ENABLE_DXFACTORY +#include "windows/DxFactory.h" +//#include "PxGraphicsContextManager.h" +#pragma warning(disable : 4668 4917 4365 4061 4005) +#if PX_XBOXONE +#include <d3d11_x.h> +#else +#include <d3d11.h> +#endif +#endif + +namespace nvidia +{ +namespace cloth +{ +uint32_t getNextFabricId() +{ + static uint32_t sNextFabricId = 0; + return sNextFabricId++; +} +} +} + +using namespace nvidia; + +cloth::Factory* cloth::Factory::createFactory(Platform platform, void* contextManager) +{ + PX_UNUSED(contextManager); + + if(platform == Factory::CPU) + return new SwFactory; + +#if ENABLE_CUFACTORY + if(platform == Factory::CUDA) + return new CuFactory((PxCudaContextManager*)contextManager); +#endif + +#if ENABLE_DXFACTORY + if(platform == Factory::DirectCompute) + { + //physx::PxGraphicsContextManager* graphicsContextManager = (physx::PxGraphicsContextManager*)contextManager; + //if(graphicsContextManager->getDevice()->GetFeatureLevel() >= D3D_FEATURE_LEVEL_11_0) + // return new DxFactory(graphicsContextManager); + } +#endif + + return 0; +} diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/IndexPair.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/IndexPair.h new file mode 100644 index 00000000..89dd9090 --- /dev/null +++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/IndexPair.h @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. + */ + +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. 
All rights reserved. + +#pragma once + +#include "Types.h" + +namespace nvidia +{ +namespace cloth +{ + +struct IndexPair +{ + uint32_t first; + uint32_t second; +}; + +} // namespace cloth +} // namespace nvidia diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/IterationState.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/IterationState.h new file mode 100644 index 00000000..527cf163 --- /dev/null +++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/IterationState.h @@ -0,0 +1,375 @@ +/* + * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. + */ + +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#pragma once + +#include "Types.h" +#include "Array.h" +#include "PxTransform.h" +#include "PxMat44.h" +#include "PsMathUtils.h" +#include "Simd4f.h" +#include "Simd4i.h" + +namespace nvidia +{ + +/* function object to perform solver iterations on one cloth */ + +// todo: performance optimization: cache this object and test if velocity/iterDt has changed +// c'tor takes about 5% of the iteration time of a 20x20 cloth + +namespace cloth +{ + +/* helper functions */ + +inline PxVec3 log(const PxQuat& q) +{ + float theta = q.getImaginaryPart().magnitude(); + float scale = theta > PX_EPS_REAL ? PxAsin(theta) / theta : 1.0f; + scale = intrinsics::fsel(q.w, scale, -scale); + return PxVec3(q.x * scale, q.y * scale, q.z * scale); +} + +inline PxQuat exp(const PxVec3& v) +{ + float theta = v.magnitude(); + float scale = theta > PX_EPS_REAL ? 
PxSin(theta) / theta : 1.0f; + return PxQuat(v.x * scale, v.y * scale, v.z * scale, cos(theta)); +} + +template <typename Simd4f, uint32_t N> +inline void assign(Simd4f (&columns)[N], const PxMat44& matrix) +{ + for(uint32_t i = 0; i < N; ++i) + columns[i] = load(array(matrix[i])); +} + +template <typename Simd4f> +inline Simd4f transform(const Simd4f (&columns)[3], const Simd4f& vec) +{ + return splat<0>(vec) * columns[0] + splat<1>(vec) * columns[1] + splat<2>(vec) * columns[2]; +} + +template <typename Simd4f> +inline Simd4f transform(const Simd4f (&columns)[3], const Simd4f& translate, const Simd4f& vec) +{ + return translate + splat<0>(vec) * columns[0] + splat<1>(vec) * columns[1] + splat<2>(vec) * columns[2]; +} + +template <typename> +struct IterationState; // forward declaration + +struct IterationStateFactory +{ + template <typename MyCloth> + IterationStateFactory(MyCloth& cloth, float frameDt); + + template <typename Simd4f, typename MyCloth> + IterationState<Simd4f> create(MyCloth const& cloth) const; + + template <typename Simd4f> + static Simd4f lengthSqr(Simd4f const& v) + { + return dot3(v, v); + } + + template <typename Simd4f> + static PxVec3 castToPxVec3(const Simd4f& v) + { + return *reinterpret_cast<const PxVec3*>(reinterpret_cast<const char*>(&v)); + } + + int mNumIterations; + float mInvNumIterations; + float mIterDt, mIterDtRatio, mIterDtAverage; + PxQuat mCurrentRotation; + PxVec3 mPrevLinearVelocity; + PxVec3 mPrevAngularVelocity; +}; + +/* solver iterations helper functor */ +template <typename Simd4f> +struct IterationState +{ + // call after each iteration + void update(); + + inline float getCurrentAlpha() const; + inline float getPreviousAlpha() const; + + public: + Simd4f mRotationMatrix[3]; + Simd4f mCurBias; // in local space + Simd4f mPrevBias; // in local space + + Simd4f mPrevMatrix[3]; + Simd4f mCurMatrix[3]; + Simd4f mDampScaleUpdate; + + // iteration counter + uint32_t mRemainingIterations; + + // reciprocal total number of 
iterations + float mInvNumIterations; + + // time step size per iteration + float mIterDt; + + bool mIsTurning; // if false, mPositionScale = mPrevMatrix[0] +}; + +} // namespace cloth + +template <typename Simd4f> +inline float cloth::IterationState<Simd4f>::getCurrentAlpha() const +{ + return getPreviousAlpha() + mInvNumIterations; +} + +template <typename Simd4f> +inline float cloth::IterationState<Simd4f>::getPreviousAlpha() const +{ + return 1.0f - mRemainingIterations * mInvNumIterations; +} + +template <typename MyCloth> +cloth::IterationStateFactory::IterationStateFactory(MyCloth& cloth, float frameDt) +{ + mNumIterations = PxMax(1, int(frameDt * cloth.mSolverFrequency + 0.5f)); + mInvNumIterations = 1.0f / mNumIterations; + mIterDt = frameDt * mInvNumIterations; + + mIterDtRatio = cloth.mPrevIterDt ? mIterDt / cloth.mPrevIterDt : 1.0f; + mIterDtAverage = cloth.mIterDtAvg.empty() ? mIterDt : cloth.mIterDtAvg.average(); + + mCurrentRotation = cloth.mCurrentMotion.q; + mPrevLinearVelocity = cloth.mLinearVelocity; + mPrevAngularVelocity = cloth.mAngularVelocity; + + // update cloth + float invFrameDt = 1.0f / frameDt; + cloth.mLinearVelocity = invFrameDt * (cloth.mTargetMotion.p - cloth.mCurrentMotion.p); + PxQuat dq = cloth.mTargetMotion.q * cloth.mCurrentMotion.q.getConjugate(); + cloth.mAngularVelocity = log(dq) * invFrameDt; + + cloth.mPrevIterDt = mIterDt; + cloth.mIterDtAvg.push((uint32_t)mNumIterations, mIterDt); + cloth.mCurrentMotion = cloth.mTargetMotion; +} + +/* +momentum conservation: +m2*x2 - m1*x1 = m1*x1 - m0*x0 + g*dt2, m = r+t +r2*x2+t2 = 2(r1*x1+t1) - (r0*x0+t0) + g*dt2 +r2*x2 = r1*x1 + r1*x1 - r0*x0 - (t2-2t1+t0) + g*dt2 +substitue r1*x1 - r0*x0 = r1*(x1-x0) + (r1-r0)*x0 +and r1*x1 = r2*x1 - (r2-r1)*x1 + +x2 = x1 + r2'*g*dt2 + + r2'r1*(x1-x0) //< damp + + (r2'r1-r2'r0)*x0 - (1-r2'r1)*x1 - r2'*(t2-2t1+t0) //< inertia + + (1-r2'r1)x1 + t2-t1 //< drag (not momentum conserving) + +x2 = x0 + a0*x0 + a1*x1 + b with +a0 = (inertia-damp)*r2'r1 - 
inertia*r2'r0 - eye +a1 = (1-inertia-drag)*eye + (damp+inertia+drag)*r2'r1 +b = r2'*(g*dt2 - (inertia+drag)*(t2-t1) + inertia*(t1-t0)) + +Velocities are used to deal with multiple iterations and varying dt. Only b needs +to updated from one iteration to the next. Specifically, it is multiplied +by (r2'r1)^1/numIterations. a0 and a1 are unaffected by that multiplication. + +The centrifugal and coriolis forces of non-inertial (turning) reference frame are +not generally captured in these formulas. The 'inertia' term above contains radial +acceleration plus centrifugal and coriolis force for a single iteration. +For multiple iterations, or when the centrifugal forces are scaled differently +than angular inertia, we need to add explicit centrifugal and coriolis forces. +We only use them to correct the above formula because their discretization is +not accurate. + +Possible improvements: multiply coriolis and centrifugal matrix by curInvRotation +from the left. Do the alpha trick of linearInertia also for angularInertia, write +prevParticle after multiplying it with matrix. + +If you change anything in this function, make sure that ClothCustomFloating and +ClothInertia haven't regressed for any choice of solver frequency. 
+*/ + +template <typename Simd4f, typename MyCloth> +cloth::IterationState<Simd4f> cloth::IterationStateFactory::create(MyCloth const& cloth) const +{ + IterationState<Simd4f> result; + + result.mRemainingIterations = (uint32_t)mNumIterations; + result.mInvNumIterations = mInvNumIterations; + result.mIterDt = mIterDt; + + Simd4f curLinearVelocity = load(array(cloth.mLinearVelocity)); + Simd4f prevLinearVelocity = load(array(mPrevLinearVelocity)); + + Simd4f iterDt = simd4f(mIterDt); + Simd4f dampExponent = simd4f(cloth.mStiffnessFrequency) * iterDt; + + // gravity delta per iteration + Simd4f gravity = load(array(cloth.mGravity)) * (Simd4f)simd4f(sqr(mIterDtAverage)); + + // scale of local particle velocity per iteration + Simd4f dampScale = simdf::exp2(load(array(cloth.mLogDamping)) * dampExponent); + // adjust for the change in time step during the first iteration + Simd4f firstDampScale = dampScale * simd4f(mIterDtRatio); + + // portion of negative frame velocity to transfer to particle + Simd4f linearDrag = + (simd4f(_1) - simdf::exp2(load(array(cloth.mLinearLogDrag)) * dampExponent)) * iterDt * curLinearVelocity; + + // portion of frame acceleration to transfer to particle + Simd4f linearInertia = load(array(cloth.mLinearInertia)) * iterDt * (prevLinearVelocity - curLinearVelocity); + + // for inertia, we want to violate newton physics to + // match velocity and position as given by the user, which means: + // vt = v0 + a*t and xt = x0 + v0*t + (!) a*t^2 + // this is achieved by applying a different portion to cur and prev + // position, compared to the normal +0.5 and -0.5 for '... 1/2 a*t^2'. + // specifically, the portion is alpha=(n+1)/2n and 1-alpha. 
+ + float linearAlpha = (mNumIterations + 1) * 0.5f * mInvNumIterations; + Simd4f curLinearInertia = linearInertia * simd4f(linearAlpha); + + // rotate to local space (use mRotationMatrix temporarily to hold matrix) + PxMat44 invRotation(mCurrentRotation.getConjugate()); + assign(result.mRotationMatrix, invRotation); + + Simd4f maskXYZ = simd4f(simd4i(~0, ~0, ~0, 0)); + + // Previously, we split the bias between previous and current position to + // get correct disretized position and velocity. However, this made a + // hanging cloth experience a downward velocity, which is problematic + // when scaled by the iterDt ratio and results in jitter under variable + // timesteps. Instead, we now apply the entire bias to current position + // and accept a less noticeable error for a free falling cloth. + + Simd4f bias = gravity - linearDrag; + result.mCurBias = transform(result.mRotationMatrix, curLinearInertia + bias) & maskXYZ; + result.mPrevBias = transform(result.mRotationMatrix, linearInertia - curLinearInertia) & maskXYZ; + + result.mIsTurning = mPrevAngularVelocity.magnitudeSquared() + cloth.mAngularVelocity.magnitudeSquared() > 0.0f; + + if(result.mIsTurning) + { + Simd4f curAngularVelocity = load(array(invRotation.rotate(cloth.mAngularVelocity))); + Simd4f prevAngularVelocity = load(array(invRotation.rotate(mPrevAngularVelocity))); + + // rotation for one iteration in local space + Simd4f curInvAngle = -iterDt * curAngularVelocity; + Simd4f prevInvAngle = -iterDt * prevAngularVelocity; + + PxQuat curInvRotation = exp(castToPxVec3(curInvAngle)); + PxQuat prevInvRotation = exp(castToPxVec3(prevInvAngle)); + + PxMat44 curMatrix(curInvRotation); + PxMat44 prevMatrix(prevInvRotation * curInvRotation); + + assign(result.mRotationMatrix, curMatrix); + + Simd4f angularDrag = simd4f(_1) - simdf::exp2(load(array(cloth.mAngularLogDrag)) * dampExponent); + Simd4f centrifugalInertia = load(array(cloth.mCentrifugalInertia)); + Simd4f angularInertia = 
load(array(cloth.mAngularInertia)); + Simd4f angularAcceleration = curAngularVelocity - prevAngularVelocity; + + Simd4f epsilon = simd4f(sqrt(FLT_MIN)); // requirement: sqr(epsilon) > 0 + Simd4f velocityLengthSqr = lengthSqr(curAngularVelocity) + epsilon; + Simd4f dragLengthSqr = lengthSqr(Simd4f(curAngularVelocity * angularDrag)) + epsilon; + Simd4f centrifugalLengthSqr = lengthSqr(Simd4f(curAngularVelocity * centrifugalInertia)) + epsilon; + Simd4f accelerationLengthSqr = lengthSqr(angularAcceleration) + epsilon; + Simd4f inertiaLengthSqr = lengthSqr(Simd4f(angularAcceleration * angularInertia)) + epsilon; + + float dragScale = array(rsqrt(velocityLengthSqr * dragLengthSqr) * dragLengthSqr)[0]; + float inertiaScale = + mInvNumIterations * array(rsqrt(accelerationLengthSqr * inertiaLengthSqr) * inertiaLengthSqr)[0]; + + // magic factor found by comparing to global space simulation: + // some centrifugal force is in inertia part, remainder is 2*(n-1)/n + // after scaling the inertia part, we get for centrifugal: + float centrifugalAlpha = (2 * mNumIterations - 1) * mInvNumIterations; + float centrifugalScale = + centrifugalAlpha * array(rsqrt(velocityLengthSqr * centrifugalLengthSqr) * centrifugalLengthSqr)[0] - + inertiaScale; + + // slightly better in ClothCustomFloating than curInvAngle alone + Simd4f centrifugalVelocity = (prevInvAngle + curInvAngle) * simd4f(0.5f); + const Simd4f data = lengthSqr(centrifugalVelocity); + float centrifugalSqrLength = array(data)[0] * centrifugalScale; + + Simd4f coriolisVelocity = centrifugalVelocity * simd4f(centrifugalScale); + PxMat33 coriolisMatrix = physx::shdfnd::star(castToPxVec3(coriolisVelocity)); + + const float* dampScalePtr = array(firstDampScale); + const float* centrifugalPtr = array(centrifugalVelocity); + + for(unsigned int j = 0; j < 3; ++j) + { + float centrifugalJ = -centrifugalPtr[j] * centrifugalScale; + for(unsigned int i = 0; i < 3; ++i) + { + float damping = dampScalePtr[j]; + float coriolis = 
coriolisMatrix(i, j); + float centrifugal = centrifugalPtr[i] * centrifugalJ; + + prevMatrix(i, j) = centrifugal - coriolis + curMatrix(i, j) * (inertiaScale - damping) - + prevMatrix(i, j) * inertiaScale; + curMatrix(i, j) = centrifugal + coriolis + curMatrix(i, j) * (inertiaScale + damping + dragScale); + } + curMatrix(j, j) += centrifugalSqrLength - inertiaScale - dragScale; + prevMatrix(j, j) += centrifugalSqrLength; + } + + assign(result.mPrevMatrix, prevMatrix); + assign(result.mCurMatrix, curMatrix); + } + else + { + Simd4f minusOne = -(Simd4f)simd4f(_1); + result.mRotationMatrix[0] = minusOne; + result.mPrevMatrix[0] = select(maskXYZ, firstDampScale, minusOne); + } + + // difference of damp scale between first and other iterations + result.mDampScaleUpdate = (dampScale - firstDampScale) & maskXYZ; + + return result; +} + +template <typename Simd4f> +void cloth::IterationState<Simd4f>::update() +{ + if(mIsTurning) + { + // only need to turn bias, matrix is unaffected (todo: verify) + mCurBias = transform(mRotationMatrix, mCurBias); + mPrevBias = transform(mRotationMatrix, mPrevBias); + } + + // remove time step ratio in damp scale after first iteration + for(uint32_t i = 0; i < 3; ++i) + { + mPrevMatrix[i] = mPrevMatrix[i] - mRotationMatrix[i] * mDampScaleUpdate; + mCurMatrix[i] = mCurMatrix[i] + mRotationMatrix[i] * mDampScaleUpdate; + } + mDampScaleUpdate = simd4f(_0); // only once + + --mRemainingIterations; +} + +} // namespace nvidia diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/MovingAverage.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/MovingAverage.h new file mode 100644 index 00000000..76eb7f4c --- /dev/null +++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/MovingAverage.h @@ -0,0 +1,129 @@ +/* + * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved. 
 * NVIDIA CORPORATION and its licensors retain all intellectual property
 * and proprietary rights in and to this software, related documentation
 * and any modifications thereto. Any use, reproduction, disclosure or
 * distribution of this software and related documentation without an express
 * license agreement from NVIDIA CORPORATION is strictly prohibited.
 */

// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.

#pragma once

#include "Allocator.h"

namespace nvidia
{
namespace cloth
{

// Moving average over (at most) the last mSize pushed samples.
// Samples are stored run-length encoded: consecutive pushes of the same
// value share a single Element, so long runs of identical values are cheap.
struct MovingAverage
{
    struct Element
    {
        uint32_t mCount; // number of consecutive samples holding mValue
        float mValue;
    };

  public:
    // n is the window size (number of samples averaged over).
    MovingAverage(uint32_t n = 1) : mCount(0), mSize(n)
    {
    }

    // true if no samples have been pushed (or reset() was called)
    bool empty() const
    {
        return mData.empty();
    }

    // window size, not the number of samples currently stored
    uint32_t size() const
    {
        return mSize;
    }

    // change the window size; discards oldest samples if the window shrinks
    void resize(uint32_t n)
    {
        PX_ASSERT(n);
        mSize = n;
        trim();
    }

    // drop all samples
    void reset()
    {
        mData.resize(0);
        mCount = 0;
    }

    // add 'n' consecutive samples of the same 'value'
    void push(uint32_t n, float value)
    {
        n = PxMin(n, mSize);

        // merge with the last run if the value repeats, else start a new run
        if(mData.empty() || mData.back().mValue != value)
        {
            Element element = { n, value };
            mData.pushBack(element);
        }
        else
        {
            mData.back().mCount += n;
        }

        mCount += n;
        trim();
    }

    // weighted average of the stored samples; requires at least one sample
    float average() const
    {
        PX_ASSERT(!mData.empty());

        // plain sum over all runs (count-weighted)
        float sum = 0.0f;
        Vector<Element>::Type::ConstIterator it = mData.begin(), end = mData.end();
        for(; it != end; ++it)
            sum += it->mCount * it->mValue;

        // linear weight ramps at both ends for smoother average
        // n = ramp length (1/8 of the sample count); 'ramp' accumulates the
        // linearly down-weighted contribution of the n oldest and n newest
        // samples, walking run-by-run from both ends simultaneously.
        uint32_t n = mCount / 8;
        float ramp = 0.0f, temp = 0.0f;
        uint32_t countLo = (it = mData.begin())->mCount;
        uint32_t countHi = (--end)->mCount;
        for(uint32_t i = 0; i < n; ++i)
        {
            // advance to the next run once the current one is exhausted
            if(i == countLo)
                countLo += (++it)->mCount;
            if(i == countHi)
                countHi += (--end)->mCount;

            temp += it->mValue + end->mValue;
            ramp += temp;
        }

        // normalize: full weight (n+1) on the bulk minus the ramped portion
        uint32_t num = (mCount - n) * (n + 1);
        return (sum * (n + 1) - ramp) / num;
    }

  private:
    // remove oldest (front) values until mCount<=mSize
    void trim()
    {
        // walk runs from the front, accumulating counts until the remaining
        // tail fits the window; the boundary run's count is clipped in place.
        // note: 'it += k <= mCount' advances the iterator by the bool (0/1),
        // i.e. it stops advancing once the boundary run is reached.
        Vector<Element>::Type::Iterator it = mData.begin();
        for(uint32_t k = mSize; k < mCount; it += k <= mCount)
        {
            k += it->mCount;
            it->mCount = k - mCount;
        }

        // drop fully-consumed front runs
        if(it != mData.begin())
            mData.assign(it, mData.end());

        mCount = PxMin(mCount, mSize);
    }

    Vector<Element>::Type mData; // run-length encoded samples, oldest first

    uint32_t mCount; // total number of samples currently represented
    uint32_t mSize;  // window size
};
}
}
diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/PhaseConfig.cpp b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/PhaseConfig.cpp
new file mode 100644
index 00000000..310c43d6
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/PhaseConfig.cpp
@@ -0,0 +1,60 @@
/*
 * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
 *
 * NVIDIA CORPORATION and its licensors retain all intellectual property
 * and proprietary rights in and to this software, related documentation
 * and any modifications thereto. Any use, reproduction, disclosure or
 * distribution of this software and related documentation without an express
 * license agreement from NVIDIA CORPORATION is strictly prohibited.
 */

// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.

#include "PhaseConfig.h"
#include "ApexUsingNamespace.h"
#include "PsMathUtils.h"

namespace nvidia
{
namespace cloth
{
// forward declaration of the user-to-solver conversion defined below
PhaseConfig transform(const PhaseConfig&);
}
}

using namespace nvidia;

namespace
{
// log2 of x clamped to [0,1]; returns -FLT_MAX_EXP for x <= 0 so that a
// zero stiffness maps to the most negative representable exponent instead
// of producing -inf/NaN from log2(0).
float safeLog2(float x)
{
    float saturated = PxMax(0.0f, PxMin(x, 1.0f));
    return saturated ? physx::shdfnd::log2(saturated) : -FLT_MAX_EXP;
}
}

// default config: full stiffness, no compression/stretch limiting
cloth::PhaseConfig::PhaseConfig(uint16_t index)
: mPhaseIndex(index)
, mPadding(0xffff)
, mStiffness(1.0f)
, mStiffnessMultiplier(1.0f)
, mCompressionLimit(1.0f)
, mStretchLimit(1.0f)
{
}

// convert from user input to solver format
// (stiffness values are stored as log2(1-k) so the per-iteration stiffness
// can be computed with an exp2; limits are stored as 1 - 1/limit)
cloth::PhaseConfig cloth::transform(const PhaseConfig& config)
{
    PhaseConfig result(config.mPhaseIndex);

    result.mStiffness = safeLog2(1.0f - config.mStiffness);
    result.mStiffnessMultiplier = safeLog2(config.mStiffnessMultiplier);

    // negative for compression, positive for stretch
    result.mCompressionLimit = 1.f - 1.f / config.mCompressionLimit;
    result.mStretchLimit = 1.f - 1.f / config.mStretchLimit;

    return result;
}
diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/PointInterpolator.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/PointInterpolator.h
new file mode 100644
index 00000000..fe130156
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/PointInterpolator.h
@@ -0,0 +1,153 @@
/*
 * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
 *
 * NVIDIA CORPORATION and its licensors retain all intellectual property
 * and proprietary rights in and to this software, related documentation
 * and any modifications thereto. Any use, reproduction, disclosure or
 * distribution of this software and related documentation without an express
 * license agreement from NVIDIA CORPORATION is strictly prohibited.
 */

// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#pragma once

#include "Types.h"
#include "Simd4f.h"

namespace nvidia
{

namespace cloth
{

// acts as a poor mans random access iterator
// Yields start + (target - start) * alpha per element, i.e. a linear
// interpolation between two equally-sized point streams.
template <typename Simd4f, typename BaseIterator>
class LerpIterator
{

    LerpIterator& operator=(const LerpIterator&); // not implemented

  public:
    // alpha is fixed at construction and broadcast to all four lanes
    LerpIterator(BaseIterator start, BaseIterator target, float alpha)
    : mAlpha(simd4f(alpha)), mStart(start), mTarget(target)
    {
    }

    // return the interpolated point at a given index
    inline Simd4f operator[](size_t index) const
    {
        return mStart[index] + (mTarget[index] - mStart[index]) * mAlpha;
    }

    inline Simd4f operator*() const
    {
        return (*this)[0];
    }

    // prefix increment only
    inline LerpIterator& operator++()
    {
        ++mStart;
        ++mTarget;
        return *this;
    }

  private:
    // interpolation parameter
    const Simd4f mAlpha;

    BaseIterator mStart;
    BaseIterator mTarget;
};

// Loads 4 floats at a time from a float stream whose elements are Stride
// floats apart; the pointer need not be 16-byte aligned (uses load()).
template <typename Simd4f, size_t Stride>
class UnalignedIterator
{

    UnalignedIterator& operator=(const UnalignedIterator&); // not implemented

  public:
    UnalignedIterator(const float* pointer) : mPointer(pointer)
    {
    }

    inline Simd4f operator[](size_t index) const
    {
        return load(mPointer + index * Stride);
    }

    inline Simd4f operator*() const
    {
        return (*this)[0];
    }

    // prefix increment only
    inline UnalignedIterator& operator++()
    {
        mPointer += Stride;
        return *this;
    }

  private:
    const float* mPointer;
};

// acts as an iterator but returns a constant
template <typename Simd4f>
class ConstantIterator
{
  public:
    ConstantIterator(const Simd4f& value) : mValue(value)
    {
    }

    inline Simd4f operator*() const
    {
        return mValue;
    }

    // increment is a no-op: every position yields the same value
    inline ConstantIterator& operator++()
    {
        return *this;
    }

  private:
    ConstantIterator& operator=(const ConstantIterator&);
    const Simd4f mValue;
};

// wraps an iterator with constant scale and bias
// Yields (*base) * scale + bias for each element of the wrapped iterator.
template <typename Simd4f, typename BaseIterator>
class ScaleBiasIterator
{
  public:
    ScaleBiasIterator(BaseIterator base, const Simd4f& scale, const Simd4f& bias)
    : mScale(scale), mBias(bias), mBaseIterator(base)
    {
    }

    inline Simd4f operator*() const
    {
        return (*mBaseIterator) * mScale + mBias;
    }

    inline ScaleBiasIterator& operator++()
    {
        ++mBaseIterator;
        return *this;
    }

  private:
    ScaleBiasIterator& operator=(const ScaleBiasIterator&);

    const Simd4f mScale;
    const Simd4f mBias;

    BaseIterator mBaseIterator;
};

} // namespace cloth

} // namespace nvidia
diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/Simd4f.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/Simd4f.h
new file mode 100644
index 00000000..8755a010
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/Simd4f.h
@@ -0,0 +1,478 @@
/*
 * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
 *
 * NVIDIA CORPORATION and its licensors retain all intellectual property
 * and proprietary rights in and to this software, related documentation
 * and any modifications thereto. Any use, reproduction, disclosure or
 * distribution of this software and related documentation without an express
 * license agreement from NVIDIA CORPORATION is strictly prohibited.
 */

// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.

#pragma once

#include "SimdTypes.h"

#if NVMATH_FUSE_MULTIPLY_ADD

/*! \brief Expression template to fuse multiply-adds.
+ * \relates Simd4f */ +struct ProductExpr +{ + inline ProductExpr(Simd4f const& v0_, Simd4f const& v1_) : v0(v0_), v1(v1_) + { + } + inline operator Simd4f() const; + const Simd4f v0, v1; + + private: + ProductExpr& operator=(const ProductExpr&); // not implemented +}; + +inline Simd4f operator+(const ProductExpr&, const Simd4f&); +inline Simd4f operator+(const Simd4f& v, const ProductExpr&); +inline Simd4f operator+(const ProductExpr&, const ProductExpr&); +inline Simd4f operator-(const Simd4f& v, const ProductExpr&); +inline Simd4f operator-(const ProductExpr&, const ProductExpr&); + +#else // NVMATH_FUSE_MULTIPLY_ADD +typedef Simd4f ProductExpr; +#endif // NVMATH_FUSE_MULTIPLY_ADD + +template <typename T> +struct Simd4fFactory +{ + Simd4fFactory(T v_) : v(v_) + { + } + inline operator Simd4f() const; + inline operator Scalar4f() const; + Simd4fFactory& operator=(const Simd4fFactory&); // not implemented + T v; +}; + +template <> +struct Simd4fFactory<detail::FourTuple> +{ + Simd4fFactory(float x, float y, float z, float w) + { + v[0] = x, v[1] = y, v[2] = z, v[3] = w; + } + Simd4fFactory(const Simd4fFactory<const float&>& f) + { + v[3] = v[2] = v[1] = v[0] = f.v; + } + inline operator Simd4f() const; + inline operator Scalar4f() const; + Simd4fFactory& operator=(const Simd4fFactory&); // not implemented + PX_ALIGN(16, float) v[4]; +}; + +template <int i> +struct Simd4fFactory<detail::IntType<i> > +{ + inline operator Simd4f() const; + inline operator Scalar4f() const; +}; + +// forward declaration +template <typename> +struct Simd4iFactory; + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// expression template +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +#if NVMATH_SIMD +inline Simd4f operator&(const ComplementExpr<Simd4f>&, const Simd4f&); +inline Simd4f operator&(const Simd4f&, const ComplementExpr<Simd4f>&); +#endif + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// operators +// - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +// note: operator?= missing because they don't have corresponding intrinsics. + +/*! \brief Test for equality of two vectors. +* \return Vector of per element result mask (all bits set for 'true', none set for 'false'). +* \note QNaNs aren't handled on SPU: comparing two QNaNs will return true. +* \relates Simd4f */ +inline Simd4f operator==(const Simd4f& v0, const Simd4f& v1); + +// no operator!= because VMX128 does not support it, use ~operator== and handle QNaNs + +/*! \brief Less-compare all elements of two vectors. +* \return Vector of per element result mask (all bits set for 'true', none set for 'false'). +* \note QNaNs aren't handled on SPU: comparisons against QNaNs don't necessarily return false. +* \relates Simd4f */ +inline Simd4f operator<(const Simd4f& v0, const Simd4f& v1); + +/*! \brief Less-or-equal-compare all elements of two vectors. +* \return Vector of per element result mask (all bits set for 'true', none set for 'false'). +* \note QNaNs aren't handled on SPU: comparisons against QNaNs don't necessarily return false. +* \relates Simd4f */ +inline Simd4f operator<=(const Simd4f& v0, const Simd4f& v1); + +/*! \brief Greater-compare all elements of two vectors. +* \return Vector of per element result mask (all bits set for 'true', none set for 'false'). +* \note QNaNs aren't handled on SPU: comparisons against QNaNs don't necessarily return false. +* \relates Simd4f */ +inline Simd4f operator>(const Simd4f& v0, const Simd4f& v1); + +/*! \brief Greater-or-equal-compare all elements of two vectors. +* \return Vector of per element result mask (all bits set for 'true', none set for 'false'). +* \note QNaNs aren't handled on SPU: comparisons against QNaNs don't necessarily return false. +* \relates Simd4f */ +inline Simd4f operator>=(const Simd4f& v0, const Simd4f& v1); + +/*! \brief Vector bit-wise NOT operator +* \return A vector holding the bit-negate of \a v. 
+* \relates Simd4f */ +inline ComplementExpr<Simd4f> operator~(const Simd4f& v); + +/*! \brief Vector bit-wise AND operator +* \return A vector holding the bit-wise AND of \a v0 and \a v1. +* \relates Simd4f */ +inline Simd4f operator&(const Simd4f& v0, const Simd4f& v1); + +/*! \brief Vector bit-wise OR operator +* \return A vector holding the bit-wise OR of \a v0 and \a v1. +* \relates Simd4f */ +inline Simd4f operator|(const Simd4f& v0, const Simd4f& v1); + +/*! \brief Vector bit-wise XOR operator +* \return A vector holding the bit-wise XOR of \a v0 and \a v1. +* \relates Simd4f */ +inline Simd4f operator^(const Simd4f& v0, const Simd4f& v1); + +/*! \brief Vector logical left shift. +* \return A vector with 4 elements of \a v0, each shifted left by \a shift bits. +* \relates Simd4f */ +inline Simd4f operator<<(const Simd4f& v, int shift); + +/*! \brief Vector logical right shift. +* \return A vector with 4 elements of \a v0, each shifted right by \a shift bits. +* \relates Simd4f */ +inline Simd4f operator>>(const Simd4f& v, int shift); + +#if NVMATH_SHIFT_BY_VECTOR +/*! \brief Vector logical left shift. +* \return A vector with 4 elements of \a v0, each shifted left by \a shift bits. +* \relates Simd4f */ +inline Simd4f operator<<(const Simd4f& v, const Simd4f& shift); + +/*! \brief Vector logical right shift. +* \return A vector with 4 elements of \a v0, each shifted right by \a shift bits. +* \relates Simd4f */ +inline Simd4f operator>>(const Simd4f& v, const Simd4f& shift); +#endif + +/*! \brief Unary vector addition operator. +* \return A vector holding the component-wise copy of \a v. +* \relates Simd4f */ +inline Simd4f operator+(const Simd4f& v); + +/*! \brief Vector addition operator +* \return A vector holding the component-wise sum of \a v0 and \a v1. +* \relates Simd4f */ +inline Simd4f operator+(const Simd4f& v0, const Simd4f& v1); + +/*! \brief Unary vector negation operator. +* \return A vector holding the component-wise negation of \a v. 
+* \relates Simd4f */ +inline Simd4f operator-(const Simd4f& v); + +/*! \brief Vector subtraction operator. +* \return A vector holding the component-wise difference of \a v0 and \a v1. +* \relates Simd4f */ +inline Simd4f operator-(const Simd4f& v0, const Simd4f& v1); + +/*! \brief Vector multiplication. +* \return Element-wise product of \a v0 and \a v1. +* \note For VMX, returns expression template to fuse multiply-add. +* \relates Simd4f */ +inline ProductExpr operator*(const Simd4f& v0, const Simd4f& v1); + +/*! \brief Vector division. +* \return Element-wise division of \a v0 and \a v1. +* \relates Simd4f */ +inline Simd4f operator/(const Simd4f& v0, const Simd4f& v1); + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// functions +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +/*! \brief Load float value into all vector components. +* \relates Simd4f */ +inline Simd4fFactory<const float&> simd4f(const float& s) +{ + return Simd4fFactory<const float&>(s); +} + +/*! \brief Load 4 float values into vector. +* \relates Simd4f */ +inline Simd4fFactory<detail::FourTuple> simd4f(float x, float y, float z, float w) +{ + return Simd4fFactory<detail::FourTuple>(x, y, z, w); +} + +/*! \brief Create vector from literal. +* \return Vector with all elements set to i. +* \relates Simd4f */ +template <int i> +inline Simd4fFactory<detail::IntType<i> > simd4f(detail::IntType<i> const&) +{ + return Simd4fFactory<detail::IntType<i> >(); +} + +/*! \brief Reinterpret Simd4i as Simd4f. +* \return A copy of \a v, but cast as Simd4f. +* \relates Simd4f */ +inline Simd4f simd4f(const Simd4i& v); + +/*! \brief Reinterpret Simd4iFactory as Simd4fFactory. +* \relates Simd4f */ +template <typename T> +inline Simd4fFactory<T> simd4f(const Simd4iFactory<T>& v) +{ + return reinterpret_cast<const Simd4fFactory<T>&>(v); +} + +/*! 
\brief return reference to contiguous array of vector elements +* \relates Simd4f */ +inline float (&array(Simd4f& v))[4]; + +/*! \brief return constant reference to contiguous array of vector elements +* \relates Simd4f */ +inline const float (&array(const Simd4f& v))[4]; + +/*! \brief Create vector from float array. +* \relates Simd4f */ +inline Simd4fFactory<const float*> load(const float* ptr) +{ + return ptr; +} + +/*! \brief Create vector from aligned float array. +* \note \a ptr needs to be 16 byte aligned. +* \relates Simd4f */ +inline Simd4fFactory<detail::AlignedPointer<float> > loadAligned(const float* ptr) +{ + return detail::AlignedPointer<float>(ptr); +} + +/*! \brief Create vector from aligned float array. +* \param offset pointer offset in bytes. +* \note \a ptr+offset needs to be 16 byte aligned. +* \relates Simd4f */ +inline Simd4fFactory<detail::OffsetPointer<float> > loadAligned(const float* ptr, unsigned int offset) +{ + return detail::OffsetPointer<float>(ptr, offset); +} + +/*! \brief Store vector \a v to float array \a ptr. +* \relates Simd4f */ +inline void store(float* ptr, Simd4f const& v); + +/*! \brief Store vector \a v to aligned float array \a ptr. +* \note \a ptr needs to be 16 byte aligned. +* \relates Simd4f */ +inline void storeAligned(float* ptr, Simd4f const& v); + +/*! \brief Store vector \a v to aligned float array \a ptr. +* \param offset pointer offset in bytes. +* \note \a ptr+offset needs to be 16 byte aligned. +* \relates Simd4f */ +inline void storeAligned(float* ptr, unsigned int offset, Simd4f const& v); + +/*! \brief replicate i-th component into all vector components. +* \return Vector with all elements set to \a v[i]. +* \relates Simd4f */ +template <size_t i> +inline Simd4f splat(Simd4f const& v); + +/*! \brief Select \a v0 or \a v1 based on \a mask. +* \return mask ? v0 : v1 +* \relates Simd4f */ +inline Simd4f select(Simd4f const& mask, Simd4f const& v0, Simd4f const& v1); + +/*! 
\brief Per element absolute value. +* \return Vector with absolute values of \a v. +* \relates Simd4f */ +inline Simd4f abs(const Simd4f& v); + +/*! \brief Per element floor value. +* \note Result undefined for QNaN elements. +* \note On SSE and NEON, returns v-1 if v is negative integer value +* \relates Simd4f */ +inline Simd4f floor(const Simd4f& v); + +/*! \brief Per-component maximum of two vectors +* \note Result undefined for QNaN elements. +* \relates Simd4f */ +inline Simd4f max(const Simd4f& v0, const Simd4f& v1); + +/*! \brief Per-component minimum of two vectors +* \note Result undefined for QNaN elements. +* \relates Simd4f */ +inline Simd4f min(const Simd4f& v0, const Simd4f& v1); + +/*! \brief Return reciprocal estimate of a vector. +* \return Vector of per-element reciprocal estimate. +* \relates Simd4f */ +inline Simd4f recip(const Simd4f& v); + +/*! \brief Return reciprocal of a vector. +* \return Vector of per-element reciprocal. +* \note Performs \a n Newton-Raphson iterations on initial estimate. +* \relates Simd4f */ +template <int n> +inline Simd4f recipT(const Simd4f& v); + +/*! \brief Return square root of a vector. +* \return Vector of per-element square root. +* \note The behavior is undefined for negative elements. +* \relates Simd4f */ +inline Simd4f sqrt(const Simd4f& v); + +/*! \brief Return inverse square root estimate of a vector. +* \return Vector of per-element inverse square root estimate. +* \note The behavior is undefined for negative, zero, and infinity elements. +* \relates Simd4f */ +inline Simd4f rsqrt(const Simd4f& v); + +/*! \brief Return inverse square root of a vector. +* \return Vector of per-element inverse square root. +* \note Performs \a n Newton-Raphson iterations on initial estimate. +* \note The behavior is undefined for negative and infinity elements. +* \relates Simd4f */ +template <int n> +inline Simd4f rsqrtT(const Simd4f& v); + +/*! \brief Return 2 raised to the power of v. 
+* \note Result undefined for QNaN elements. +* \relates Simd4f */ +inline Simd4f exp2(const Simd4f& v); + +#if NVMATH_SIMD +namespace simdf +{ +// PSP2 is confused resolving about exp2, forwarding works +inline Simd4f exp2(const Simd4f& v) +{ + return ::exp2(v); +} +} +#endif + +/*! \brief Return logarithm of v to base 2. +* \note Result undefined for QNaN elements. +* \relates Simd4f */ +inline Simd4f log2(const Simd4f& v); + +/*! \brief Return dot product of two 3-vectors. +* \note The result is replicated across all 4 components. +* \relates Simd4f */ +inline Simd4f dot3(const Simd4f& v0, const Simd4f& v1); + +/*! \brief Return cross product of two 3-vectors. +* \note The 4th component is undefined. +* \relates Simd4f */ +inline Simd4f cross3(const Simd4f& v0, const Simd4f& v1); + +/*! \brief Transposes 4x4 matrix represented by \a x, \a y, \a z, and \a w. +* \relates Simd4f */ +inline void transpose(Simd4f& x, Simd4f& y, Simd4f& z, Simd4f& w); + +/*! \brief returns non-zero if all elements or \a v0 and \a v1 are equal +* \note QNaPs aren't handled on SPU: comparing two QNaPs will return true. +* \relates Simd4f */ +inline int allEqual(const Simd4f& v0, const Simd4f& v1); + +/*! \brief returns non-zero if all elements or \a v0 and \a v1 are equal +* \param outMask holds the result of \a v0 == \a v1. +* \note QNaPs aren't handled on SPU: comparing two QNaPs will return true. +* \relates Simd4f */ +inline int allEqual(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask); + +/*! \brief returns non-zero if any elements or \a v0 and \a v1 are equal +* \note QNaPs aren't handled on SPU: comparing two QNaPs will return true. +* \relates Simd4f */ +inline int anyEqual(const Simd4f& v0, const Simd4f& v1); + +/*! \brief returns non-zero if any elements or \a v0 and \a v1 are equal +* \param outMask holds the result of \a v0 == \a v1. +* \note QNaPs aren't handled on SPU: comparing two QNaPs will return true. 
+* \relates Simd4f */ +inline int anyEqual(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask); + +/*! \brief returns non-zero if all elements of \a v0 and \a v1 are greater +* \note QNaNs aren't handled on SPU: comparisons against QNaNs don't necessarily return false. +* \relates Simd4f */ +inline int allGreater(const Simd4f& v0, const Simd4f& v1); + +/*! \brief returns non-zero if all elements of \a v0 and \a v1 are greater +* \param outMask holds the result of \a v0 > \a v1. +* \note QNaNs aren't handled on SPU: comparisons against QNaNs don't necessarily return false. +* \relates Simd4f */ +inline int allGreater(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask); + +/*! \brief returns non-zero if any elements of \a v0 and \a v1 are greater +* \note QNaNs aren't handled on SPU: comparisons against QNaNs don't necessarily return false. +* \relates Simd4f */ +inline int anyGreater(const Simd4f& v0, const Simd4f& v1); + +/*! \brief returns non-zero if any elements of \a v0 and \a v1 are greater +* \param outMask holds the result of \a v0 > \a v1. +* \note QNaNs aren't handled on SPU: comparisons against QNaNs don't necessarily return false. +* \relates Simd4f */ +inline int anyGreater(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask); + +/*! \brief returns non-zero if all elements of \a v0 and \a v1 are greater or equal +* \note QNaNs aren't handled on SPU: comparisons against QNaNs don't necessarily return false. +* \relates Simd4f */ +inline int allGreaterEqual(const Simd4f& v0, const Simd4f& v1); + +/*! \brief returns non-zero if all elements of \a v0 and \a v1 are greater or equal +* \param outMask holds the result of \a v0 >= \a v1. +* \note QNaNs aren't handled on SPU: comparisons against QNaNs don't necessarily return false. +* \relates Simd4f */ +inline int allGreaterEqual(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask); + +/*! 
\brief returns non-zero if any elements or \a v0 and \a v1 are greater or equal +* \note QNaPs aren't handled on SPU: comparisons against QNaPs don't necessarily return false. +* \relates Simd4f */ +inline int anyGreaterEqual(const Simd4f& v0, const Simd4f& v1); + +/*! \brief returns non-zero if any elements or \a v0 and \a v1 are greater or equal +* \param outMask holds the result of \a v0 == \a v1. +* \note QNaPs aren't handled on SPU: comparisons against QNaPs don't necessarily return false. +* \relates Simd4f */ +inline int anyGreaterEqual(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask); + +/*! \brief returns non-zero if all elements are true +* \note Undefined if parameter is not result of a comparison. +* \relates Simd4f */ +inline int allTrue(const Simd4f& v); + +/*! \brief returns non-zero if any element is true +* \note Undefined if parameter is not result of a comparison. +* \relates Simd4f */ +inline int anyTrue(const Simd4f& v); + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// platform specific includes +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +#if NVMATH_SSE2 +#include "sse2/Simd4f.h" +#elif NVMATH_NEON +#include "neon/Simd4f.h" +#endif + +#if NVMATH_SCALAR +#include "scalar/Simd4f.h" +#endif diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/Simd4i.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/Simd4i.h new file mode 100644 index 00000000..d237e1fa --- /dev/null +++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/Simd4i.h @@ -0,0 +1,360 @@ +/* + * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. 
Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. + */ + +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#pragma once + +#include "SimdTypes.h" + +template <typename T> +struct Simd4iFactory +{ + Simd4iFactory(T v_) : v(v_) + { + } + inline operator Simd4i() const; + inline operator Scalar4i() const; + Simd4iFactory& operator=(const Simd4iFactory&); // not implemented + T v; +}; + +template <> +struct Simd4iFactory<detail::FourTuple> +{ + Simd4iFactory(int x, int y, int z, int w) + { + v[0] = x, v[1] = y, v[2] = z, v[3] = w; + } + Simd4iFactory(const Simd4iFactory<const int&>& f) + { + v[3] = v[2] = v[1] = v[0] = f.v; + } + inline operator Simd4i() const; + inline operator Scalar4i() const; + Simd4iFactory& operator=(const Simd4iFactory&); // not implemented + PX_ALIGN(16, int) v[4]; +}; + +template <int i> +struct Simd4iFactory<detail::IntType<i> > +{ + inline operator Simd4i() const; + inline operator Scalar4i() const; +}; + +// forward declaration +template <typename> +struct Simd4fFactory; + +// map Simd4f/Scalar4f to Simd4i/Scalar4i +template <typename> +struct Simd4fToSimd4i; +template <> +struct Simd4fToSimd4i<Simd4f> +{ + typedef Simd4i Type; +}; +template <> +struct Simd4fToSimd4i<Scalar4f> +{ + typedef Scalar4i Type; +}; + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// expression template +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +#if NVMATH_DISTINCT_TYPES +inline Simd4i operator&(const ComplementExpr<Simd4i>&, const Simd4i&); +inline Simd4i operator&(const Simd4i&, const ComplementExpr<Simd4i>&); +#endif + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// operators +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +#if 
NVMATH_DISTINCT_TYPES + +/*! \brief Vector bit-wise NOT operator +* \return A vector holding the bit-negate of \a v. +* \relates Simd4i */ +inline ComplementExpr<Simd4i> operator~(const Simd4i& v); + +/*! \brief Vector bit-wise AND operator +* \return A vector holding the bit-wise AND of \a v0 and \a v1. +* \relates Simd4i */ +inline Simd4i operator&(const Simd4i& v0, const Simd4i& v1); + +/*! \brief Vector bit-wise OR operator +* \return A vector holding the bit-wise OR of \a v0 and \a v1. +* \relates Simd4i */ +inline Simd4i operator|(const Simd4i& v0, const Simd4i& v1); + +/*! \brief Vector bit-wise XOR operator +* \return A vector holding the bit-wise XOR of \a v0 and \a v1. +* \relates Simd4i */ +inline Simd4i operator^(const Simd4i& v0, const Simd4i& v1); + +/*! \brief Vector logical left shift. +* \return A vector with 4 elements of \a v0, each shifted left by \a shift bits. +* \relates Simd4i */ +inline Simd4i operator<<(const Simd4i& v, int shift); + +/*! \brief Vector logical right shift. +* \return A vector with 4 elements of \a v0, each shifted right by \a shift bits. +* \relates Simd4i */ +inline Simd4i operator>>(const Simd4i& v, int shift); + +#if NVMATH_SHIFT_BY_VECTOR + +/*! \brief Vector logical left shift. +* \return A vector with 4 elements of \a v0, each shifted left by \a shift bits. +* \relates Simd4i */ +inline Simd4i operator<<(const Simd4i& v, const Simd4i& shift); + +/*! \brief Vector logical right shift. +* \return A vector with 4 elements of \a v0, each shifted right by \a shift bits. +* \relates Simd4i */ +inline Simd4i operator>>(const Simd4i& v, const Simd4i& shift); + +#endif // NVMATH_SHIFT_BY_VECTOR + +#endif // NVMATH_DISTINCT_TYPES + +namespace simdi // disambiguate for VMX +{ +// note: operator?= missing because they don't have corresponding intrinsics. + +/*! \brief Test for equality of two vectors. +* \return Vector of per element result mask (all bits set for 'true', none set for 'false'). 
+* \relates Simd4i */ +inline Simd4i operator==(const Simd4i& v0, const Simd4i& v1); + +// no !=, <=, >= because VMX128/SSE don't support it, use ~equal etc. + +/*! \brief Less-compare all elements of two *signed* vectors. +* \return Vector of per element result mask (all bits set for 'true', none set for 'false'). +* \relates Simd4i */ +inline Simd4i operator<(const Simd4i& v0, const Simd4i& v1); + +/*! \brief Greater-compare all elements of two *signed* vectors. +* \return Vector of per element result mask (all bits set for 'true', none set for 'false'). +* \relates Simd4i */ +inline Simd4i operator>(const Simd4i& v0, const Simd4i& v1); + +/*! \brief Vector addition operator +* \return A vector holding the component-wise sum of \a v0 and \a v1. +* \relates Simd4i */ +inline Simd4i operator+(const Simd4i& v0, const Simd4i& v1); + +/*! \brief Unary vector negation operator. +* \return A vector holding the component-wise negation of \a v. +* \relates Simd4i */ +inline Simd4i operator-(const Simd4i& v); + +/*! \brief Vector subtraction operator. +* \return A vector holding the component-wise difference of \a v0 and \a v1. +* \relates Simd4i */ +inline Simd4i operator-(const Simd4i& v0, const Simd4i& v1); +} + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// functions +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +/*! \brief Load int value into all vector components. +* \relates Simd4i */ +inline Simd4iFactory<const int&> simd4i(const int& s) +{ + return Simd4iFactory<const int&>(s); +} + +/*! \brief Load 4 int values into vector. +* \relates Simd4i */ +inline Simd4iFactory<detail::FourTuple> simd4i(int x, int y, int z, int w) +{ + return Simd4iFactory<detail::FourTuple>(x, y, z, w); +} + +/*! \brief Create vector from literal. +* \return Vector with all elements set to \c i. 
+* \relates Simd4i */ +template <int i> +inline Simd4iFactory<detail::IntType<i> > simd4i(const detail::IntType<i>&) +{ + return Simd4iFactory<detail::IntType<i> >(); +} + +template <> +inline Simd4iFactory<detail::IntType<1> > simd4i(const detail::IntType<1>&) +{ + return Simd4iFactory<detail::IntType<1> >(); +} + +template <> +inline Simd4iFactory<detail::IntType<int(0x80000000)> > simd4i(const detail::IntType<int(0x80000000)>&) +{ + return Simd4iFactory<detail::IntType<int(0x80000000)> >(); +} + +template <> +inline Simd4iFactory<detail::IntType<-1> > simd4i(const detail::IntType<-1>&) +{ + return Simd4iFactory<detail::IntType<-1> >(); +} + +/*! \brief Reinterpret Simd4f as Simd4i. +* \return A copy of \a v, but cast as Simd4i. +* \relates Simd4i */ +inline Simd4i simd4i(const Simd4f& v); + +/*! \brief Reinterpret Simd4fFactory as Simd4iFactory. +* \relates Simd4i */ +template <typename T> +inline Simd4iFactory<T> simd4i(const Simd4fFactory<T>& v) +{ + return reinterpret_cast<const Simd4iFactory<T>&>(v); +} + +namespace simdi +{ + +/*! \brief return reference to contiguous array of vector elements +* \relates Simd4i */ +inline int (&array(Simd4i& v))[4]; + +/*! \brief return constant reference to contiguous array of vector elements +* \relates Simd4i */ +inline const int (&array(const Simd4i& v))[4]; + +} // namespace simdi + +/*! \brief Create vector from int array. +* \relates Simd4i */ +inline Simd4iFactory<const int*> load(const int* ptr) +{ + return ptr; +} + +/*! \brief Create vector from aligned int array. +* \note \a ptr needs to be 16 byte aligned. +* \relates Simd4i */ +inline Simd4iFactory<detail::AlignedPointer<int> > loadAligned(const int* ptr) +{ + return detail::AlignedPointer<int>(ptr); +} + +/*! \brief Create vector from aligned float array. +* \param offset pointer offset in bytes. +* \note \a ptr+offset needs to be 16 byte aligned. 
+* \relates Simd4i */ +inline Simd4iFactory<detail::OffsetPointer<int> > loadAligned(const int* ptr, unsigned int offset) +{ + return detail::OffsetPointer<int>(ptr, offset); +} + +/*! \brief Store vector \a v to int array \a ptr. +* \relates Simd4i */ +inline void store(int* ptr, const Simd4i& v); + +/*! \brief Store vector \a v to aligned int array \a ptr. +* \note \a ptr needs to be 16 byte aligned. +* \relates Simd4i */ +inline void storeAligned(int* ptr, const Simd4i& v); + +/*! \brief Store vector \a v to aligned int array \a ptr. +* \param offset pointer offset in bytes. +* \note \a ptr+offset needs to be 16 byte aligned. +* \relates Simd4i */ +inline void storeAligned(int* ptr, unsigned int offset, const Simd4i& v); + +#if NVMATH_DISTINCT_TYPES + +/*! \brief replicate i-th component into all vector components. +* \return Vector with all elements set to \a v[i]. +* \relates Simd4i */ +template <size_t i> +inline Simd4i splat(const Simd4i& v); + +/*! \brief Select \a v0 or \a v1 based on \a mask. +* \return mask ? v0 : v1 +* \relates Simd4i */ +inline Simd4i select(const Simd4i& mask, const Simd4i& v0, const Simd4i& v1); + +#endif // NVMATH_DISTINCT_TYPES + +namespace simdi // disambiguate for VMX +{ +/*! \brief returns non-zero if all elements or \a v0 and \a v1 are equal +* \relates Simd4i */ +inline int allEqual(const Simd4i& v0, const Simd4i& v1); + +/*! \brief returns non-zero if all elements or \a v0 and \a v1 are equal +* \param outMask holds the result of \a v0 == \a v1. +* \relates Simd4i */ +inline int allEqual(const Simd4i& v0, const Simd4i& v1, Simd4i& outMask); + +/*! \brief returns non-zero if any elements or \a v0 and \a v1 are equal +* \relates Simd4i */ +inline int anyEqual(const Simd4i& v0, const Simd4i& v1); + +/*! \brief returns non-zero if any elements or \a v0 and \a v1 are equal +* \param outMask holds the result of \a v0 == \a v1. +* \relates Simd4i */ +inline int anyEqual(const Simd4i& v0, const Simd4i& v1, Simd4i& outMask); + +/*! 
\brief returns non-zero if all *signed* elements or \a v0 and \a v1 are greater +* \relates Simd4i */ +inline int allGreater(const Simd4i& v0, const Simd4i& v1); + +/*! \brief returns non-zero if all *signed* elements or \a v0 and \a v1 are greater +* \param outMask holds the result of \a v0 == \a v1. +* \relates Simd4i */ +inline int allGreater(const Simd4i& v0, const Simd4i& v1, Simd4i& outMask); + +/*! \brief returns non-zero if any elements or \a v0 and \a v1 are greater +* \relates Simd4i */ +inline int anyGreater(const Simd4i& v0, const Simd4i& v1); + +/*! \brief returns non-zero if any elements or \a v0 and \a v1 are greater +* \param outMask holds the result of \a v0 == \a v1. +* \relates Simd4i */ +inline int anyGreater(const Simd4i& v0, const Simd4i& v1, Simd4i& outMask); +} + +#if NVMATH_DISTINCT_TYPES + +/*! \brief returns non-zero if all elements are true +* \note undefined if parameter is not result of a comparison. +* \relates Simd4i */ +inline int allTrue(const Simd4i& v); + +/*! \brief returns non-zero if any element is true +* \note undefined if parameter is not result of a comparison. +* \relates Simd4i */ +inline int anyTrue(const Simd4i& v); + +#endif // NVMATH_DISTINCT_TYPES + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// platform specific includes +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +#if NVMATH_SSE2 +#include "sse2/Simd4i.h" +#elif NVMATH_NEON +#include "neon/Simd4i.h" +#endif + +#if NVMATH_SCALAR +#include "scalar/Simd4i.h" +#endif diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SimdTypes.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SimdTypes.h new file mode 100644 index 00000000..e44e876a --- /dev/null +++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SimdTypes.h @@ -0,0 +1,150 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. 
+// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2015 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 
+ +#pragma once + +#include <cmath> + +// ps4 compiler defines _M_X64 without value +#if((defined _M_IX86) || (defined _M_X64) || (defined __i386__) || (defined __x86_64__)) +#define NVMATH_SSE2 1 +#else +#define NVMATH_SSE2 0 +#endif +#define NVMATH_NEON (defined _M_ARM || defined __ARM_NEON__) + +// which simd types are implemented (one or both are all valid options) +#define NVMATH_SIMD (NVMATH_SSE2 || NVMATH_NEON) +#define NVMATH_SCALAR !NVMATH_SIMD +// #define NVMATH_SCALAR 1 + +// use template expression to fuse multiply-adds into a single instruction +#define NVMATH_FUSE_MULTIPLY_ADD (NVMATH_NEON) +// support shift by vector operarations +#define NVMATH_SHIFT_BY_VECTOR (NVMATH_NEON) +// Simd4f and Simd4i map to different types +#define NVMATH_DISTINCT_TYPES (NVMATH_SSE2 || NVMATH_NEON) +// support inline assembler +#define NVMATH_INLINE_ASSEMBLER !((defined _M_ARM) || (defined SN_TARGET_PSP2) || (defined __arm64__)) + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// expression template +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +/*! \brief Expression template to fuse and-not. 
*/ +template <typename T> +struct ComplementExpr +{ + inline ComplementExpr(T const& v_) : v(v_) + { + } + inline operator T() const; + const T v; + + private: + ComplementExpr& operator=(const ComplementExpr&); // not implemented +}; + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// helper functions +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +template <typename T> +T sqr(const T& x) +{ + return x * x; +} + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// details +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +namespace detail +{ +template <typename T> +struct AlignedPointer +{ + AlignedPointer(const T* p) : ptr(p) + { + } + const T* ptr; +}; + +template <typename T> +struct OffsetPointer +{ + OffsetPointer(const T* p, unsigned int off) : ptr(p), offset(off) + { + } + const T* ptr; + unsigned int offset; +}; + +struct FourTuple +{ +}; + +// zero and one literals +template <int i> +struct IntType +{ +}; +} + +// Supress warnings +#if defined(__GNUC__) || defined(__SNC__) +#define NVMATH_UNUSED __attribute__((unused)) +#else +#define NVMATH_UNUSED +#endif + +static detail::IntType<0> _0 NVMATH_UNUSED; +static detail::IntType<1> _1 NVMATH_UNUSED; +static detail::IntType<int(0x80000000)> _sign NVMATH_UNUSED; +static detail::IntType<-1> _true NVMATH_UNUSED; + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// platform specific includes +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +#if NVMATH_SSE2 +#include "sse2/SimdTypes.h" +#elif NVMATH_NEON +#include "neon/SimdTypes.h" +#else +struct Simd4f; +struct Simd4i; +#endif + +#if NVMATH_SCALAR +#include "scalar/SimdTypes.h" +#else +struct Scalar4f; +struct Scalar4i; +#endif diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/StackAllocator.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/StackAllocator.h new file mode 100644 index 00000000..f8c6b2dc --- 
/dev/null +++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/StackAllocator.h @@ -0,0 +1,139 @@ +/* + * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. + */ + +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#pragma once + +#include <PxAssert.h> + +#if PX_LINUX_FAMILY +#include <stdint.h> // intptr_t +#endif + +template <size_t align> +class StackAllocator +{ + typedef unsigned char byte; + + // todo: switch to offsets so size is consistent on x64 + // mSize is just for book keeping so could be 4 bytes + struct Header + { + Header* mPrev; + size_t mSize : 31; + size_t mFree : 1; + }; + + StackAllocator(const StackAllocator&); + StackAllocator& operator=(const StackAllocator&); + + public: + StackAllocator(void* buffer, size_t bufferSize) + : mBuffer(reinterpret_cast<byte*>(buffer)), mBufferSize(bufferSize), mFreeStart(mBuffer), mTop(0) + { + } + + ~StackAllocator() + { + PX_ASSERT(userBytes() == 0); + } + + void* allocate(size_t numBytes) + { + // this is non-standard + if(!numBytes) + return 0; + + uintptr_t unalignedStart = uintptr_t(mFreeStart) + sizeof(Header); + + byte* allocStart = reinterpret_cast<byte*>((unalignedStart + (align - 1)) & ~(align - 1)); + byte* allocEnd = allocStart + numBytes; + + // ensure there is space for the alloc + PX_ASSERT(allocEnd <= mBuffer + mBufferSize); + + Header* h = getHeader(allocStart); + h->mPrev = mTop; + h->mSize = numBytes; + h->mFree = false; + + mTop = h; + mFreeStart = allocEnd; + + return allocStart; + } + + void deallocate(void* p) + { + 
if(!p) + return; + + Header* h = getHeader(p); + h->mFree = true; + + // unwind the stack to the next live alloc + while(mTop && mTop->mFree) + { + mFreeStart = reinterpret_cast<byte*>(mTop); + mTop = mTop->mPrev; + } + } + + private: + // return the header for an allocation + inline Header* getHeader(void* p) const + { + PX_ASSERT((reinterpret_cast<uintptr_t>(p) & (align - 1)) == 0); + PX_ASSERT(reinterpret_cast<byte*>(p) >= mBuffer + sizeof(Header)); + PX_ASSERT(reinterpret_cast<byte*>(p) < mBuffer + mBufferSize); + + return reinterpret_cast<Header*>(p) - 1; + } + + public: + // total user-allocated bytes not including any overhead + size_t userBytes() const + { + size_t total = 0; + Header* iter = mTop; + while(iter) + { + total += iter->mSize; + iter = iter->mPrev; + } + + return total; + } + + // total user-allocated bytes + overhead + size_t totalUsedBytes() const + { + return mFreeStart - mBuffer; + } + + size_t remainingBytes() const + { + return mBufferSize - totalUsedBytes(); + } + + size_t wastedBytes() const + { + return totalUsedBytes() - userBytes(); + } + + private: + byte* const mBuffer; + const size_t mBufferSize; + + byte* mFreeStart; // start of free space + Header* mTop; // top allocation header +}; diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwCloth.cpp b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwCloth.cpp new file mode 100644 index 00000000..2283a319 --- /dev/null +++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwCloth.cpp @@ -0,0 +1,307 @@ +/* + * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. 
+ */ + +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#include "SwCloth.h" +#include "SwFabric.h" +#include "SwFactory.h" +#include "TripletScheduler.h" +#include "ClothBase.h" + +namespace nvidia +{ +namespace cloth +{ +PhaseConfig transform(const PhaseConfig&); // from PhaseConfig.cpp +} +} + +using namespace nvidia; +using namespace physx::shdfnd; +using namespace nvidia; + +cloth::SwCloth::SwCloth(SwFactory& factory, SwFabric& fabric, Range<const PxVec4> particles) +: mFactory(factory) +, mFabric(fabric) +, mNumVirtualParticles(0) +#if APEX_UE4 +, mSimulationTask(NULL) +#endif +, mUserData(0) +{ + PX_ASSERT(!particles.empty()); + + initialize(*this, particles.begin(), particles.end()); + +#if PX_WINDOWS_FAMILY + const uint32_t kSimdWidth = 8; // avx +#else + const uint32_t kSimdWidth = 4; // sse +#endif + + mCurParticles.reserve(particles.size() + kSimdWidth - 1); + mCurParticles.assign(reinterpret_cast<const PxVec4*>(particles.begin()), + reinterpret_cast<const PxVec4*>(particles.end())); + + // 7 dummy particles used in SIMD solver + mCurParticles.resize(particles.size() + kSimdWidth - 1, PxVec4(0.0f)); + mPrevParticles = mCurParticles; + + mCurParticles.resize(particles.size()); + mPrevParticles.resize(particles.size()); + + mFabric.incRefCount(); +} + +namespace +{ +// copy vector and make same capacity +void copyVector(cloth::Vec4fAlignedVector& dst, const cloth::Vec4fAlignedVector& src) +{ + dst.reserve(src.capacity()); + dst.assign(src.begin(), src.end()); + + // ensure valid dummy data + dst.resize(src.capacity(), PxVec4(0.0f)); + dst.resize(src.size()); +} +} + +// copy constructor, supports rebinding to a different factory +cloth::SwCloth::SwCloth(SwFactory& factory, const SwCloth& cloth) +: mFactory(factory) +, mFabric(cloth.mFabric) +, mClothCostDirty(true) +, mPhaseConfigs(cloth.mPhaseConfigs) +, mCapsuleIndices(cloth.mCapsuleIndices) +, 
mStartCollisionSpheres(cloth.mStartCollisionSpheres) +, mTargetCollisionSpheres(cloth.mTargetCollisionSpheres) +, mStartCollisionPlanes(cloth.mStartCollisionPlanes) +, mTargetCollisionPlanes(cloth.mTargetCollisionPlanes) +, mStartCollisionTriangles(cloth.mStartCollisionTriangles) +, mTargetCollisionTriangles(cloth.mTargetCollisionTriangles) +, mVirtualParticleIndices(cloth.mVirtualParticleIndices) +, mVirtualParticleWeights(cloth.mVirtualParticleWeights) +, mNumVirtualParticles(cloth.mNumVirtualParticles) +, mSelfCollisionIndices(cloth.mSelfCollisionIndices) +, mRestPositions(cloth.mRestPositions) +#if APEX_UE4 +, mSimulationTask(NULL) +#endif +{ + copy(*this, cloth); + + // carry over capacity (using as dummy particles) + copyVector(mCurParticles, cloth.mCurParticles); + copyVector(mPrevParticles, cloth.mPrevParticles); + copyVector(mMotionConstraints.mStart, cloth.mMotionConstraints.mStart); + copyVector(mMotionConstraints.mTarget, cloth.mMotionConstraints.mTarget); + copyVector(mSeparationConstraints.mStart, cloth.mSeparationConstraints.mStart); + copyVector(mSeparationConstraints.mTarget, cloth.mSeparationConstraints.mTarget); + copyVector(mParticleAccelerations, cloth.mParticleAccelerations); + + mFabric.incRefCount(); +} + +cloth::SwCloth::~SwCloth() +{ + mFabric.decRefCount(); +} + +cloth::Range<PxVec4> cloth::SwCloth::push(SwConstraints& constraints) +{ + uint32_t n = mCurParticles.size(); + + if(!constraints.mTarget.capacity()) + constraints.mTarget.resize((n + 3) & ~3, PxVec4(0.0f)); // reserve multiple of 4 for SIMD + + constraints.mTarget.resizeUninitialized(n); + PxVec4* data = &constraints.mTarget.front(); + Range<PxVec4> result(data, data + constraints.mTarget.size()); + + if(constraints.mStart.empty()) // initialize start first + constraints.mStart.swap(constraints.mTarget); + + return result; +} + +void cloth::SwCloth::clear(SwConstraints& constraints) +{ + Vec4fAlignedVector().swap(constraints.mStart); + 
Vec4fAlignedVector().swap(constraints.mTarget); +} + +cloth::Range<const PxVec3> cloth::SwCloth::clampTriangleCount(Range<const PxVec3> range, uint32_t) +{ + return range; +} + +#include "ClothImpl.h" + +namespace nvidia +{ +namespace cloth +{ + +template <> +Cloth* ClothImpl<SwCloth>::clone(Factory& factory) const +{ + return factory.clone(*this); +} + +template <> +uint32_t ClothImpl<SwCloth>::getNumParticles() const +{ + return mCloth.mCurParticles.size(); +} + +template <> +void ClothImpl<SwCloth>::lockParticles() const +{ +} + +template <> +void ClothImpl<SwCloth>::unlockParticles() const +{ +} + +template <> +MappedRange<PxVec4> ClothImpl<SwCloth>::getCurrentParticles() +{ + return getMappedParticles(&mCloth.mCurParticles.front()); +} + +template <> +MappedRange<const PxVec4> ClothImpl<SwCloth>::getCurrentParticles() const +{ + return getMappedParticles(&mCloth.mCurParticles.front()); +} + +template <> +MappedRange<PxVec4> ClothImpl<SwCloth>::getPreviousParticles() +{ + return getMappedParticles(&mCloth.mPrevParticles.front()); +} + +template <> +MappedRange<const PxVec4> ClothImpl<SwCloth>::getPreviousParticles() const +{ + return getMappedParticles(&mCloth.mPrevParticles.front()); +} + +template <> +GpuParticles ClothImpl<SwCloth>::getGpuParticles() +{ + GpuParticles result = { 0, 0, 0 }; + return result; +} + +template <> +void ClothImpl<SwCloth>::setPhaseConfig(Range<const PhaseConfig> configs) +{ + mCloth.mPhaseConfigs.resize(0); + + // transform phase config to use in solver + for(; !configs.empty(); configs.popFront()) + if(configs.front().mStiffness > 0.0f) + mCloth.mPhaseConfigs.pushBack(transform(configs.front())); + + mCloth.wakeUp(); +} + +template <> +void ClothImpl<SwCloth>::setSelfCollisionIndices(Range<const uint32_t> indices) +{ + ContextLockType lock(mCloth.mFactory); + mCloth.mSelfCollisionIndices.assign(indices.begin(), indices.end()); + mCloth.notifyChanged(); + mCloth.wakeUp(); +} + +template <> +uint32_t 
ClothImpl<SwCloth>::getNumVirtualParticles() const +{ + return uint32_t(mCloth.mNumVirtualParticles); +} + +template <> +Range<PxVec4> ClothImpl<SwCloth>::getParticleAccelerations() +{ + if(mCloth.mParticleAccelerations.empty()) + { + uint32_t n = mCloth.mCurParticles.size(); + mCloth.mParticleAccelerations.resize(n, PxVec4(0.0f)); + } + + mCloth.wakeUp(); + + PxVec4* data = &mCloth.mParticleAccelerations.front(); + return Range<PxVec4>(data, data + mCloth.mParticleAccelerations.size()); +} + +template <> +void ClothImpl<SwCloth>::clearParticleAccelerations() +{ + Vec4fAlignedVector().swap(mCloth.mParticleAccelerations); + mCloth.wakeUp(); +} + +template <> +void ClothImpl<SwCloth>::setVirtualParticles(Range<const uint32_t[4]> indices, Range<const PxVec3> weights) +{ + mCloth.mNumVirtualParticles = 0; + + // shuffle indices to form independent SIMD sets + uint16_t numParticles = uint16_t(mCloth.mCurParticles.size()); + TripletScheduler scheduler(indices); + scheduler.simd(numParticles, 4); + + // convert indices to byte offset + Vec4us dummy(numParticles, uint16_t(numParticles + 1), uint16_t(numParticles + 2), 0); + Vector<uint32_t>::Type::ConstIterator sIt = scheduler.mSetSizes.begin(); + Vector<uint32_t>::Type::ConstIterator sEnd = scheduler.mSetSizes.end(); + TripletScheduler::ConstTripletIter tIt = scheduler.mTriplets.begin(), tLast; + mCloth.mVirtualParticleIndices.resize(0); + mCloth.mVirtualParticleIndices.reserve(indices.size() + 3 * uint32_t(sEnd - sIt)); + for(; sIt != sEnd; ++sIt) + { + uint32_t setSize = *sIt; + for(tLast = tIt + setSize; tIt != tLast; ++tIt, ++mCloth.mNumVirtualParticles) + mCloth.mVirtualParticleIndices.pushBack(Vec4us(*tIt)); + mCloth.mVirtualParticleIndices.resize((mCloth.mVirtualParticleIndices.size() + 3) & ~3, dummy); + } + Vector<Vec4us>::Type(mCloth.mVirtualParticleIndices.begin(), mCloth.mVirtualParticleIndices.end()) + .swap(mCloth.mVirtualParticleIndices); + + // precompute 1/dot(w,w) + 
Vec4fAlignedVector().swap(mCloth.mVirtualParticleWeights); + mCloth.mVirtualParticleWeights.reserve(weights.size()); + for(; !weights.empty(); weights.popFront()) + { + PxVec3 w = reinterpret_cast<const PxVec3&>(weights.front()); + float scale = 1 / w.magnitudeSquared(); + mCloth.mVirtualParticleWeights.pushBack(PxVec4(w.x, w.y, w.z, scale)); + } + + mCloth.notifyChanged(); +} + +#if APEX_UE4 +template <> +void ClothImpl<SwCloth>::simulate(float dt) +{ + (*SwCloth::sSimulationFunction)(mCloth.mSimulationTask, dt); +} +#endif + +} // namespace cloth +} // namespace nvidia diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwCloth.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwCloth.h new file mode 100644 index 00000000..3d0569af --- /dev/null +++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwCloth.h @@ -0,0 +1,202 @@ +/* + * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. + */ + +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 

#pragma once

#include "Cloth.h"
#include "Range.h"
#include "MovingAverage.h"
#include "PhaseConfig.h"
#include "IndexPair.h"
#include "Vec4T.h"
#include "Array.h"
#include "PxTransform.h"

namespace nvidia
{

namespace cloth
{

class SwFabric;
class SwFactory;
#if APEX_UE4
class SwCloth;
#endif

// particle storage: PxVec4 (xyz = position, w = inverse mass), 16-byte aligned for SIMD
typedef AlignedVector<PxVec4, 16>::Type Vec4fAlignedVector;

// start/target buffer pair for interpolated (motion/separation) constraints
struct SwConstraints
{
	// promote target to start at the beginning of a new frame
	void pop()
	{
		if(!mTarget.empty())
		{
			mStart.swap(mTarget);
			mTarget.resize(0);
		}
	}

	Vec4fAlignedVector mStart;
	Vec4fAlignedVector mTarget;
};

// CPU cloth instance: all solver state for one piece of cloth. Members are
// public because SwClothData/SwSolverKernel snapshot them directly each frame.
class SwCloth
{
	SwCloth& operator=(const SwCloth&); // not implemented
	// CPU factory needs no real context locking; this is a no-op stand-in
	struct SwContextLock
	{
		SwContextLock(const SwFactory&)
		{
		}
	};

  public:
	typedef SwFactory FactoryType;
	typedef SwFabric FabricType;
	typedef SwContextLock ContextLockType;

	typedef Vec4fAlignedVector& MappedVec4fVectorType;
	typedef Vector<IndexPair>::Type& MappedIndexVectorType;

	SwCloth(SwFactory&, SwFabric&, Range<const PxVec4>);
	SwCloth(SwFactory&, const SwCloth&);
	~SwCloth(); // not virtual on purpose

  public:
	// asleep once enough consecutive sleep tests have passed
	bool isSleeping() const
	{
		return mSleepPassCounter >= mSleepAfterCount;
	}
	void wakeUp()
	{
		mSleepPassCounter = 0;
	}

	// no cached device data to invalidate on the CPU path
	void notifyChanged()
	{
	}

	void setParticleBounds(const float*);

	Range<PxVec4> push(SwConstraints&);
	static void clear(SwConstraints&);

	static Range<const PxVec3> clampTriangleCount(Range<const PxVec3>, uint32_t);

  public:
	SwFactory& mFactory;
	SwFabric& mFabric;

	bool mClothCostDirty;

	// current and previous-iteration particle positions
	Vec4fAlignedVector mCurParticles;
	Vec4fAlignedVector mPrevParticles;

	PxVec3 mParticleBoundsCenter;
	PxVec3 mParticleBoundsHalfExtent;

	PxVec3 mGravity;
	PxVec3 mLogDamping;
	PxVec3 mLinearLogDrag;
	PxVec3 mAngularLogDrag;
	PxVec3 mLinearInertia;
	PxVec3 mAngularInertia;
	PxVec3 mCentrifugalInertia;
	float mSolverFrequency;
	float mStiffnessFrequency;

	PxTransform mTargetMotion;
	PxTransform mCurrentMotion;
	PxVec3 mLinearVelocity;
	PxVec3 mAngularVelocity;

	float mPrevIterDt;
	MovingAverage mIterDtAvg;

	Vector<PhaseConfig>::Type mPhaseConfigs; // transformed!

	// tether constraints stuff
	float mTetherConstraintLogStiffness;
	float mTetherConstraintScale;

	// motion constraints stuff
	SwConstraints mMotionConstraints;
	float mMotionConstraintScale;
	float mMotionConstraintBias;
	float mMotionConstraintLogStiffness;

	// separation constraints stuff
	SwConstraints mSeparationConstraints;

	// particle acceleration stuff
	Vec4fAlignedVector mParticleAccelerations;

	// collision stuff
	Vector<IndexPair>::Type mCapsuleIndices;
	Vec4fAlignedVector mStartCollisionSpheres;
	Vec4fAlignedVector mTargetCollisionSpheres;
	Vector<uint32_t>::Type mConvexMasks;
	Vec4fAlignedVector mStartCollisionPlanes;
	Vec4fAlignedVector mTargetCollisionPlanes;
	Vector<PxVec3>::Type mStartCollisionTriangles;
	Vector<PxVec3>::Type mTargetCollisionTriangles;
	bool mEnableContinuousCollision;
	float mCollisionMassScale;
	float mFriction;

	// virtual particles
	Vector<Vec4us>::Type mVirtualParticleIndices;
	Vec4fAlignedVector mVirtualParticleWeights;
	uint32_t mNumVirtualParticles;

	// self collision
	float mSelfCollisionDistance;
	float mSelfCollisionLogStiffness;

	Vector<uint32_t>::Type mSelfCollisionIndices;

	Vec4fAlignedVector mRestPositions;

	// sleeping
	uint32_t mSleepTestInterval; // how often to test for movement
	uint32_t mSleepAfterCount;   // number of tests to pass before sleep
	float mSleepThreshold;       // max movement delta to pass test
	uint32_t mSleepPassCounter;  // how many tests passed
	uint32_t mSleepTestCounter;  // how many iterations since tested

	// unused for CPU simulation
	bool mIsAllowedHalfPrecisionSolver;

#if APEX_UE4
	void* mSimulationTask;
	static void (*const sSimulationFunction)(void*, float);
#endif

	void* mUserData;

} PX_ALIGN_SUFFIX(16);

} // namespace cloth

// bounds = lower[3], upper[3]
inline void cloth::SwCloth::setParticleBounds(const float* bounds)
{
	for(uint32_t i = 0; i < 3; ++i)
	{
		mParticleBoundsCenter[i] = (bounds[3 + i] + bounds[i]) * 0.5f;
		mParticleBoundsHalfExtent[i] = (bounds[3 + i] - bounds[i]) * 0.5f;
	}
}
} // namespace nvidia

// --- diff boundary: new file APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwClothData.cpp (index bc09612f) ---
/*
 * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
 *
 * NVIDIA CORPORATION and its licensors retain all intellectual property
 * and proprietary rights in and to this software, related documentation
 * and any modifications thereto. Any use, reproduction, disclosure or
 * distribution of this software and related documentation without an express
 * license agreement from NVIDIA CORPORATION is strictly prohibited.
 */

// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.

#include "SwClothData.h"
#include "SwCloth.h"
#include "SwFabric.h"
#include "Simd4f.h"
#include "PsUtilities.h"

using namespace nvidia;

// Snapshots raw pointers/values out of SwCloth and SwFabric for the solver
// kernels. The pointers alias the cloth's own arrays, so the cloth must not be
// mutated while this snapshot is live; scalar results flow back via reconcile().
cloth::SwClothData::SwClothData(SwCloth& cloth, const SwFabric& fabric)
{
	mNumParticles = uint32_t(cloth.mCurParticles.size());
	mCurParticles = array(cloth.mCurParticles.front());
	mPrevParticles = array(cloth.mPrevParticles.front());

	// expand center/half-extent into lower[3], upper[3]
	const float* center = array(cloth.mParticleBoundsCenter);
	const float* extent = array(cloth.mParticleBoundsHalfExtent);
	for(uint32_t i = 0; i < 3; ++i)
	{
		mCurBounds[i] = center[i] - extent[i];
		mCurBounds[i + 3] = center[i] + extent[i];
	}

	// avoid reading uninitialized data into mCurBounds, even though it's never used.
	mPrevBounds[0] = 0.0f;

	mConfigBegin = cloth.mPhaseConfigs.empty() ? 0 : &cloth.mPhaseConfigs.front();
	mConfigEnd = mConfigBegin + cloth.mPhaseConfigs.size();

	mPhases = &fabric.mPhases.front();
	mNumPhases = uint32_t(fabric.mPhases.size());

	mSets = &fabric.mSets.front();
	mNumSets = uint32_t(fabric.mSets.size());

	mRestvalues = &fabric.mRestvalues.front();
	mNumRestvalues = uint32_t(fabric.mRestvalues.size());

	mIndices = &fabric.mIndices.front();
	mNumIndices = uint32_t(fabric.mIndices.size());

	// converts per-second log-stiffness into a per-iteration multiplier below
	float stiffnessExponent = cloth.mStiffnessFrequency * cloth.mPrevIterDt * 0.69314718055994531f; // logf(2.0f);

	mTethers = fabric.mTethers.begin();
	mNumTethers = uint32_t(fabric.mTethers.size());
	mTetherConstraintStiffness = 1.0f - exp(stiffnessExponent * cloth.mTetherConstraintLogStiffness);
	mTetherConstraintScale = cloth.mTetherConstraintScale * fabric.mTetherLengthScale;

	// null pointers signal "feature disabled" to the kernels
	mStartMotionConstraints = cloth.mMotionConstraints.mStart.size() ? array(cloth.mMotionConstraints.mStart.front()) : 0;
	mTargetMotionConstraints =
	    !cloth.mMotionConstraints.mTarget.empty() ? array(cloth.mMotionConstraints.mTarget.front()) : 0;
	mMotionConstraintStiffness = 1.0f - exp(stiffnessExponent * cloth.mMotionConstraintLogStiffness);

	mStartSeparationConstraints =
	    cloth.mSeparationConstraints.mStart.size() ? array(cloth.mSeparationConstraints.mStart.front()) : 0;
	mTargetSeparationConstraints =
	    !cloth.mSeparationConstraints.mTarget.empty() ? array(cloth.mSeparationConstraints.mTarget.front()) : 0;

	mParticleAccelerations = cloth.mParticleAccelerations.size() ? array(cloth.mParticleAccelerations.front()) : 0;

	// missing target shapes fall back to the start shapes (no interpolation)
	mStartCollisionSpheres = cloth.mStartCollisionSpheres.empty() ? 0 : array(cloth.mStartCollisionSpheres.front());
	mTargetCollisionSpheres =
	    cloth.mTargetCollisionSpheres.empty() ? mStartCollisionSpheres : array(cloth.mTargetCollisionSpheres.front());
	mNumSpheres = uint32_t(cloth.mStartCollisionSpheres.size());

	mCapsuleIndices = cloth.mCapsuleIndices.empty() ? 0 : &cloth.mCapsuleIndices.front();
	mNumCapsules = uint32_t(cloth.mCapsuleIndices.size());

	mStartCollisionPlanes = cloth.mStartCollisionPlanes.empty() ? 0 : array(cloth.mStartCollisionPlanes.front());
	mTargetCollisionPlanes =
	    cloth.mTargetCollisionPlanes.empty() ? mStartCollisionPlanes : array(cloth.mTargetCollisionPlanes.front());
	mNumPlanes = uint32_t(cloth.mStartCollisionPlanes.size());

	mConvexMasks = cloth.mConvexMasks.empty() ? 0 : &cloth.mConvexMasks.front();
	mNumConvexes = uint32_t(cloth.mConvexMasks.size());

	mStartCollisionTriangles = cloth.mStartCollisionTriangles.empty() ? 0 : array(cloth.mStartCollisionTriangles.front());
	mTargetCollisionTriangles = cloth.mTargetCollisionTriangles.empty() ? mStartCollisionTriangles
	                                                                    : array(cloth.mTargetCollisionTriangles.front());
	mNumTriangles = uint32_t(cloth.mStartCollisionTriangles.size()) / 3;

	mVirtualParticlesBegin = cloth.mVirtualParticleIndices.empty() ? 0 : array(cloth.mVirtualParticleIndices.front());
	mVirtualParticlesEnd = mVirtualParticlesBegin + 4 * cloth.mVirtualParticleIndices.size();
	mVirtualParticleWeights = cloth.mVirtualParticleWeights.empty() ? 0 : array(cloth.mVirtualParticleWeights.front());
	mNumVirtualParticleWeights = uint32_t(cloth.mVirtualParticleWeights.size());

	mEnableContinuousCollision = cloth.mEnableContinuousCollision;
	mCollisionMassScale = cloth.mCollisionMassScale;
	mFrictionScale = cloth.mFriction;

	mSelfCollisionDistance = cloth.mSelfCollisionDistance;
	mSelfCollisionStiffness = 1.0f - exp(stiffnessExponent * cloth.mSelfCollisionLogStiffness);

	// no explicit index list means all particles self-collide
	mSelfCollisionIndices = cloth.mSelfCollisionIndices.empty() ? 0 : cloth.mSelfCollisionIndices.begin();
	mNumSelfCollisionIndices = mSelfCollisionIndices ? cloth.mSelfCollisionIndices.size() : mNumParticles;

	mRestPositions = cloth.mRestPositions.size() ? array(cloth.mRestPositions.front()) : 0;

	mSleepPassCounter = cloth.mSleepPassCounter;
	mSleepTestCounter = cloth.mSleepTestCounter;
}

// write back the values the solver updated during the frame
void cloth::SwClothData::reconcile(SwCloth& cloth) const
{
	cloth.setParticleBounds(mCurBounds);
	cloth.mSleepTestCounter = mSleepTestCounter;
	cloth.mSleepPassCounter = mSleepPassCounter;
}

void cloth::SwClothData::verify() const
{
	// checks need to be run after the constructor because they read the pointers set up there

	// every capsule endpoint must reference a valid sphere
	PX_ASSERT(!mNumCapsules ||
	          mNumSpheres > *nvidia::maxElement(&mCapsuleIndices->first, &(mCapsuleIndices + mNumCapsules)->first));

	// convex masks may only reference existing planes
	PX_ASSERT(!mNumConvexes || (1u << mNumPlanes) - 1 >= *nvidia::maxElement(mConvexMasks, mConvexMasks + mNumConvexes));
}

// --- diff boundary: new file APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwClothData.h (index 3aaa6a2b) ---
/*
 * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
 *
 * NVIDIA CORPORATION and its licensors retain all intellectual property
 * and proprietary rights in and to this software, related documentation
 * and any modifications thereto. Any use, reproduction, disclosure or
 * distribution of this software and related documentation without an express
 * license agreement from NVIDIA CORPORATION is strictly prohibited.
 */

// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.

#pragma once

#include "Px.h"
#include "Types.h"

namespace nvidia
{
namespace cloth
{

class SwCloth;
class SwFabric;
struct PhaseConfig;
struct IndexPair;
struct SwTether;

// reference to cloth instance bulk data (POD)
// Filled once per frame from SwCloth/SwFabric (see SwClothData.cpp); all
// pointers alias arrays owned by the cloth/fabric — nothing here is owned.
struct SwClothData
{
	SwClothData(SwCloth&, const SwFabric&);
	void reconcile(SwCloth&) const;
	void verify() const;

	// particle data
	uint32_t mNumParticles;
	float* mCurParticles;
	float* mPrevParticles;

	float mCurBounds[6]; // lower[3], upper[3]
	float mPrevBounds[6];
	float mPadding; // write as simd

	// distance constraints
	const PhaseConfig* mConfigBegin;
	const PhaseConfig* mConfigEnd;

	const uint32_t* mPhases;
	uint32_t mNumPhases;

	const uint32_t* mSets;
	uint32_t mNumSets;

	const float* mRestvalues;
	uint32_t mNumRestvalues;

	const uint16_t* mIndices;
	uint32_t mNumIndices;

	const SwTether* mTethers;
	uint32_t mNumTethers;
	float mTetherConstraintStiffness;
	float mTetherConstraintScale;

	// motion constraint data (null pointers = feature disabled)
	const float* mStartMotionConstraints;
	const float* mTargetMotionConstraints;
	float mMotionConstraintStiffness;

	// separation constraint data
	const float* mStartSeparationConstraints;
	const float* mTargetSeparationConstraints;

	// particle acceleration data
	const float* mParticleAccelerations;

	// collision stuff (start/target pairs are interpolated over the frame)
	const float* mStartCollisionSpheres;
	const float* mTargetCollisionSpheres;
	uint32_t mNumSpheres;

	const IndexPair* mCapsuleIndices;
	uint32_t mNumCapsules;

	const float* mStartCollisionPlanes;
	const float* mTargetCollisionPlanes;
	uint32_t mNumPlanes;

	const uint32_t* mConvexMasks;
	uint32_t mNumConvexes;

	const float* mStartCollisionTriangles;
	const float* mTargetCollisionTriangles;
	uint32_t mNumTriangles;

	const uint16_t* mVirtualParticlesBegin;
	const uint16_t* mVirtualParticlesEnd;

	const float* mVirtualParticleWeights;
	uint32_t mNumVirtualParticleWeights;

	bool mEnableContinuousCollision;
	float mFrictionScale;
	float mCollisionMassScale;

	float mSelfCollisionDistance;
	float mSelfCollisionStiffness;

	uint32_t mNumSelfCollisionIndices;
	const uint32_t* mSelfCollisionIndices;

	float* mRestPositions;

	// sleep data
	uint32_t mSleepPassCounter;
	uint32_t mSleepTestCounter;

} PX_ALIGN_SUFFIX(16);
}
}

// --- diff boundary: new file APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwCollision.cpp (index 581d276b) ---
/*
 * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
 *
 * NVIDIA CORPORATION and its licensors retain all intellectual property
 * and proprietary rights in and to this software, related documentation
 * and any modifications thereto. Any use, reproduction, disclosure or
 * distribution of this software and related documentation without an express
 * license agreement from NVIDIA CORPORATION is strictly prohibited.
 */

// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.

#include "SwCollision.h"
#include "SwCloth.h"
#include "SwClothData.h"
#include "IterationState.h"
#include "BoundingBox.h"
#include "PointInterpolator.h"
#include "SwCollisionHelpers.h"
#include "PxAssert.h"
#include <string.h> // for memset

using namespace nvidia;

// the particle trajectory needs to penetrate more than 0.2 * radius to trigger continuous collision
template <typename Simd4f>
const Simd4f cloth::SwCollision<Simd4f>::sSkeletonWidth = simd4f(sqr(1 - 0.2f) - 1);

#if NVMATH_SSE2
const Simd4i cloth::Gather<Simd4i>::sIntSignBit = simd4i(_sign);
const Simd4i cloth::Gather<Simd4i>::sSignedMask = sIntSignBit | simd4i(0x7);
#elif NVMATH_NEON
const Simd4i cloth::Gather<Simd4i>::sPack = simd4i(0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c);
const Simd4i cloth::Gather<Simd4i>::sOffset = simd4i(0x03020100);
const Simd4i cloth::Gather<Simd4i>::sShift = simd4i(detail::IntType<2>());
const Simd4i cloth::Gather<Simd4i>::sMask = simd4i(detail::IntType<7>());
#endif

namespace
{
typedef Simd4fFactory<detail::FourTuple> Simd4fConstant;

// lane-constant SIMD values shared by all collision kernels
const Simd4fConstant sEpsilon = simd4f(FLT_EPSILON);
const Simd4fConstant sMax = simd4f(FLT_MAX);
const Simd4fConstant sMaskX = simd4f(simd4i(~0, 0, 0, 0));
const Simd4fConstant sMaskZ = simd4f(simd4i(0, 0, ~0, 0));
const Simd4fConstant sMaskW = simd4f(simd4i(0, 0, 0, ~0));
const Simd4fConstant sZero = simd4f(0.0f);
const Simd4fConstant sOne = simd4f(1.0f);
const Simd4fConstant sNegOne = simd4f(-1.0f);
const Simd4fConstant sHalf = simd4f(0.5f);
const Simd4fConstant sOneXYZ = simd4f(1.0f, 1.0f, 1.0f, 0.0f);
const Simd4fConstant sGridLength = simd4f(8 - 1e-3f); // sGridSize
const Simd4fConstant sGridExpand = simd4f(1e-4f);
const Simd4fConstant sMinusFloatMaxXYZ = simd4f(-FLT_MAX, -FLT_MAX, -FLT_MAX, 0.0f);

#if PX_PROFILE || PX_DEBUG
// rounds the sum of all four lanes to an integer (profiling counters)
template <typename Simd4f>
uint32_t horizontalSum(const Simd4f& x)
{
	const float* p = array(x);
	return uint32_t(0.5f + p[0] + p[1] + p[2] + p[3]);
}
#endif

// 7 elements are written to ptr!
template <typename Simd4f>
void storeBounds(float* ptr, const cloth::BoundingBox<Simd4f>& bounds)
{
	store(ptr, bounds.mLower);
	store(ptr + 3, bounds.mUpper);
}
}

struct cloth::SphereData
{
	PxVec3 center;
	float radius;
};

// capsule represented as a truncated cone between two spheres
struct cloth::ConeData
{
	PxVec3 center;
	float radius; // cone radius at center
	PxVec3 axis;
	float slope; // tan(alpha)

	float sqrCosine; // cos^2(alpha)
	float halfLength;

	uint32_t firstMask;
	uint32_t bothMask;
};

// triangle with precomputed edge products for closest-point queries
struct cloth::TriangleData
{
	PxVec3 base;
	float edge0DotEdge1;

	PxVec3 edge0;
	float edge0SqrLength;

	PxVec3 edge1;
	float edge1SqrLength;

	PxVec3 normal;
	float padding;

	float det;
	float denom;

	float edge0InvSqrLength;
	float edge1InvSqrLength;
};

namespace nvidia
{
namespace cloth
{
// grow bbox to enclose every sphere (center +/- radius per axis)
template <typename Simd4f>
BoundingBox<Simd4f> expandBounds(const BoundingBox<Simd4f>& bbox, const SphereData* sIt, const SphereData* sEnd)
{
	BoundingBox<Simd4f> result = bbox;
	for(; sIt != sEnd; ++sIt)
	{
		Simd4f p = loadAligned(array(sIt->center));
		Simd4f r = splat<3>(p);
		result.mLower = min(result.mLower, p - r);
		result.mUpper = max(result.mUpper, p + r);
	}
	return result;
}
}
}

namespace
{
template <typename Simd4f, typename SrcIterator>
void generateSpheres(Simd4f* dIt, const SrcIterator& src, uint32_t count)
{
	// have to copy out iterator to ensure alignment is maintained
	for(SrcIterator sIt = src; 0 < count--; ++sIt, ++dIt)
		*dIt = max(sMinusFloatMaxXYZ, *sIt); // clamp radius to 0
}

// derive one ConeData per capsule from its two endpoint spheres
void generateCones(cloth::ConeData* dst, const cloth::SphereData* sourceSpheres, const cloth::IndexPair* capsuleIndices,
                   uint32_t numCones)
{
	cloth::ConeData* cIt = dst;
	for(const cloth::IndexPair* iIt = capsuleIndices, *iEnd = iIt + numCones; iIt != iEnd; ++iIt, ++cIt)
	{
		PxVec4 first = reinterpret_cast<const PxVec4&>(sourceSpheres[iIt->first]);
		PxVec4 second = reinterpret_cast<const PxVec4&>(sourceSpheres[iIt->second]);

		PxVec4 center = (second + first) * 0.5f;
		PxVec4 axis = (second - first) * 0.5f;

		float sqrAxisLength = axis.x * axis.x + axis.y * axis.y + axis.z * axis.z;
		float sqrConeLength = sqrAxisLength - sqr(axis.w);

		float invAxisLength = 1 / sqrtf(sqrAxisLength);
		float invConeLength = 1 / sqrtf(sqrConeLength);

		// degenerate capsule (one sphere contains the other): disable the cone
		if(sqrConeLength <= 0.0f)
			invAxisLength = invConeLength = 0.0f;

		float axisLength = sqrAxisLength * invAxisLength;
		float slope = axis.w * invConeLength;

		cIt->center = PxVec3(center.x, center.y, center.z);
		cIt->radius = (axis.w + first.w) * invConeLength * axisLength;
		cIt->axis = PxVec3(axis.x, axis.y, axis.z) * invAxisLength;
		cIt->slope = slope;

		cIt->sqrCosine = 1.0f - sqr(axis.w * invAxisLength);
		cIt->halfLength = axisLength;

		// bit masks identifying this capsule's spheres (sphere index -> bit)
		uint32_t firstMask = 0x1u << iIt->first;
		cIt->firstMask = firstMask;
		cIt->bothMask = firstMask | 0x1u << iIt->second;
	}
}

template <typename Simd4f, typename SrcIterator>
void generatePlanes(Simd4f* dIt, const SrcIterator& src, uint32_t count)
{
	// have to copy out iterator to ensure alignment is maintained
	for(SrcIterator sIt = src; 0 < count--; ++sIt, ++dIt)
		*dIt = *sIt;
}

// consume 3 source points per triangle and precompute the TriangleData fields
template <typename Simd4f, typename SrcIterator>
void generateTriangles(cloth::TriangleData* dIt, const SrcIterator& src, uint32_t count)
{
	// have to copy out iterator to ensure alignment is maintained
	for(SrcIterator sIt = src; 0 < count--; ++dIt)
	{
		Simd4f p0 = *sIt;
		++sIt;
		Simd4f p1 = *sIt;
		++sIt;
		Simd4f p2 = *sIt;
		++sIt;

		Simd4f edge0 = p1 - p0;
		Simd4f edge1 = p2 - p0;
		Simd4f normal = cross3(edge0, edge1);

		Simd4f edge0SqrLength = dot3(edge0, edge0);
		Simd4f edge1SqrLength = dot3(edge1, edge1);
		Simd4f edge0DotEdge1 = dot3(edge0, edge1);
		Simd4f normalInvLength = rsqrt(dot3(normal, normal));

		Simd4f det = edge0SqrLength * edge1SqrLength - edge0DotEdge1 * edge0DotEdge1;
		Simd4f denom = edge0SqrLength + edge1SqrLength - edge0DotEdge1 - edge0DotEdge1;

		// there are definitely faster ways...
		Simd4f aux = select(sMaskX, det, denom);
		aux = select(sMaskZ, edge0SqrLength, aux);
		aux = select(sMaskW, edge1SqrLength, aux);

		// pack reciprocals of det/denom/edge lengths into the trailing floats
		storeAligned(&dIt->base.x, select(sMaskW, edge0DotEdge1, p0));
		storeAligned(&dIt->edge0.x, select(sMaskW, edge0SqrLength, edge0));
		storeAligned(&dIt->edge1.x, select(sMaskW, edge1SqrLength, edge1));
		storeAligned(&dIt->normal.x, normal * normalInvLength);
		storeAligned(&dIt->det, recipT<1>(aux));
	}
}

} // namespace

template <typename Simd4f>
cloth::SwCollision<Simd4f>::CollisionData::CollisionData() : mSpheres(0), mCones(0)
{
}

// Prepares per-frame collision state. When continuous collision or friction is
// on, the previous frame's sphere/cone shapes are also needed and generated here.
template <typename Simd4f>
cloth::SwCollision<Simd4f>::SwCollision(SwClothData& clothData, SwKernelAllocator& alloc, profile::PxProfileZone* profiler)
: mClothData(clothData), mAllocator(alloc), mProfiler(profiler)
{
	allocate(mCurData);

	if(mClothData.mEnableContinuousCollision || mClothData.mFrictionScale > 0.0f)
	{
		allocate(mPrevData);

		generateSpheres(reinterpret_cast<Simd4f*>(mPrevData.mSpheres),
		                reinterpret_cast<const Simd4f*>(clothData.mStartCollisionSpheres), clothData.mNumSpheres);

		generateCones(mPrevData.mCones, mPrevData.mSpheres, clothData.mCapsuleIndices, clothData.mNumCapsules);
	}
}

template <typename Simd4f>
cloth::SwCollision<Simd4f>::~SwCollision()
{
	deallocate(mCurData);
	deallocate(mPrevData);
}

// Runs one collision pass for the given solver iteration.
// NOTE(review): this function is truncated at the end of the visible chunk.
template <typename Simd4f>
void cloth::SwCollision<Simd4f>::operator()(const IterationState<Simd4f>& state)
{
	mNumCollisions = 0;

	collideConvexes(state);  // discrete convex collision, no friction
	collideTriangles(state); // discrete triangle collision, no friction

	computeBounds();

	if(!mClothData.mNumSpheres)
		return;

	bool lastIteration = state.mRemainingIterations == 1;

	const Simd4f* targetSpheres = reinterpret_cast<const Simd4f*>(mClothData.mTargetCollisionSpheres);

	// generate sphere and cone collision data
	if(!lastIteration)
	{
		// 
interpolate spheres + LerpIterator<Simd4f, const Simd4f*> pIter(reinterpret_cast<const Simd4f*>(mClothData.mStartCollisionSpheres), + targetSpheres, state.getCurrentAlpha()); + generateSpheres(reinterpret_cast<Simd4f*>(mCurData.mSpheres), pIter, mClothData.mNumSpheres); + } + else + { + // otherwise use the target spheres directly + generateSpheres(reinterpret_cast<Simd4f*>(mCurData.mSpheres), targetSpheres, mClothData.mNumSpheres); + } + + // generate cones even if test below fails because + // continuous collision might need it in next iteration + generateCones(mCurData.mCones, mCurData.mSpheres, mClothData.mCapsuleIndices, mClothData.mNumCapsules); + + if(buildAcceleration()) + { + if(mClothData.mEnableContinuousCollision) + collideContinuousParticles(); + + mergeAcceleration((uint32_t*)mSphereGrid); + mergeAcceleration((uint32_t*)mConeGrid); + + if(!mClothData.mEnableContinuousCollision) + collideParticles(); + + collideVirtualParticles(); + } + + if(mPrevData.mSpheres) + nvidia::swap(mCurData, mPrevData); +} + +template <typename Simd4f> +size_t cloth::SwCollision<Simd4f>::estimateTemporaryMemory(const SwCloth& cloth) +{ + size_t numTriangles = cloth.mStartCollisionTriangles.size(); + size_t numPlanes = cloth.mStartCollisionPlanes.size(); + + const size_t kTriangleDataSize = sizeof(TriangleData) * numTriangles; + const size_t kPlaneDataSize = sizeof(PxVec4) * numPlanes * 2; + + return PxMax(kTriangleDataSize, kPlaneDataSize); +} + +template <typename Simd4f> +size_t cloth::SwCollision<Simd4f>::estimatePersistentMemory(const SwCloth& cloth) +{ + size_t numCapsules = cloth.mCapsuleIndices.size(); + size_t numSpheres = cloth.mStartCollisionSpheres.size(); + + size_t sphereDataSize = sizeof(SphereData) * numSpheres * 2; + size_t coneDataSize = sizeof(ConeData) * numCapsules * 2; + + return sphereDataSize + coneDataSize; +} + +template <typename Simd4f> +void cloth::SwCollision<Simd4f>::allocate(CollisionData& data) +{ + data.mSpheres = 
static_cast<SphereData*>(mAllocator.allocate(sizeof(SphereData) * mClothData.mNumSpheres)); + + data.mCones = static_cast<ConeData*>(mAllocator.allocate(sizeof(ConeData) * mClothData.mNumCapsules)); +} + +template <typename Simd4f> +void cloth::SwCollision<Simd4f>::deallocate(const CollisionData& data) +{ + mAllocator.deallocate(data.mSpheres); + mAllocator.deallocate(data.mCones); +} + +template <typename Simd4f> +void cloth::SwCollision<Simd4f>::computeBounds() +{ +#if PX_PROFILE + ProfileZone zone("cloth::SwSolverKernel::computeBounds", mProfiler); +#endif + + Simd4f* prevIt = reinterpret_cast<Simd4f*>(mClothData.mPrevParticles); + Simd4f* curIt = reinterpret_cast<Simd4f*>(mClothData.mCurParticles); + Simd4f* curEnd = curIt + mClothData.mNumParticles; + Simd4f floatMaxXYZ = -(Simd4f)sMinusFloatMaxXYZ; + + Simd4f lower = simd4f(FLT_MAX), upper = -lower; + for(; curIt < curEnd; ++curIt, ++prevIt) + { + Simd4f current = *curIt; + lower = min(lower, current); + upper = max(upper, current); + // if(current.w > 0) current.w = previous.w + *curIt = select(current > floatMaxXYZ, *prevIt, current); + } + + BoundingBox<Simd4f> curBounds; + curBounds.mLower = lower; + curBounds.mUpper = upper; + + // don't change this order, storeBounds writes 7 floats + BoundingBox<Simd4f> prevBounds = loadBounds<Simd4f>(mClothData.mCurBounds); + storeBounds(mClothData.mCurBounds, curBounds); + storeBounds(mClothData.mPrevBounds, prevBounds); +} + +namespace +{ +template <typename Simd4i> +Simd4i andNotIsZero(const Simd4i& left, const Simd4i& right) +{ + return simdi::operator==(left & ~right, simd4i(_0)); +} +} + +// build per-axis mask arrays of spheres on the right/left of grid cell +template <typename Simd4f> +void cloth::SwCollision<Simd4f>::buildSphereAcceleration(const SphereData* sIt) +{ + static const int maxIndex = sGridSize - 1; + + const SphereData* sEnd = sIt + mClothData.mNumSpheres; + for(uint32_t mask = 0x1; sIt != sEnd; ++sIt, mask <<= 1) + { + Simd4f sphere = 
loadAligned(array(sIt->center)); + Simd4f radius = splat<3>(sphere); + + Simd4i first = intFloor(max((sphere - radius) * mGridScale + mGridBias, sZero)); + Simd4i last = intFloor(min((sphere + radius) * mGridScale + mGridBias, sGridLength)); + + const int* firstIdx = simdi::array(first); + const int* lastIdx = simdi::array(last); + + uint32_t* firstIt = (uint32_t*)mSphereGrid; + uint32_t* lastIt = firstIt + 3 * sGridSize; + + for(uint32_t i = 0; i < 3; ++i, firstIt += sGridSize, lastIt += sGridSize) + { + for(int j = firstIdx[i]; j <= maxIndex; ++j) + firstIt[j] |= mask; + + for(int j = lastIdx[i]; j >= 0; --j) + lastIt[j] |= mask; + } + } +} + +// generate cone masks from sphere masks +template <typename Simd4f> +void cloth::SwCollision<Simd4f>::buildConeAcceleration() +{ + const ConeData* coneIt = mCurData.mCones; + const ConeData* coneEnd = coneIt + mClothData.mNumCapsules; + for(uint32_t coneMask = 0x1; coneIt != coneEnd; ++coneIt, coneMask <<= 1) + { + if(coneIt->radius == 0.0f) + continue; + + uint32_t spheresMask = coneIt->bothMask; + + uint32_t* sphereIt = (uint32_t*)mSphereGrid; + uint32_t* sphereEnd = sphereIt + 6 * sGridSize; + uint32_t* gridIt = (uint32_t*)mConeGrid; + for(; sphereIt != sphereEnd; ++sphereIt, ++gridIt) + if(*sphereIt & spheresMask) + *gridIt |= coneMask; + } +} + +// convert right/left mask arrays into single overlap array +template <typename Simd4f> +void cloth::SwCollision<Simd4f>::mergeAcceleration(uint32_t* firstIt) +{ + uint32_t* firstEnd = firstIt + 3 * sGridSize; + uint32_t* lastIt = firstEnd; + for(; firstIt != firstEnd; ++firstIt, ++lastIt) + *firstIt &= *lastIt; +} + +// build mask of spheres/cones touching a regular grid along each axis +template <typename Simd4f> +bool cloth::SwCollision<Simd4f>::buildAcceleration() +{ + // determine sphere bbox + BoundingBox<Simd4f> sphereBounds = + expandBounds(emptyBounds<Simd4f>(), mCurData.mSpheres, mCurData.mSpheres + mClothData.mNumSpheres); + BoundingBox<Simd4f> particleBounds = 
loadBounds<Simd4f>(mClothData.mCurBounds); + if(mClothData.mEnableContinuousCollision) + { + sphereBounds = expandBounds(sphereBounds, mPrevData.mSpheres, mPrevData.mSpheres + mClothData.mNumSpheres); + particleBounds = expandBounds(particleBounds, loadBounds<Simd4f>(mClothData.mPrevBounds)); + } + + BoundingBox<Simd4f> bounds = intersectBounds(sphereBounds, particleBounds); + Simd4f edgeLength = (bounds.mUpper - bounds.mLower) & ~(Simd4f)sMaskW; + if(!allGreaterEqual(edgeLength, simd4f(_0))) + return false; + + // calculate an expanded bounds to account for numerical inaccuracy + const Simd4f expandedLower = bounds.mLower - abs(bounds.mLower) * sGridExpand; + const Simd4f expandedUpper = bounds.mUpper + abs(bounds.mUpper) * sGridExpand; + const Simd4f expandedEdgeLength = max(expandedUpper - expandedLower, sEpsilon); + + // make grid minimal thickness and strict upper bound of spheres + mGridScale = sGridLength * recipT<1>(expandedEdgeLength); + mGridBias = -expandedLower * mGridScale; + array(mGridBias)[3] = 1.0f; // needed for collideVirtualParticles() + + PX_ASSERT(allTrue(((bounds.mLower * mGridScale + mGridBias) >= simd4f(0.0f)) | sMaskW)); + PX_ASSERT(allTrue(((bounds.mUpper * mGridScale + mGridBias) < simd4f(8.0f)) | sMaskW)); + + memset(mSphereGrid, 0, sizeof(uint32_t) * 6 * (sGridSize)); + if(mClothData.mEnableContinuousCollision) + buildSphereAcceleration(mPrevData.mSpheres); + buildSphereAcceleration(mCurData.mSpheres); + + memset(mConeGrid, 0, sizeof(uint32_t) * 6 * (sGridSize)); + buildConeAcceleration(); + + return true; +} + +#ifdef _MSC_VER +#define FORCE_INLINE __forceinline +#else +#define FORCE_INLINE inline __attribute__((always_inline)) +#endif + +template <typename Simd4f> +FORCE_INLINE typename cloth::SwCollision<Simd4f>::ShapeMask& cloth::SwCollision<Simd4f>::ShapeMask:: +operator=(const ShapeMask& right) +{ + mCones = right.mCones; + mSpheres = right.mSpheres; + return *this; +} + +template <typename Simd4f> +FORCE_INLINE typename 
cloth::SwCollision<Simd4f>::ShapeMask& cloth::SwCollision<Simd4f>::ShapeMask:: +operator&=(const ShapeMask& right) +{ + mCones = mCones & right.mCones; + mSpheres = mSpheres & right.mSpheres; + return *this; +} + +template <typename Simd4f> +FORCE_INLINE typename cloth::SwCollision<Simd4f>::ShapeMask +cloth::SwCollision<Simd4f>::getShapeMask(const Simd4f& position, const Simd4i* __restrict sphereGrid, + const Simd4i* __restrict coneGrid) +{ + Gather<Simd4i> gather(intFloor(position)); + + ShapeMask result; + result.mCones = gather(coneGrid); + result.mSpheres = gather(sphereGrid); + return result; +} + +// lookup acceleration structure and return mask of potential intersectors +template <typename Simd4f> +FORCE_INLINE typename cloth::SwCollision<Simd4f>::ShapeMask +cloth::SwCollision<Simd4f>::getShapeMask(const Simd4f* __restrict positions) const +{ + Simd4f posX = positions[0] * splat<0>(mGridScale) + splat<0>(mGridBias); + Simd4f posY = positions[1] * splat<1>(mGridScale) + splat<1>(mGridBias); + Simd4f posZ = positions[2] * splat<2>(mGridScale) + splat<2>(mGridBias); + + ShapeMask result = getShapeMask(posX, mSphereGrid, mConeGrid); + result &= getShapeMask(posY, mSphereGrid + 2, mConeGrid + 2); + result &= getShapeMask(posZ, mSphereGrid + 4, mConeGrid + 4); + + return result; +} + +// lookup acceleration structure and return mask of potential intersectors +template <typename Simd4f> +FORCE_INLINE typename cloth::SwCollision<Simd4f>::ShapeMask +cloth::SwCollision<Simd4f>::getShapeMask(const Simd4f* __restrict prevPos, const Simd4f* __restrict curPos) const +{ + Simd4f scaleX = splat<0>(mGridScale); + Simd4f scaleY = splat<1>(mGridScale); + Simd4f scaleZ = splat<2>(mGridScale); + + Simd4f biasX = splat<0>(mGridBias); + Simd4f biasY = splat<1>(mGridBias); + Simd4f biasZ = splat<2>(mGridBias); + + Simd4f prevX = prevPos[0] * scaleX + biasX; + Simd4f prevY = prevPos[1] * scaleY + biasY; + Simd4f prevZ = prevPos[2] * scaleZ + biasZ; + + Simd4f curX = curPos[0] * 
scaleX + biasX; + Simd4f curY = curPos[1] * scaleY + biasY; + Simd4f curZ = curPos[2] * scaleZ + biasZ; + + Simd4f maxX = min(max(prevX, curX), sGridLength); + Simd4f maxY = min(max(prevY, curY), sGridLength); + Simd4f maxZ = min(max(prevZ, curZ), sGridLength); + + ShapeMask result = getShapeMask(maxX, mSphereGrid, mConeGrid); + result &= getShapeMask(maxY, mSphereGrid + 2, mConeGrid + 2); + result &= getShapeMask(maxZ, mSphereGrid + 4, mConeGrid + 4); + + Simd4f zero = simd4f(_0); + Simd4f minX = max(min(prevX, curX), zero); + Simd4f minY = max(min(prevY, curY), zero); + Simd4f minZ = max(min(prevZ, curZ), zero); + + result &= getShapeMask(minX, mSphereGrid + 6, mConeGrid + 6); + result &= getShapeMask(minY, mSphereGrid + 8, mConeGrid + 8); + result &= getShapeMask(minZ, mSphereGrid + 10, mConeGrid + 10); + + return result; +} + +template <typename Simd4f> +struct cloth::SwCollision<Simd4f>::ImpulseAccumulator +{ + ImpulseAccumulator() + : mDeltaX(simd4f(_0)) + , mDeltaY(mDeltaX) + , mDeltaZ(mDeltaX) + , mVelX(mDeltaX) + , mVelY(mDeltaX) + , mVelZ(mDeltaX) + , mNumCollisions(sEpsilon) + { + } + + void add(const Simd4f& x, const Simd4f& y, const Simd4f& z, const Simd4f& scale, const Simd4f& mask) + { + PX_ASSERT(allTrue((mask & x) == (mask & x))); + PX_ASSERT(allTrue((mask & y) == (mask & y))); + PX_ASSERT(allTrue((mask & z) == (mask & z))); + PX_ASSERT(allTrue((mask & scale) == (mask & scale))); + + Simd4f maskedScale = scale & mask; + mDeltaX = mDeltaX + x * maskedScale; + mDeltaY = mDeltaY + y * maskedScale; + mDeltaZ = mDeltaZ + z * maskedScale; + mNumCollisions = mNumCollisions + (simd4f(_1) & mask); + } + + void addVelocity(const Simd4f& vx, const Simd4f& vy, const Simd4f& vz, const Simd4f& mask) + { + PX_ASSERT(allTrue((mask & vx) == (mask & vx))); + PX_ASSERT(allTrue((mask & vy) == (mask & vy))); + PX_ASSERT(allTrue((mask & vz) == (mask & vz))); + + mVelX = mVelX + (vx & mask); + mVelY = mVelY + (vy & mask); + mVelZ = mVelZ + (vz & mask); + } + + void 
subtract(const Simd4f& x, const Simd4f& y, const Simd4f& z, const Simd4f& scale, const Simd4f& mask) + { + PX_ASSERT(allTrue((mask & x) == (mask & x))); + PX_ASSERT(allTrue((mask & y) == (mask & y))); + PX_ASSERT(allTrue((mask & z) == (mask & z))); + PX_ASSERT(allTrue((mask & scale) == (mask & scale))); + + Simd4f maskedScale = scale & mask; + mDeltaX = mDeltaX - x * maskedScale; + mDeltaY = mDeltaY - y * maskedScale; + mDeltaZ = mDeltaZ - z * maskedScale; + mNumCollisions = mNumCollisions + (simd4f(_1) & mask); + } + + Simd4f mDeltaX, mDeltaY, mDeltaZ; + Simd4f mVelX, mVelY, mVelZ; + Simd4f mNumCollisions; +}; + +template <typename Simd4f> +FORCE_INLINE void cloth::SwCollision<Simd4f>::collideSpheres(const Simd4i& sphereMask, const Simd4f* positions, + ImpulseAccumulator& accum) const +{ + const float* __restrict spherePtr = array(mCurData.mSpheres->center); + + bool frictionEnabled = mClothData.mFrictionScale > 0.0f; + + Simd4i mask4 = horizontalOr(sphereMask); + uint32_t mask = uint32_t(simdi::array(mask4)[0]); + while(mask) + { + uint32_t test = mask - 1; + uint32_t offset = findBitSet(mask & ~test) * sizeof(SphereData); + mask = mask & test; + + Simd4f sphere = loadAligned(spherePtr, offset); + + Simd4f deltaX = positions[0] - splat<0>(sphere); + Simd4f deltaY = positions[1] - splat<1>(sphere); + Simd4f deltaZ = positions[2] - splat<2>(sphere); + + Simd4f sqrDistance = sEpsilon + deltaX * deltaX + deltaY * deltaY + deltaZ * deltaZ; + Simd4f negativeScale = simd4f(_1) - rsqrt(sqrDistance) * splat<3>(sphere); + + Simd4f contactMask; + if(!anyGreater(simd4f(_0), negativeScale, contactMask)) + continue; + + accum.subtract(deltaX, deltaY, deltaZ, negativeScale, contactMask); + + if(frictionEnabled) + { + // load previous sphere pos + const float* __restrict prevSpherePtr = array(mPrevData.mSpheres->center); + + Simd4f prevSphere = loadAligned(prevSpherePtr, offset); + Simd4f velocity = sphere - prevSphere; + + accum.addVelocity(splat<0>(velocity), 
splat<1>(velocity), splat<2>(velocity), contactMask); + } + } +} + +template <typename Simd4f> +FORCE_INLINE typename cloth::SwCollision<Simd4f>::Simd4i +cloth::SwCollision<Simd4f>::collideCones(const Simd4f* __restrict positions, ImpulseAccumulator& accum) const +{ + const float* __restrict centerPtr = array(mCurData.mCones->center); + const float* __restrict axisPtr = array(mCurData.mCones->axis); + const float* __restrict auxiliaryPtr = &mCurData.mCones->sqrCosine; + + bool frictionEnabled = mClothData.mFrictionScale > 0.0f; + + ShapeMask shapeMask = getShapeMask(positions); + Simd4i mask4 = horizontalOr(shapeMask.mCones); + uint32_t mask = uint32_t(simdi::array(mask4)[0]); + while(mask) + { + uint32_t test = mask - 1; + uint32_t coneIndex = findBitSet(mask & ~test); + uint32_t offset = coneIndex * sizeof(ConeData); + mask = mask & test; + + Simd4i test4 = simdi::operator-(mask4, simd4i(_1)); + Simd4f culled = simd4f(andNotIsZero(shapeMask.mCones, test4)); + mask4 = mask4 & test4; + + Simd4f center = loadAligned(centerPtr, offset); + + Simd4f deltaX = positions[0] - splat<0>(center); + Simd4f deltaY = positions[1] - splat<1>(center); + Simd4f deltaZ = positions[2] - splat<2>(center); + + Simd4f axis = loadAligned(axisPtr, offset); + + Simd4f axisX = splat<0>(axis); + Simd4f axisY = splat<1>(axis); + Simd4f axisZ = splat<2>(axis); + Simd4f slope = splat<3>(axis); + + Simd4f dot = deltaX * axisX + deltaY * axisY + deltaZ * axisZ; + Simd4f radius = dot * slope + splat<3>(center); + + // set radius to zero if cone is culled + radius = max(radius, sZero) & ~culled; + + Simd4f sqrDistance = deltaX * deltaX + deltaY * deltaY + deltaZ * deltaZ - dot * dot; + + Simd4i auxiliary = simd4i((Simd4f)loadAligned(auxiliaryPtr, offset)); + Simd4i bothMask = splat<3>(auxiliary); + + Simd4f contactMask; + if(!anyGreater(radius * radius, sqrDistance, contactMask)) + { + // cone only culled when spheres culled, ok to clear those too + shapeMask.mSpheres = shapeMask.mSpheres & 
~bothMask; + continue; + } + + // clamp to a small positive epsilon to avoid numerical error + // making sqrDistance negative when point lies on the cone axis + sqrDistance = max(sqrDistance, sEpsilon); + + Simd4f invDistance = rsqrt(sqrDistance); + Simd4f base = dot + slope * sqrDistance * invDistance; + + // force left/rightMask to false if not inside cone + base = base & contactMask; + + Simd4f halfLength = splat<1>(simd4f(auxiliary)); + Simd4i leftMask = simd4i(base < -halfLength); + Simd4i rightMask = simd4i(base > halfLength); + + // we use both mask because of the early out above. + Simd4i firstMask = splat<2>(auxiliary); + Simd4i secondMask = firstMask ^ bothMask; + shapeMask.mSpheres = shapeMask.mSpheres & ~(firstMask & ~leftMask); + shapeMask.mSpheres = shapeMask.mSpheres & ~(secondMask & ~rightMask); + + deltaX = deltaX - base * axisX; + deltaY = deltaY - base * axisY; + deltaZ = deltaZ - base * axisZ; + + Simd4f sqrCosine = splat<0>(simd4f(auxiliary)); + Simd4f scale = radius * invDistance * sqrCosine - sqrCosine; + + contactMask = contactMask & ~simd4f(leftMask | rightMask); + + if(!anyTrue(contactMask)) + continue; + + accum.add(deltaX, deltaY, deltaZ, scale, contactMask); + + if(frictionEnabled) + { + uint32_t s0 = mClothData.mCapsuleIndices[coneIndex].first; + uint32_t s1 = mClothData.mCapsuleIndices[coneIndex].second; + + float* prevSpheres = reinterpret_cast<float*>(mPrevData.mSpheres); + float* curSpheres = reinterpret_cast<float*>(mCurData.mSpheres); + + // todo: could pre-compute sphere velocities or it might be + // faster to compute cur/prev sphere positions directly + Simd4f s0p0 = loadAligned(prevSpheres, s0 * sizeof(SphereData)); + Simd4f s0p1 = loadAligned(curSpheres, s0 * sizeof(SphereData)); + + Simd4f s1p0 = loadAligned(prevSpheres, s1 * sizeof(SphereData)); + Simd4f s1p1 = loadAligned(curSpheres, s1 * sizeof(SphereData)); + + Simd4f v0 = s0p1 - s0p0; + Simd4f v1 = s1p1 - s1p0; + Simd4f vd = v1 - v0; + + // dot is in the range -1 to 1, 
scale and bias to 0 to 1 + dot = dot * sHalf + sHalf; + + // interpolate velocity at contact points + Simd4f vx = splat<0>(v0) + dot * splat<0>(vd); + Simd4f vy = splat<1>(v0) + dot * splat<1>(vd); + Simd4f vz = splat<2>(v0) + dot * splat<2>(vd); + + accum.addVelocity(vx, vy, vz, contactMask); + } + } + + return shapeMask.mSpheres; +} + +template <typename Simd4f> +FORCE_INLINE void cloth::SwCollision<Simd4f>::collideSpheres(const Simd4i& sphereMask, const Simd4f* __restrict prevPos, + Simd4f* __restrict curPos, ImpulseAccumulator& accum) const +{ + const float* __restrict prevSpheres = array(mPrevData.mSpheres->center); + const float* __restrict curSpheres = array(mCurData.mSpheres->center); + + bool frictionEnabled = mClothData.mFrictionScale > 0.0f; + + Simd4i mask4 = horizontalOr(sphereMask); + uint32_t mask = uint32_t(simdi::array(mask4)[0]); + while(mask) + { + uint32_t test = mask - 1; + uint32_t offset = findBitSet(mask & ~test) * sizeof(SphereData); + mask = mask & test; + + Simd4f prevSphere = loadAligned(prevSpheres, offset); + Simd4f prevX = prevPos[0] - splat<0>(prevSphere); + Simd4f prevY = prevPos[1] - splat<1>(prevSphere); + Simd4f prevZ = prevPos[2] - splat<2>(prevSphere); + Simd4f prevRadius = splat<3>(prevSphere); + + Simd4f curSphere = loadAligned(curSpheres, offset); + Simd4f curX = curPos[0] - splat<0>(curSphere); + Simd4f curY = curPos[1] - splat<1>(curSphere); + Simd4f curZ = curPos[2] - splat<2>(curSphere); + Simd4f curRadius = splat<3>(curSphere); + + Simd4f sqrDistance = sEpsilon + curX * curX + curY * curY + curZ * curZ; + + Simd4f dotPrevPrev = prevX * prevX + prevY * prevY + prevZ * prevZ - prevRadius * prevRadius; + Simd4f dotPrevCur = prevX * curX + prevY * curY + prevZ * curZ - prevRadius * curRadius; + Simd4f dotCurCur = sqrDistance - curRadius * curRadius; + + Simd4f discriminant = dotPrevCur * dotPrevCur - dotCurCur * dotPrevPrev; + Simd4f sqrtD = sqrt(discriminant); + Simd4f halfB = dotPrevCur - dotPrevPrev; + Simd4f minusA = 
dotPrevCur - dotCurCur + halfB; + + // time of impact or 0 if prevPos inside sphere + Simd4f toi = recip(minusA) * min(simd4f(_0), halfB + sqrtD); + Simd4f collisionMask = (toi < simd4f(_1)) & (halfB < sqrtD); + + // skip continuous collision if the (un-clamped) particle + // trajectory only touches the outer skin of the cone. + Simd4f rMin = prevRadius + halfB * minusA * (curRadius - prevRadius); + collisionMask = collisionMask & (discriminant > minusA * rMin * rMin * sSkeletonWidth); + + // a is negative when one sphere is contained in the other, + // which is already handled by discrete collision. + collisionMask = collisionMask & (minusA < -(Simd4f)sEpsilon); + + if(!allEqual(collisionMask, simd4f(_0))) + { + Simd4f deltaX = prevX - curX; + Simd4f deltaY = prevY - curY; + Simd4f deltaZ = prevZ - curZ; + + Simd4f oneMinusToi = (simd4f(_1) - toi) & collisionMask; + + // reduce ccd impulse if (clamped) particle trajectory stays in sphere skin, + // i.e. scale by exp2(-k) or 1/(1+k) with k = (tmin - toi) / (1 - toi) + Simd4f minusK = sqrtD * recip(minusA * oneMinusToi) & (oneMinusToi > sEpsilon); + oneMinusToi = oneMinusToi * recip(sOne - minusK); + + curX = curX + deltaX * oneMinusToi; + curY = curY + deltaY * oneMinusToi; + curZ = curZ + deltaZ * oneMinusToi; + + curPos[0] = splat<0>(curSphere) + curX; + curPos[1] = splat<1>(curSphere) + curY; + curPos[2] = splat<2>(curSphere) + curZ; + + sqrDistance = sEpsilon + curX * curX + curY * curY + curZ * curZ; + } + + Simd4f negativeScale = simd4f(_1) - rsqrt(sqrDistance) * curRadius; + + Simd4f contactMask; + if(!anyGreater(simd4f(_0), negativeScale, contactMask)) + continue; + + accum.subtract(curX, curY, curZ, negativeScale, contactMask); + + if(frictionEnabled) + { + Simd4f velocity = curSphere - prevSphere; + accum.addVelocity(splat<0>(velocity), splat<1>(velocity), splat<2>(velocity), contactMask); + } + } +} + +template <typename Simd4f> +FORCE_INLINE typename cloth::SwCollision<Simd4f>::Simd4i 
+cloth::SwCollision<Simd4f>::collideCones(const Simd4f* __restrict prevPos, Simd4f* __restrict curPos, + ImpulseAccumulator& accum) const +{ + const float* __restrict prevCenterPtr = array(mPrevData.mCones->center); + const float* __restrict prevAxisPtr = array(mPrevData.mCones->axis); + const float* __restrict prevAuxiliaryPtr = &mPrevData.mCones->sqrCosine; + + const float* __restrict curCenterPtr = array(mCurData.mCones->center); + const float* __restrict curAxisPtr = array(mCurData.mCones->axis); + const float* __restrict curAuxiliaryPtr = &mCurData.mCones->sqrCosine; + + bool frictionEnabled = mClothData.mFrictionScale > 0.0f; + + ShapeMask shapeMask = getShapeMask(prevPos, curPos); + Simd4i mask4 = horizontalOr(shapeMask.mCones); + uint32_t mask = uint32_t(simdi::array(mask4)[0]); + while(mask) + { + uint32_t test = mask - 1; + uint32_t coneIndex = findBitSet(mask & ~test); + uint32_t offset = coneIndex * sizeof(ConeData); + mask = mask & test; + + Simd4i test4 = simdi::operator-(mask4, simd4i(_1)); + Simd4f culled = simd4f(andNotIsZero(shapeMask.mCones, test4)); + mask4 = mask4 & test4; + + Simd4f prevCenter = loadAligned(prevCenterPtr, offset); + Simd4f prevAxis = loadAligned(prevAxisPtr, offset); + Simd4f prevAxisX = splat<0>(prevAxis); + Simd4f prevAxisY = splat<1>(prevAxis); + Simd4f prevAxisZ = splat<2>(prevAxis); + Simd4f prevSlope = splat<3>(prevAxis); + + Simd4f prevX = prevPos[0] - splat<0>(prevCenter); + Simd4f prevY = prevPos[1] - splat<1>(prevCenter); + Simd4f prevZ = prevPos[2] - splat<2>(prevCenter); + Simd4f prevT = prevY * prevAxisZ - prevZ * prevAxisY; + Simd4f prevU = prevZ * prevAxisX - prevX * prevAxisZ; + Simd4f prevV = prevX * prevAxisY - prevY * prevAxisX; + Simd4f prevDot = prevX * prevAxisX + prevY * prevAxisY + prevZ * prevAxisZ; + Simd4f prevRadius = prevDot * prevSlope + splat<3>(prevCenter); + + Simd4f curCenter = loadAligned(curCenterPtr, offset); + Simd4f curAxis = loadAligned(curAxisPtr, offset); + Simd4f curAxisX = 
splat<0>(curAxis); + Simd4f curAxisY = splat<1>(curAxis); + Simd4f curAxisZ = splat<2>(curAxis); + Simd4f curSlope = splat<3>(curAxis); + Simd4i curAuxiliary = simd4i((Simd4f)loadAligned(curAuxiliaryPtr, offset)); + + Simd4f curX = curPos[0] - splat<0>(curCenter); + Simd4f curY = curPos[1] - splat<1>(curCenter); + Simd4f curZ = curPos[2] - splat<2>(curCenter); + Simd4f curT = curY * curAxisZ - curZ * curAxisY; + Simd4f curU = curZ * curAxisX - curX * curAxisZ; + Simd4f curV = curX * curAxisY - curY * curAxisX; + Simd4f curDot = curX * curAxisX + curY * curAxisY + curZ * curAxisZ; + Simd4f curRadius = curDot * curSlope + splat<3>(curCenter); + + Simd4f curSqrDistance = sEpsilon + curT * curT + curU * curU + curV * curV; + + // set radius to zero if cone is culled + prevRadius = max(prevRadius, simd4f(_0)) & ~culled; + curRadius = max(curRadius, simd4f(_0)) & ~culled; + + Simd4f dotPrevPrev = prevT * prevT + prevU * prevU + prevV * prevV - prevRadius * prevRadius; + Simd4f dotPrevCur = prevT * curT + prevU * curU + prevV * curV - prevRadius * curRadius; + Simd4f dotCurCur = curSqrDistance - curRadius * curRadius; + + Simd4f discriminant = dotPrevCur * dotPrevCur - dotCurCur * dotPrevPrev; + Simd4f sqrtD = sqrt(discriminant); + Simd4f halfB = dotPrevCur - dotPrevPrev; + Simd4f minusA = dotPrevCur - dotCurCur + halfB; + + // time of impact or 0 if prevPos inside cone + Simd4f toi = recip(minusA) * min(simd4f(_0), halfB + sqrtD); + Simd4f collisionMask = (toi < simd4f(_1)) & (halfB < sqrtD); + + // skip continuous collision if the (un-clamped) particle + // trajectory only touches the outer skin of the cone. + Simd4f rMin = prevRadius + halfB * minusA * (curRadius - prevRadius); + collisionMask = collisionMask & (discriminant > minusA * rMin * rMin * sSkeletonWidth); + + // a is negative when one cone is contained in the other, + // which is already handled by discrete collision. 
+ collisionMask = collisionMask & (minusA < -(Simd4f)sEpsilon); + + // test if any particle hits infinite cone (and 0<time of impact<1) + if(!allEqual(collisionMask, simd4f(_0))) + { + Simd4f deltaX = prevX - curX; + Simd4f deltaY = prevY - curY; + Simd4f deltaZ = prevZ - curZ; + + // interpolate delta at toi + Simd4f posX = prevX - deltaX * toi; + Simd4f posY = prevY - deltaY * toi; + Simd4f posZ = prevZ - deltaZ * toi; + + Simd4f curScaledAxis = curAxis * splat<1>(simd4f(curAuxiliary)); + Simd4i prevAuxiliary = simd4i((Simd4f)loadAligned(prevAuxiliaryPtr, offset)); + Simd4f deltaScaledAxis = curScaledAxis - prevAxis * splat<1>(simd4f(prevAuxiliary)); + + Simd4f oneMinusToi = simd4f(_1) - toi; + + // interpolate axis at toi + Simd4f axisX = splat<0>(curScaledAxis) - splat<0>(deltaScaledAxis) * oneMinusToi; + Simd4f axisY = splat<1>(curScaledAxis) - splat<1>(deltaScaledAxis) * oneMinusToi; + Simd4f axisZ = splat<2>(curScaledAxis) - splat<2>(deltaScaledAxis) * oneMinusToi; + Simd4f slope = (prevSlope * oneMinusToi + curSlope * toi); + + Simd4f sqrHalfLength = axisX * axisX + axisY * axisY + axisZ * axisZ; + Simd4f invHalfLength = rsqrt(sqrHalfLength); + Simd4f dot = (posX * axisX + posY * axisY + posZ * axisZ) * invHalfLength; + + Simd4f sqrDistance = posX * posX + posY * posY + posZ * posZ - dot * dot; + Simd4f invDistance = rsqrt(sqrDistance) & (sqrDistance > simd4f(_0)); + + Simd4f base = dot + slope * sqrDistance * invDistance; + Simd4f scale = base * invHalfLength & collisionMask; + + Simd4f cullMask = (abs(scale) < simd4f(_1)) & collisionMask; + + // test if any impact position is in cone section + if(!allEqual(cullMask, simd4f(_0))) + { + deltaX = deltaX + splat<0>(deltaScaledAxis) * scale; + deltaY = deltaY + splat<1>(deltaScaledAxis) * scale; + deltaZ = deltaZ + splat<2>(deltaScaledAxis) * scale; + + oneMinusToi = oneMinusToi & cullMask; + + // reduce ccd impulse if (clamped) particle trajectory stays in cone skin, + // i.e. 
scale by exp2(-k) or 1/(1+k) with k = (tmin - toi) / (1 - toi) + // oneMinusToi = oneMinusToi * recip(sOne - sqrtD * recip(minusA * oneMinusToi)); + Simd4f minusK = sqrtD * recip(minusA * oneMinusToi) & (oneMinusToi > sEpsilon); + oneMinusToi = oneMinusToi * recip(sOne - minusK); + + curX = curX + deltaX * oneMinusToi; + curY = curY + deltaY * oneMinusToi; + curZ = curZ + deltaZ * oneMinusToi; + + curDot = curX * curAxisX + curY * curAxisY + curZ * curAxisZ; + curRadius = curDot * curSlope + splat<3>(curCenter); + curRadius = max(curRadius, simd4f(_0)) & ~culled; + curSqrDistance = curX * curX + curY * curY + curZ * curZ - curDot * curDot; + + curPos[0] = splat<0>(curCenter) + curX; + curPos[1] = splat<1>(curCenter) + curY; + curPos[2] = splat<2>(curCenter) + curZ; + } + } + + // curPos inside cone (discrete collision) + Simd4f contactMask; + int anyContact = anyGreater(curRadius * curRadius, curSqrDistance, contactMask); + + Simd4i bothMask = splat<3>(curAuxiliary); + + // instead of culling continuous collision for ~collisionMask, and discrete + // collision for ~contactMask, disable both if ~collisionMask & ~contactMask + Simd4i cullMask = bothMask & ~simd4i(collisionMask | contactMask); + shapeMask.mSpheres = shapeMask.mSpheres & ~cullMask; + + if(!anyContact) + continue; + + Simd4f invDistance = rsqrt(curSqrDistance) & (curSqrDistance > sZero); + Simd4f base = curDot + curSlope * curSqrDistance * invDistance; + + Simd4f halfLength = splat<1>(simd4f(curAuxiliary)); + Simd4i leftMask = simd4i(base < -halfLength); + Simd4i rightMask = simd4i(base > halfLength); + + // can only skip continuous sphere collision if post-ccd position + // is on code side *and* particle had cone-ccd collision. 
+ Simd4i firstMask = splat<2>(curAuxiliary); + Simd4i secondMask = firstMask ^ bothMask; + cullMask = (firstMask & ~leftMask) | (secondMask & ~rightMask); + shapeMask.mSpheres = shapeMask.mSpheres & ~(cullMask & simd4i(collisionMask)); + + Simd4f deltaX = curX - base * curAxisX; + Simd4f deltaY = curY - base * curAxisY; + Simd4f deltaZ = curZ - base * curAxisZ; + + Simd4f sqrCosine = splat<0>(simd4f(curAuxiliary)); + Simd4f scale = curRadius * invDistance * sqrCosine - sqrCosine; + + contactMask = contactMask & ~simd4f(leftMask | rightMask); + + if(!anyTrue(contactMask)) + continue; + + accum.add(deltaX, deltaY, deltaZ, scale, contactMask); + + if(frictionEnabled) + { + uint32_t s0 = mClothData.mCapsuleIndices[coneIndex].first; + uint32_t s1 = mClothData.mCapsuleIndices[coneIndex].second; + + float* prevSpheres = reinterpret_cast<float*>(mPrevData.mSpheres); + float* curSpheres = reinterpret_cast<float*>(mCurData.mSpheres); + + // todo: could pre-compute sphere velocities or it might be + // faster to compute cur/prev sphere positions directly + Simd4f s0p0 = loadAligned(prevSpheres, s0 * sizeof(SphereData)); + Simd4f s0p1 = loadAligned(curSpheres, s0 * sizeof(SphereData)); + + Simd4f s1p0 = loadAligned(prevSpheres, s1 * sizeof(SphereData)); + Simd4f s1p1 = loadAligned(curSpheres, s1 * sizeof(SphereData)); + + Simd4f v0 = s0p1 - s0p0; + Simd4f v1 = s1p1 - s1p0; + Simd4f vd = v1 - v0; + + // dot is in the range -1 to 1, scale and bias to 0 to 1 + curDot = curDot * sHalf + sHalf; + + // interpolate velocity at contact points + Simd4f vx = splat<0>(v0) + curDot * splat<0>(vd); + Simd4f vy = splat<1>(v0) + curDot * splat<1>(vd); + Simd4f vz = splat<2>(v0) + curDot * splat<2>(vd); + + accum.addVelocity(vx, vy, vz, contactMask); + } + } + + return shapeMask.mSpheres; +} + +namespace +{ + +template <typename Simd4f> +PX_INLINE void calculateFrictionImpulse(const Simd4f& deltaX, const Simd4f& deltaY, const Simd4f& deltaZ, + const Simd4f& velX, const Simd4f& velY, const 
Simd4f& velZ, + const Simd4f* curPos, const Simd4f* prevPos, const Simd4f& scale, + const Simd4f& coefficient, const Simd4f& mask, Simd4f* impulse) +{ + // calculate collision normal + Simd4f deltaSq = deltaX * deltaX + deltaY * deltaY + deltaZ * deltaZ; + + Simd4f rcpDelta = rsqrt(deltaSq + sEpsilon); + + Simd4f nx = deltaX * rcpDelta; + Simd4f ny = deltaY * rcpDelta; + Simd4f nz = deltaZ * rcpDelta; + + // calculate relative velocity scaled by number of collisions + Simd4f rvx = curPos[0] - prevPos[0] - velX * scale; + Simd4f rvy = curPos[1] - prevPos[1] - velY * scale; + Simd4f rvz = curPos[2] - prevPos[2] - velZ * scale; + + // calculate magnitude of relative normal velocity + Simd4f rvn = rvx * nx + rvy * ny + rvz * nz; + + // calculate relative tangential velocity + Simd4f rvtx = rvx - rvn * nx; + Simd4f rvty = rvy - rvn * ny; + Simd4f rvtz = rvz - rvn * nz; + + // calculate magnitude of vt + Simd4f rcpVt = rsqrt(rvtx * rvtx + rvty * rvty + rvtz * rvtz + sEpsilon); + + // magnitude of friction impulse (cannot be greater than -vt) + Simd4f j = max(-coefficient * deltaSq * rcpDelta * rcpVt, sNegOne) & mask; + + impulse[0] = rvtx * j; + impulse[1] = rvty * j; + impulse[2] = rvtz * j; +} + +} // anonymous namespace + +template <typename Simd4f> +void cloth::SwCollision<Simd4f>::collideParticles() +{ + const bool massScalingEnabled = mClothData.mCollisionMassScale > 0.0f; + const Simd4f massScale = simd4f(mClothData.mCollisionMassScale); + + const bool frictionEnabled = mClothData.mFrictionScale > 0.0f; + const Simd4f frictionScale = simd4f(mClothData.mFrictionScale); + + Simd4f curPos[4]; + Simd4f prevPos[4]; + + float* __restrict prevIt = mClothData.mPrevParticles; + float* __restrict pIt = mClothData.mCurParticles; + float* __restrict pEnd = pIt + mClothData.mNumParticles * 4; + for(; pIt < pEnd; pIt += 16, prevIt += 16) + { + curPos[0] = loadAligned(pIt, 0); + curPos[1] = loadAligned(pIt, 16); + curPos[2] = loadAligned(pIt, 32); + curPos[3] = loadAligned(pIt, 
48); + transpose(curPos[0], curPos[1], curPos[2], curPos[3]); + + ImpulseAccumulator accum; + Simd4i sphereMask = collideCones(curPos, accum); + collideSpheres(sphereMask, curPos, accum); + + Simd4f mask; + if(!anyGreater(accum.mNumCollisions, sEpsilon, mask)) + continue; + + Simd4f invNumCollisions = recip(accum.mNumCollisions); + + if(frictionEnabled) + { + prevPos[0] = loadAligned(prevIt, 0); + prevPos[1] = loadAligned(prevIt, 16); + prevPos[2] = loadAligned(prevIt, 32); + prevPos[3] = loadAligned(prevIt, 48); + transpose(prevPos[0], prevPos[1], prevPos[2], prevPos[3]); + + Simd4f frictionImpulse[3]; + calculateFrictionImpulse(accum.mDeltaX, accum.mDeltaY, accum.mDeltaZ, accum.mVelX, accum.mVelY, accum.mVelZ, + curPos, prevPos, invNumCollisions, frictionScale, mask, frictionImpulse); + + prevPos[0] = prevPos[0] - frictionImpulse[0]; + prevPos[1] = prevPos[1] - frictionImpulse[1]; + prevPos[2] = prevPos[2] - frictionImpulse[2]; + + transpose(prevPos[0], prevPos[1], prevPos[2], prevPos[3]); + storeAligned(prevIt, 0, prevPos[0]); + storeAligned(prevIt, 16, prevPos[1]); + storeAligned(prevIt, 32, prevPos[2]); + storeAligned(prevIt, 48, prevPos[3]); + } + + if(massScalingEnabled) + { + // calculate the inverse mass scale based on the collision impulse magnitude + Simd4f dSq = invNumCollisions * invNumCollisions * + (accum.mDeltaX * accum.mDeltaX + accum.mDeltaY * accum.mDeltaY + accum.mDeltaZ * accum.mDeltaZ); + + Simd4f scale = recip(sOne + massScale * dSq); + + // scale invmass + curPos[3] = select(mask, curPos[3] * scale, curPos[3]); + } + + curPos[0] = curPos[0] + accum.mDeltaX * invNumCollisions; + curPos[1] = curPos[1] + accum.mDeltaY * invNumCollisions; + curPos[2] = curPos[2] + accum.mDeltaZ * invNumCollisions; + + transpose(curPos[0], curPos[1], curPos[2], curPos[3]); + storeAligned(pIt, 0, curPos[0]); + storeAligned(pIt, 16, curPos[1]); + storeAligned(pIt, 32, curPos[2]); + storeAligned(pIt, 48, curPos[3]); + +#if PX_PROFILE || PX_DEBUG + mNumCollisions += 
horizontalSum(accum.mNumCollisions); +#endif + } +} + +template <typename Simd4f> +void cloth::SwCollision<Simd4f>::collideVirtualParticles() +{ + const bool massScalingEnabled = mClothData.mCollisionMassScale > 0.0f; + const Simd4f massScale = simd4f(mClothData.mCollisionMassScale); + + const bool frictionEnabled = mClothData.mFrictionScale > 0.0f; + const Simd4f frictionScale = simd4f(mClothData.mFrictionScale); + + Simd4f curPos[3]; + + const float* __restrict weights = mClothData.mVirtualParticleWeights; + float* __restrict particles = mClothData.mCurParticles; + float* __restrict prevParticles = mClothData.mPrevParticles; + + // move dummy particles outside of collision range + Simd4f* __restrict dummy = mClothData.mNumParticles + reinterpret_cast<Simd4f*>(mClothData.mCurParticles); + Simd4f invGridScale = recip(mGridScale) & (mGridScale > sEpsilon); + dummy[0] = dummy[1] = dummy[2] = invGridScale * mGridBias - invGridScale; + + const uint16_t* __restrict vpIt = mClothData.mVirtualParticlesBegin; + const uint16_t* __restrict vpEnd = mClothData.mVirtualParticlesEnd; + for(; vpIt != vpEnd; vpIt += 16) + { + // load 12 particles and 4 weights + Simd4f p0v0 = loadAligned(particles, vpIt[0] * sizeof(PxVec4)); + Simd4f p0v1 = loadAligned(particles, vpIt[1] * sizeof(PxVec4)); + Simd4f p0v2 = loadAligned(particles, vpIt[2] * sizeof(PxVec4)); + Simd4f w0 = loadAligned(weights, vpIt[3] * sizeof(PxVec4)); + + Simd4f p1v0 = loadAligned(particles, vpIt[4] * sizeof(PxVec4)); + Simd4f p1v1 = loadAligned(particles, vpIt[5] * sizeof(PxVec4)); + Simd4f p1v2 = loadAligned(particles, vpIt[6] * sizeof(PxVec4)); + Simd4f w1 = loadAligned(weights, vpIt[7] * sizeof(PxVec4)); + + Simd4f p2v0 = loadAligned(particles, vpIt[8] * sizeof(PxVec4)); + Simd4f p2v1 = loadAligned(particles, vpIt[9] * sizeof(PxVec4)); + Simd4f p2v2 = loadAligned(particles, vpIt[10] * sizeof(PxVec4)); + Simd4f w2 = loadAligned(weights, vpIt[11] * sizeof(PxVec4)); + + Simd4f p3v1 = loadAligned(particles, vpIt[13] 
* sizeof(PxVec4)); + Simd4f p3v0 = loadAligned(particles, vpIt[12] * sizeof(PxVec4)); + Simd4f p3v2 = loadAligned(particles, vpIt[14] * sizeof(PxVec4)); + Simd4f w3 = loadAligned(weights, vpIt[15] * sizeof(PxVec4)); + + // interpolate particles and transpose + Simd4f px = p0v0 * splat<0>(w0) + p0v1 * splat<1>(w0) + p0v2 * splat<2>(w0); + Simd4f py = p1v0 * splat<0>(w1) + p1v1 * splat<1>(w1) + p1v2 * splat<2>(w1); + Simd4f pz = p2v0 * splat<0>(w2) + p2v1 * splat<1>(w2) + p2v2 * splat<2>(w2); + Simd4f pw = p3v0 * splat<0>(w3) + p3v1 * splat<1>(w3) + p3v2 * splat<2>(w3); + transpose(px, py, pz, pw); + + curPos[0] = px; + curPos[1] = py; + curPos[2] = pz; + + ImpulseAccumulator accum; + Simd4i sphereMask = collideCones(curPos, accum); + collideSpheres(sphereMask, curPos, accum); + + Simd4f mask; + if(!anyGreater(accum.mNumCollisions, sEpsilon, mask)) + continue; + + Simd4f invNumCollisions = recip(accum.mNumCollisions); + + // displacement and transpose back + Simd4f d0 = accum.mDeltaX * invNumCollisions; + Simd4f d1 = accum.mDeltaY * invNumCollisions; + Simd4f d2 = accum.mDeltaZ * invNumCollisions; + Simd4f d3 = sZero; + transpose(d0, d1, d2, d3); + + // scale weights by 1/dot(w,w) + Simd4f rw0 = w0 * splat<3>(w0); + Simd4f rw1 = w1 * splat<3>(w1); + Simd4f rw2 = w2 * splat<3>(w2); + Simd4f rw3 = w3 * splat<3>(w3); + + if(frictionEnabled) + { + Simd4f q0v0 = loadAligned(prevParticles, vpIt[0] * sizeof(PxVec4)); + Simd4f q0v1 = loadAligned(prevParticles, vpIt[1] * sizeof(PxVec4)); + Simd4f q0v2 = loadAligned(prevParticles, vpIt[2] * sizeof(PxVec4)); + + Simd4f q1v0 = loadAligned(prevParticles, vpIt[4] * sizeof(PxVec4)); + Simd4f q1v1 = loadAligned(prevParticles, vpIt[5] * sizeof(PxVec4)); + Simd4f q1v2 = loadAligned(prevParticles, vpIt[6] * sizeof(PxVec4)); + + Simd4f q2v0 = loadAligned(prevParticles, vpIt[8] * sizeof(PxVec4)); + Simd4f q2v1 = loadAligned(prevParticles, vpIt[9] * sizeof(PxVec4)); + Simd4f q2v2 = loadAligned(prevParticles, vpIt[10] * sizeof(PxVec4)); + 
+ Simd4f q3v0 = loadAligned(prevParticles, vpIt[12] * sizeof(PxVec4)); + Simd4f q3v1 = loadAligned(prevParticles, vpIt[13] * sizeof(PxVec4)); + Simd4f q3v2 = loadAligned(prevParticles, vpIt[14] * sizeof(PxVec4)); + + // calculate previous interpolated positions + Simd4f qx = q0v0 * splat<0>(w0) + q0v1 * splat<1>(w0) + q0v2 * splat<2>(w0); + Simd4f qy = q1v0 * splat<0>(w1) + q1v1 * splat<1>(w1) + q1v2 * splat<2>(w1); + Simd4f qz = q2v0 * splat<0>(w2) + q2v1 * splat<1>(w2) + q2v2 * splat<2>(w2); + Simd4f qw = q3v0 * splat<0>(w3) + q3v1 * splat<1>(w3) + q3v2 * splat<2>(w3); + transpose(qx, qy, qz, qw); + + Simd4f prevPos[3] = { qx, qy, qz }; + Simd4f frictionImpulse[4]; + frictionImpulse[3] = sZero; + + calculateFrictionImpulse(accum.mDeltaX, accum.mDeltaY, accum.mDeltaZ, accum.mVelX, accum.mVelY, accum.mVelZ, + curPos, prevPos, invNumCollisions, frictionScale, mask, frictionImpulse); + + transpose(frictionImpulse[0], frictionImpulse[1], frictionImpulse[2], frictionImpulse[3]); + + q0v0 = q0v0 - (splat<0>(rw0) * frictionImpulse[0]); + q0v1 = q0v1 - (splat<1>(rw0) * frictionImpulse[0]); + q0v2 = q0v2 - (splat<2>(rw0) * frictionImpulse[0]); + + q1v0 = q1v0 - (splat<0>(rw1) * frictionImpulse[1]); + q1v1 = q1v1 - (splat<1>(rw1) * frictionImpulse[1]); + q1v2 = q1v2 - (splat<2>(rw1) * frictionImpulse[1]); + + q2v0 = q2v0 - (splat<0>(rw2) * frictionImpulse[2]); + q2v1 = q2v1 - (splat<1>(rw2) * frictionImpulse[2]); + q2v2 = q2v2 - (splat<2>(rw2) * frictionImpulse[2]); + + q3v0 = q3v0 - (splat<0>(rw3) * frictionImpulse[3]); + q3v1 = q3v1 - (splat<1>(rw3) * frictionImpulse[3]); + q3v2 = q3v2 - (splat<2>(rw3) * frictionImpulse[3]); + + // write back prev particles + storeAligned(prevParticles, vpIt[0] * sizeof(PxVec4), q0v0); + storeAligned(prevParticles, vpIt[1] * sizeof(PxVec4), q0v1); + storeAligned(prevParticles, vpIt[2] * sizeof(PxVec4), q0v2); + + storeAligned(prevParticles, vpIt[4] * sizeof(PxVec4), q1v0); + storeAligned(prevParticles, vpIt[5] * sizeof(PxVec4), q1v1); + 
storeAligned(prevParticles, vpIt[6] * sizeof(PxVec4), q1v2); + + storeAligned(prevParticles, vpIt[8] * sizeof(PxVec4), q2v0); + storeAligned(prevParticles, vpIt[9] * sizeof(PxVec4), q2v1); + storeAligned(prevParticles, vpIt[10] * sizeof(PxVec4), q2v2); + + storeAligned(prevParticles, vpIt[12] * sizeof(PxVec4), q3v0); + storeAligned(prevParticles, vpIt[13] * sizeof(PxVec4), q3v1); + storeAligned(prevParticles, vpIt[14] * sizeof(PxVec4), q3v2); + } + + if(massScalingEnabled) + { + // calculate the inverse mass scale based on the collision impulse + Simd4f dSq = invNumCollisions * invNumCollisions * + (accum.mDeltaX * accum.mDeltaX + accum.mDeltaY * accum.mDeltaY + accum.mDeltaZ * accum.mDeltaZ); + + Simd4f weightScale = recip(sOne + massScale * dSq); + + weightScale = weightScale - sOne; + Simd4f s0 = sOne + splat<0>(weightScale) * (w0 & splat<0>(mask)); + Simd4f s1 = sOne + splat<1>(weightScale) * (w1 & splat<1>(mask)); + Simd4f s2 = sOne + splat<2>(weightScale) * (w2 & splat<2>(mask)); + Simd4f s3 = sOne + splat<3>(weightScale) * (w3 & splat<3>(mask)); + + p0v0 = p0v0 * (sOneXYZ | (splat<0>(s0) & sMaskW)); + p0v1 = p0v1 * (sOneXYZ | (splat<1>(s0) & sMaskW)); + p0v2 = p0v2 * (sOneXYZ | (splat<2>(s0) & sMaskW)); + + p1v0 = p1v0 * (sOneXYZ | (splat<0>(s1) & sMaskW)); + p1v1 = p1v1 * (sOneXYZ | (splat<1>(s1) & sMaskW)); + p1v2 = p1v2 * (sOneXYZ | (splat<2>(s1) & sMaskW)); + + p2v0 = p2v0 * (sOneXYZ | (splat<0>(s2) & sMaskW)); + p2v1 = p2v1 * (sOneXYZ | (splat<1>(s2) & sMaskW)); + p2v2 = p2v2 * (sOneXYZ | (splat<2>(s2) & sMaskW)); + + p3v0 = p3v0 * (sOneXYZ | (splat<0>(s3) & sMaskW)); + p3v1 = p3v1 * (sOneXYZ | (splat<1>(s3) & sMaskW)); + p3v2 = p3v2 * (sOneXYZ | (splat<2>(s3) & sMaskW)); + } + + p0v0 = p0v0 + (splat<0>(rw0) * d0); + p0v1 = p0v1 + (splat<1>(rw0) * d0); + p0v2 = p0v2 + (splat<2>(rw0) * d0); + + p1v0 = p1v0 + (splat<0>(rw1) * d1); + p1v1 = p1v1 + (splat<1>(rw1) * d1); + p1v2 = p1v2 + (splat<2>(rw1) * d1); + + p2v0 = p2v0 + (splat<0>(rw2) * d2); + p2v1 = 
p2v1 + (splat<1>(rw2) * d2); + p2v2 = p2v2 + (splat<2>(rw2) * d2); + + p3v0 = p3v0 + (splat<0>(rw3) * d3); + p3v1 = p3v1 + (splat<1>(rw3) * d3); + p3v2 = p3v2 + (splat<2>(rw3) * d3); + + // write back particles + storeAligned(particles, vpIt[0] * sizeof(PxVec4), p0v0); + storeAligned(particles, vpIt[1] * sizeof(PxVec4), p0v1); + storeAligned(particles, vpIt[2] * sizeof(PxVec4), p0v2); + + storeAligned(particles, vpIt[4] * sizeof(PxVec4), p1v0); + storeAligned(particles, vpIt[5] * sizeof(PxVec4), p1v1); + storeAligned(particles, vpIt[6] * sizeof(PxVec4), p1v2); + + storeAligned(particles, vpIt[8] * sizeof(PxVec4), p2v0); + storeAligned(particles, vpIt[9] * sizeof(PxVec4), p2v1); + storeAligned(particles, vpIt[10] * sizeof(PxVec4), p2v2); + + storeAligned(particles, vpIt[12] * sizeof(PxVec4), p3v0); + storeAligned(particles, vpIt[13] * sizeof(PxVec4), p3v1); + storeAligned(particles, vpIt[14] * sizeof(PxVec4), p3v2); + +#if PX_PROFILE || PX_DEBUG + mNumCollisions += horizontalSum(accum.mNumCollisions); +#endif + } +} + +template <typename Simd4f> +void cloth::SwCollision<Simd4f>::collideContinuousParticles() +{ + Simd4f curPos[4]; + Simd4f prevPos[4]; + + const bool massScalingEnabled = mClothData.mCollisionMassScale > 0.0f; + const Simd4f massScale = simd4f(mClothData.mCollisionMassScale); + + const bool frictionEnabled = mClothData.mFrictionScale > 0.0f; + const Simd4f frictionScale = simd4f(mClothData.mFrictionScale); + + float* __restrict prevIt = mClothData.mPrevParticles; + float* __restrict curIt = mClothData.mCurParticles; + float* __restrict curEnd = curIt + mClothData.mNumParticles * 4; + + for(; curIt < curEnd; curIt += 16, prevIt += 16) + { + prevPos[0] = loadAligned(prevIt, 0); + prevPos[1] = loadAligned(prevIt, 16); + prevPos[2] = loadAligned(prevIt, 32); + prevPos[3] = loadAligned(prevIt, 48); + transpose(prevPos[0], prevPos[1], prevPos[2], prevPos[3]); + + curPos[0] = loadAligned(curIt, 0); + curPos[1] = loadAligned(curIt, 16); + curPos[2] = 
loadAligned(curIt, 32); + curPos[3] = loadAligned(curIt, 48); + transpose(curPos[0], curPos[1], curPos[2], curPos[3]); + + ImpulseAccumulator accum; + Simd4i sphereMask = collideCones(prevPos, curPos, accum); + collideSpheres(sphereMask, prevPos, curPos, accum); + + Simd4f mask; + if(!anyGreater(accum.mNumCollisions, sEpsilon, mask)) + continue; + + Simd4f invNumCollisions = recip(accum.mNumCollisions); + + if(frictionEnabled) + { + Simd4f frictionImpulse[3]; + calculateFrictionImpulse(accum.mDeltaX, accum.mDeltaY, accum.mDeltaZ, accum.mVelX, accum.mVelY, accum.mVelZ, + curPos, prevPos, invNumCollisions, frictionScale, mask, frictionImpulse); + + prevPos[0] = prevPos[0] - frictionImpulse[0]; + prevPos[1] = prevPos[1] - frictionImpulse[1]; + prevPos[2] = prevPos[2] - frictionImpulse[2]; + + transpose(prevPos[0], prevPos[1], prevPos[2], prevPos[3]); + storeAligned(prevIt, 0, prevPos[0]); + storeAligned(prevIt, 16, prevPos[1]); + storeAligned(prevIt, 32, prevPos[2]); + storeAligned(prevIt, 48, prevPos[3]); + } + + if(massScalingEnabled) + { + // calculate the inverse mass scale based on the collision impulse magnitude + Simd4f dSq = invNumCollisions * invNumCollisions * + (accum.mDeltaX * accum.mDeltaX + accum.mDeltaY * accum.mDeltaY + accum.mDeltaZ * accum.mDeltaZ); + + Simd4f weightScale = recip(sOne + massScale * dSq); + + // scale invmass + curPos[3] = select(mask, curPos[3] * weightScale, curPos[3]); + } + + curPos[0] = curPos[0] + accum.mDeltaX * invNumCollisions; + curPos[1] = curPos[1] + accum.mDeltaY * invNumCollisions; + curPos[2] = curPos[2] + accum.mDeltaZ * invNumCollisions; + + transpose(curPos[0], curPos[1], curPos[2], curPos[3]); + storeAligned(curIt, 0, curPos[0]); + storeAligned(curIt, 16, curPos[1]); + storeAligned(curIt, 32, curPos[2]); + storeAligned(curIt, 48, curPos[3]); + +#if PX_PROFILE || PX_DEBUG + mNumCollisions += horizontalSum(accum.mNumCollisions); +#endif + } +} + +template <typename Simd4f> +void 
cloth::SwCollision<Simd4f>::collideConvexes(const IterationState<Simd4f>& state) +{ + if(!mClothData.mNumConvexes) + return; + + // times 2 for plane equation result buffer + Simd4f* planes = static_cast<Simd4f*>(mAllocator.allocate(sizeof(Simd4f) * mClothData.mNumPlanes * 2)); + + const Simd4f* targetPlanes = reinterpret_cast<const Simd4f*>(mClothData.mTargetCollisionPlanes); + + // generate plane collision data + if(state.mRemainingIterations != 1) + { + // interpolate planes + LerpIterator<Simd4f, const Simd4f*> planeIter(reinterpret_cast<const Simd4f*>(mClothData.mStartCollisionPlanes), + targetPlanes, state.getCurrentAlpha()); + + // todo: normalize plane equations + generatePlanes(planes, planeIter, mClothData.mNumPlanes); + } + else + { + // otherwise use the target planes directly + generatePlanes(planes, targetPlanes, mClothData.mNumPlanes); + } + + Simd4f curPos[4], prevPos[4]; + + const bool frictionEnabled = mClothData.mFrictionScale > 0.0f; + const Simd4f frictionScale = simd4f(mClothData.mFrictionScale); + + float* __restrict curIt = mClothData.mCurParticles; + float* __restrict curEnd = curIt + mClothData.mNumParticles * 4; + float* __restrict prevIt = mClothData.mPrevParticles; + for(; curIt < curEnd; curIt += 16, prevIt += 16) + { + curPos[0] = loadAligned(curIt, 0); + curPos[1] = loadAligned(curIt, 16); + curPos[2] = loadAligned(curIt, 32); + curPos[3] = loadAligned(curIt, 48); + transpose(curPos[0], curPos[1], curPos[2], curPos[3]); + + ImpulseAccumulator accum; + collideConvexes(planes, curPos, accum); + + Simd4f mask; + if(!anyGreater(accum.mNumCollisions, sEpsilon, mask)) + continue; + + Simd4f invNumCollisions = recip(accum.mNumCollisions); + + if(frictionEnabled) + { + prevPos[0] = loadAligned(prevIt, 0); + prevPos[1] = loadAligned(prevIt, 16); + prevPos[2] = loadAligned(prevIt, 32); + prevPos[3] = loadAligned(prevIt, 48); + transpose(prevPos[0], prevPos[1], prevPos[2], prevPos[3]); + + Simd4f frictionImpulse[3]; + 
calculateFrictionImpulse(accum.mDeltaX, accum.mDeltaY, accum.mDeltaZ, accum.mVelX, accum.mVelY, accum.mVelZ, + curPos, prevPos, invNumCollisions, frictionScale, mask, frictionImpulse); + + prevPos[0] = prevPos[0] - frictionImpulse[0]; + prevPos[1] = prevPos[1] - frictionImpulse[1]; + prevPos[2] = prevPos[2] - frictionImpulse[2]; + + transpose(prevPos[0], prevPos[1], prevPos[2], prevPos[3]); + storeAligned(prevIt, 0, prevPos[0]); + storeAligned(prevIt, 16, prevPos[1]); + storeAligned(prevIt, 32, prevPos[2]); + storeAligned(prevIt, 48, prevPos[3]); + } + + curPos[0] = curPos[0] + accum.mDeltaX * invNumCollisions; + curPos[1] = curPos[1] + accum.mDeltaY * invNumCollisions; + curPos[2] = curPos[2] + accum.mDeltaZ * invNumCollisions; + + transpose(curPos[0], curPos[1], curPos[2], curPos[3]); + storeAligned(curIt, 0, curPos[0]); + storeAligned(curIt, 16, curPos[1]); + storeAligned(curIt, 32, curPos[2]); + storeAligned(curIt, 48, curPos[3]); + +#if PX_PROFILE || PX_DEBUG + mNumCollisions += horizontalSum(accum.mNumCollisions); +#endif + } + + mAllocator.deallocate(planes); +} + +template <typename Simd4f> +void cloth::SwCollision<Simd4f>::collideConvexes(const Simd4f* __restrict planes, Simd4f* __restrict curPos, + ImpulseAccumulator& accum) +{ + Simd4i result = simd4i(_0); + Simd4i mask4 = simd4i(_1); + + const Simd4f* __restrict pIt, *pEnd = planes + mClothData.mNumPlanes; + Simd4f* __restrict dIt = const_cast<Simd4f*>(pEnd); + for(pIt = planes; pIt != pEnd; ++pIt, ++dIt) + { + *dIt = splat<3>(*pIt) + curPos[2] * splat<2>(*pIt) + curPos[1] * splat<1>(*pIt) + curPos[0] * splat<0>(*pIt); + result = result | (mask4 & simd4i(*dIt < simd4f(_0))); + mask4 = mask4 << 1; // todo: shift by Simd4i on consoles + } + + if(simdi::allEqual(result, simd4i(_0))) + return; + + const uint32_t* __restrict cIt = mClothData.mConvexMasks; + const uint32_t* __restrict cEnd = cIt + mClothData.mNumConvexes; + for(; cIt != cEnd; ++cIt) + { + uint32_t mask = *cIt; + mask4 = simd4i(int(mask)); + 
if(!simdi::anyEqual(mask4 & result, mask4, mask4)) + continue; + + uint32_t test = mask - 1; + uint32_t planeIndex = findBitSet(mask & ~test); + Simd4f plane = planes[planeIndex]; + Simd4f planeX = splat<0>(plane); + Simd4f planeY = splat<1>(plane); + Simd4f planeZ = splat<2>(plane); + Simd4f planeD = pEnd[planeIndex]; + while(mask &= test) + { + test = mask - 1; + planeIndex = findBitSet(mask & ~test); + plane = planes[planeIndex]; + Simd4f dist = pEnd[planeIndex]; + Simd4f closer = dist > planeD; + planeX = select(closer, splat<0>(plane), planeX); + planeY = select(closer, splat<1>(plane), planeY); + planeZ = select(closer, splat<2>(plane), planeZ); + planeD = max(dist, planeD); + } + + accum.subtract(planeX, planeY, planeZ, planeD, simd4f(mask4)); + } +} + +template <typename Simd4f> +void cloth::SwCollision<Simd4f>::collideTriangles(const IterationState<Simd4f>& state) +{ + if(!mClothData.mNumTriangles) + return; + + TriangleData* triangles = + static_cast<TriangleData*>(mAllocator.allocate(sizeof(TriangleData) * mClothData.mNumTriangles)); + + UnalignedIterator<Simd4f, 3> targetTriangles(mClothData.mTargetCollisionTriangles); + + // generate triangle collision data + if(state.mRemainingIterations != 1) + { + // interpolate triangles + LerpIterator<Simd4f, UnalignedIterator<Simd4f, 3> > triangleIter(mClothData.mStartCollisionTriangles, + targetTriangles, state.getCurrentAlpha()); + + generateTriangles<Simd4f>(triangles, triangleIter, mClothData.mNumTriangles); + } + else + { + // otherwise use the target triangles directly + generateTriangles<Simd4f>(triangles, targetTriangles, mClothData.mNumTriangles); + } + + Simd4f positions[4]; + + float* __restrict pIt = mClothData.mCurParticles; + float* __restrict pEnd = pIt + mClothData.mNumParticles * 4; + for(; pIt < pEnd; pIt += 16) + { + positions[0] = loadAligned(pIt, 0); + positions[1] = loadAligned(pIt, 16); + positions[2] = loadAligned(pIt, 32); + positions[3] = loadAligned(pIt, 48); + transpose(positions[0], 
positions[1], positions[2], positions[3]); + + ImpulseAccumulator accum; + collideTriangles(triangles, positions, accum); + + Simd4f mask; + if(!anyGreater(accum.mNumCollisions, sEpsilon, mask)) + continue; + + Simd4f invNumCollisions = recip(accum.mNumCollisions); + + positions[0] = positions[0] + accum.mDeltaX * invNumCollisions; + positions[1] = positions[1] + accum.mDeltaY * invNumCollisions; + positions[2] = positions[2] + accum.mDeltaZ * invNumCollisions; + + transpose(positions[0], positions[1], positions[2], positions[3]); + storeAligned(pIt, 0, positions[0]); + storeAligned(pIt, 16, positions[1]); + storeAligned(pIt, 32, positions[2]); + storeAligned(pIt, 48, positions[3]); + +#if PX_PROFILE || PX_DEBUG + mNumCollisions += horizontalSum(accum.mNumCollisions); +#endif + } + + mAllocator.deallocate(triangles); +} + +template <typename Simd4f> +void cloth::SwCollision<Simd4f>::collideTriangles(const TriangleData* __restrict triangles, Simd4f* __restrict curPos, + ImpulseAccumulator& accum) +{ + Simd4f normalX, normalY, normalZ, normalD; + normalX = normalY = normalZ = normalD = simd4f(_0); + Simd4f minSqrLength = sMax; + + const TriangleData* __restrict tIt, *tEnd = triangles + mClothData.mNumTriangles; + for(tIt = triangles; tIt != tEnd; ++tIt) + { + Simd4f base = loadAligned(&tIt->base.x); + Simd4f edge0 = loadAligned(&tIt->edge0.x); + Simd4f edge1 = loadAligned(&tIt->edge1.x); + Simd4f normal = loadAligned(&tIt->normal.x); + Simd4f aux = loadAligned(&tIt->det); + + Simd4f dx = curPos[0] - splat<0>(base); + Simd4f dy = curPos[1] - splat<1>(base); + Simd4f dz = curPos[2] - splat<2>(base); + + Simd4f e0x = splat<0>(edge0); + Simd4f e0y = splat<1>(edge0); + Simd4f e0z = splat<2>(edge0); + + Simd4f e1x = splat<0>(edge1); + Simd4f e1y = splat<1>(edge1); + Simd4f e1z = splat<2>(edge1); + + Simd4f nx = splat<0>(normal); + Simd4f ny = splat<1>(normal); + Simd4f nz = splat<2>(normal); + + Simd4f deltaDotEdge0 = dx * e0x + dy * e0y + dz * e0z; + Simd4f deltaDotEdge1 
= dx * e1x + dy * e1y + dz * e1z; + Simd4f deltaDotNormal = dx * nx + dy * ny + dz * nz; + + Simd4f edge0DotEdge1 = splat<3>(base); + Simd4f edge0SqrLength = splat<3>(edge0); + Simd4f edge1SqrLength = splat<3>(edge1); + + Simd4f s = edge1SqrLength * deltaDotEdge0 - edge0DotEdge1 * deltaDotEdge1; + Simd4f t = edge0SqrLength * deltaDotEdge1 - edge0DotEdge1 * deltaDotEdge0; + + Simd4f sPositive = s > simd4f(_0); + Simd4f tPositive = t > simd4f(_0); + + Simd4f det = splat<0>(aux); + + s = select(tPositive, s * det, deltaDotEdge0 * splat<2>(aux)); + t = select(sPositive, t * det, deltaDotEdge1 * splat<3>(aux)); + + Simd4f clamp = simd4f(_1) < s + t; + Simd4f numerator = edge1SqrLength - edge0DotEdge1 + deltaDotEdge0 - deltaDotEdge1; + + s = select(clamp, numerator * splat<1>(aux), s); + + s = max(simd4f(_0), min(simd4f(_1), s)); + t = max(simd4f(_0), min(simd4f(_1) - s, t)); + + dx = dx - e0x * s - e1x * t; + dy = dy - e0y * s - e1y * t; + dz = dz - e0z * s - e1z * t; + + Simd4f sqrLength = dx * dx + dy * dy + dz * dz; + + // slightly increase distance for colliding triangles + Simd4f slack = (simd4f(_0) > deltaDotNormal) & simd4f(1e-4f); + sqrLength = sqrLength + sqrLength * slack; + + Simd4f mask = sqrLength < minSqrLength; + + normalX = select(mask, nx, normalX); + normalY = select(mask, ny, normalY); + normalZ = select(mask, nz, normalZ); + normalD = select(mask, deltaDotNormal, normalD); + + minSqrLength = min(sqrLength, minSqrLength); + } + + Simd4f mask; + if(!anyGreater(simd4f(_0), normalD, mask)) + return; + + accum.subtract(normalX, normalY, normalZ, normalD, mask); +} + +// explicit template instantiation +#if NVMATH_SIMD +template class cloth::SwCollision<Simd4f>; +#endif +#if NVMATH_SCALAR +template class cloth::SwCollision<Scalar4f>; +#endif +/* +namespace +{ + using namespace cloth; + + int test() + { + Simd4f vertices[] = { + simd4f(0.0f, 0.0f, 0.0f, 0.0f), + simd4f(0.1f, 0.0f, 0.0f, 0.0f), + simd4f(0.0f, 0.1f, 0.0f, 0.0f) + }; + TriangleData triangle; + 
generateTriangles<Simd4f>(&triangle, &*vertices, 1); + + char buffer[1000]; + SwKernelAllocator alloc(buffer, 1000); + + SwClothData* cloth = static_cast<SwClothData*>(malloc(sizeof(SwClothData))); + memset(cloth, 0, sizeof(SwClothData)); + cloth->mNumTriangles = 1; + + SwCollision<Simd4f> collision(*cloth, alloc); + SwCollision<Simd4f>::ImpulseAccumulator accum; + + Simd4f particles[4] = {}; + for(float y=-0.1f; y < 0.0f; y += 0.2f) + { + for(float x=-0.1f; x < 0.0f; x += 0.2f) + { + particles[0] = simd4f(x); + particles[1] = simd4f(y); + particles[2] = simd4f(-1.0f); + + collision.collideTriangles(&triangle, particles, accum); + } + } + + return 0; + } + + static int blah = test(); +} +*/ diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwCollision.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwCollision.h new file mode 100644 index 00000000..bf5f3177 --- /dev/null +++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwCollision.h @@ -0,0 +1,178 @@ +/* + * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. + */ + +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 
+
+#pragma once
+
+#include "Types.h"
+#include "StackAllocator.h"
+#include "Simd4i.h"
+
+#if PX_PROFILE
+#include "PxProfileEventSender.h"
+#include "PxProfileZone.h"
+#else
+namespace physx
+{
+namespace profile
+{
+	class PxProfileZone;
+}
+}
+#endif
+
+namespace nvidia
+{
+namespace cloth
+{
+
+#if PX_PROFILE
+
+// RAII profiling scope: sends a start event on construction and the matching
+// stop event on destruction through the PxProfileZone event sender. The
+// object's own address is used as the event context id.
+struct ProfileZone
+{
+	// name: event name, resolved to an event id via the profiler.
+	// profiler: may be NULL, in which case every call is a no-op.
+	ProfileZone(const char* name, profile::PxProfileZone* profiler)
+	: mSender(profiler), mEventId(profiler ? profiler->getEventIdForName(name) : uint16_t(-1))
+	{
+		if(mSender)
+			mSender->startEvent(mEventId, (uint64_t)intptr_t(this));
+	}
+
+	~ProfileZone()
+	{
+		if(mSender)
+			mSender->stopEvent(mEventId, (uint64_t)intptr_t(this));
+	}
+
+	// attach an integer payload to this event (e.g. a collision count)
+	void setValue(int64_t value) const
+	{
+		if(mSender)
+			mSender->eventValue(mEventId, (uint64_t)intptr_t(this), value);
+	}
+
+	profile::PxProfileEventSender* mSender;
+	uint16_t mEventId;
+};
+
+#else // PX_PROFILE
+
+// no-op stand-in used when profiling support is compiled out
+struct ProfileZone
+{
+	ProfileZone(const char*, profile::PxProfileZone*)
+	{
+	}
+	void setValue(int64_t) const
+	{
+	}
+};
+
+#endif // PX_PROFILE
+
+class SwCloth;
+struct SwClothData;
+template <typename>
+struct IterationState;
+struct IndexPair;
+struct SphereData;
+struct ConeData;
+struct TriangleData;
+
+typedef StackAllocator<16> SwKernelAllocator;
+
+/**
+   Collision handler for SwSolver.
 */
+template <typename Simd4f>
+class SwCollision
+{
+	typedef typename Simd4fToSimd4i<Simd4f>::Type Simd4i;
+
+  public:
+	// per-lane bitmasks selecting which cone/sphere shapes a particle
+	// still needs to be tested against
+	struct ShapeMask
+	{
+		Simd4i mCones;
+		Simd4i mSpheres;
+
+		ShapeMask& operator=(const ShapeMask&);
+		ShapeMask& operator&=(const ShapeMask&);
+	};
+
+	// sphere/cone arrays for one collision snapshot (used as a previous
+	// and a current copy, see mPrevData/mCurData below)
+	struct CollisionData
+	{
+		CollisionData();
+		SphereData* mSpheres;
+		ConeData* mCones;
+	};
+
+	struct ImpulseAccumulator;
+
+  public:
+	SwCollision(SwClothData& clothData, SwKernelAllocator& alloc, profile::PxProfileZone* profiler);
+	~SwCollision();
+
+	// run collision handling for one solver iteration
+	void operator()(const IterationState<Simd4f>& state);
+
+	static size_t estimateTemporaryMemory(const SwCloth& cloth);
+	static size_t estimatePersistentMemory(const SwCloth& cloth);
+
+  private:
+	SwCollision& operator=(const SwCollision&); // not implemented
+	void allocate(CollisionData&);
+	void deallocate(const CollisionData&);
+
+	void computeBounds();
+
+	void buildSphereAcceleration(const SphereData*);
+	void buildConeAcceleration();
+	static void mergeAcceleration(uint32_t*);
+	bool buildAcceleration();
+
+	static ShapeMask getShapeMask(const Simd4f&, const Simd4i*, const Simd4i*);
+	ShapeMask getShapeMask(const Simd4f*) const;
+	ShapeMask getShapeMask(const Simd4f*, const Simd4f*) const;
+
+	// discrete variants: take the current positions only
+	void collideSpheres(const Simd4i&, const Simd4f*, ImpulseAccumulator&) const;
+	Simd4i collideCones(const Simd4f*, ImpulseAccumulator&) const;
+
+	// continuous variants: take previous and current positions
+	// (called from collideContinuousParticles)
+	void collideSpheres(const Simd4i&, const Simd4f*, Simd4f*, ImpulseAccumulator&) const;
+	Simd4i collideCones(const Simd4f*, Simd4f*, ImpulseAccumulator&) const;
+
+	void collideParticles();
+	void collideVirtualParticles();
+	void collideContinuousParticles();
+
+	void collideConvexes(const IterationState<Simd4f>&);
+	void collideConvexes(const Simd4f*, Simd4f*, ImpulseAccumulator&);
+
+	void collideTriangles(const IterationState<Simd4f>&);
+	void collideTriangles(const TriangleData*, Simd4f*, ImpulseAccumulator&);
+
+  public:
+	// acceleration structure
+	static const uint32_t sGridSize = 8;
 	Simd4i mSphereGrid[6 * sGridSize / 4];
+	Simd4i mConeGrid[6 * sGridSize / 4];
+	Simd4f mGridScale, mGridBias;
+
+	CollisionData mPrevData;
+	CollisionData mCurData;
+
+	SwClothData& mClothData;
+	SwKernelAllocator& mAllocator;
+
+	// accumulated only in PX_PROFILE/PX_DEBUG builds (see SwCollision.cpp)
+	uint32_t mNumCollisions;
+
+	profile::PxProfileZone* mProfiler;
+
+	static const Simd4f sSkeletonWidth;
+};
+}
+}
diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwCollisionHelpers.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwCollisionHelpers.h
new file mode 100644
index 00000000..5e098922
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwCollisionHelpers.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#include "Simd4i.h"
+
+// platform specific helpers
+
+namespace nvidia
+{
+namespace cloth
+{
+
+// index of the (lowest) set bit in mask
+inline uint32_t findBitSet(uint32_t mask);
+
+// intFloor(-1.0f) returns -2 on SSE and NEON!
+inline Simd4i intFloor(const Simd4f& v); + +inline Simd4i horizontalOr(Simd4i mask); + +template <typename> +struct Gather; + +#if NVMATH_SIMD +template <> +struct Gather<Simd4i> +{ + inline Gather(const Simd4i& index); + inline Simd4i operator()(const Simd4i*) const; + +#if NVMATH_SSE2 + Simd4i mSelectQ, mSelectD, mSelectW; + static const Simd4i sIntSignBit; + static const Simd4i sSignedMask; +#elif NVMATH_NEON + Simd4i mPermute; + static const Simd4i sPack; + static const Simd4i sOffset; + static const Simd4i sShift; + static const Simd4i sMask; +#endif + Simd4i mOutOfRange; +}; +#endif + +} // namespace cloth +} // namespace nvidia + +#if NVMATH_SSE2 +#include "sse2/SwCollisionHelpers.h" +#elif NVMATH_NEON +#include "neon/SwCollisionHelpers.h" +#endif + +#if NVMATH_SCALAR +#include "scalar/SwCollisionHelpers.h" +#endif diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwFabric.cpp b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwFabric.cpp new file mode 100644 index 00000000..0d527dbf --- /dev/null +++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwFabric.cpp @@ -0,0 +1,150 @@ +/* + * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. + */ + +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 

#include "PxAssert.h"
#include "SwFabric.h"
#include "SwFactory.h"
#include "PsSort.h"
#include "limits.h" // for USHRT_MAX

#include "PsUtilities.h"

using namespace nvidia;
using namespace physx::shdfnd;

cloth::SwTether::SwTether(uint16_t anchor, float length) : mAnchor(anchor), mLength(length)
{
}

// Builds the shared constraint topology for a fabric: copies phases/sets,
// pads the constraint arrays per set to a multiple of the SIMD width, and
// registers itself with the owning factory.
cloth::SwFabric::SwFabric(SwFactory& factory, uint32_t numParticles, Range<const uint32_t> phases,
                          Range<const uint32_t> sets, Range<const float> restvalues, Range<const uint32_t> indices,
                          Range<const uint32_t> anchors, Range<const float> tetherLengths, uint32_t id)
: mFactory(factory), mNumParticles(numParticles), mTetherLengthScale(1.0f), mId(id)
{
    // should no longer be prefixed with 0
    PX_ASSERT(sets.front() != 0);

#if PX_WINDOWS_FAMILY
    const uint32_t kSimdWidth = 8; // avx
#else
    const uint32_t kSimdWidth = 4;
#endif

    // consistency check: sets are offsets into restvalues, two indices per constraint
    PX_ASSERT(sets.back() == restvalues.size());
    PX_ASSERT(restvalues.size() * 2 == indices.size());
    PX_ASSERT(mNumParticles > *maxElement(indices.begin(), indices.end()));
    // padding indices (mNumParticles + k) must still fit in uint16_t
    PX_ASSERT(mNumParticles + kSimdWidth - 1 <= USHRT_MAX);

    mPhases.assign(phases.begin(), phases.end());
    mSets.reserve(sets.size() + 1);
    mSets.pushBack(0); // prefix with 0

    mOriginalNumRestvalues = uint32_t(restvalues.size());

    // pad indices for SIMD
    const uint32_t* iBegin = indices.begin(), *iIt = iBegin;
    const float* rBegin = restvalues.begin(), *rIt = rBegin;
    const uint32_t* sIt, *sEnd = sets.end();
    for(sIt = sets.begin(); sIt != sEnd; ++sIt)
    {
        const float* rEnd = rBegin + *sIt;
        const uint32_t* iEnd = iBegin + *sIt * 2;
        uint32_t numConstraints = uint32_t(rEnd - rIt);

        for(; rIt != rEnd; ++rIt)
            mRestvalues.pushBack(*rIt);

        for(; iIt != iEnd; ++iIt)
            mIndices.pushBack(uint16_t(*iIt));

        // add dummy constraints until the set size is a multiple of kSimdWidth;
        // the loop condition masks with (kSimdWidth - 1), i.e. runs while not aligned.
        // Dummy rest value is -FLT_MAX and both indices point past the real particles.
        for(; numConstraints &= kSimdWidth - 1; ++numConstraints)
        {
            mRestvalues.pushBack(-FLT_MAX);
            uint32_t index = mNumParticles + numConstraints - 1;
            mIndices.pushBack(uint16_t(index));
            mIndices.pushBack(uint16_t(index));
        }

        mSets.pushBack(uint32_t(mRestvalues.size()));
    }

    // trim overallocations (swap with exactly-sized copies)
    RestvalueContainer(mRestvalues.begin(), mRestvalues.end()).swap(mRestvalues);
    Vector<uint16_t>::Type(mIndices.begin(), mIndices.end()).swap(mIndices);

    // tethers
    PX_ASSERT(anchors.size() == tetherLengths.size());

    // pad to allow for direct 16 byte (unaligned) loads
    mTethers.reserve(anchors.size() + 2);
    for(; !anchors.empty(); anchors.popFront(), tetherLengths.popFront())
        mTethers.pushBack(SwTether(uint16_t(anchors.front()), tetherLengths.front()));

    mFactory.mFabrics.pushBack(this);
}

// unregister from the owning factory
cloth::SwFabric::~SwFabric()
{
    Vector<SwFabric*>::Type::Iterator fIt = mFactory.mFabrics.find(this);
    PX_ASSERT(fIt != mFactory.mFabrics.end());
    mFactory.mFabrics.replaceWithLast(fIt);
}

cloth::Factory& cloth::SwFabric::getFactory() const
{
    return mFactory;
}

uint32_t cloth::SwFabric::getNumPhases() const
{
    return uint32_t(mPhases.size());
}

// counts exclude the SIMD padding added by the constructor
uint32_t cloth::SwFabric::getNumRestvalues() const
{
    return mOriginalNumRestvalues;
}

uint32_t cloth::SwFabric::getNumSets() const
{
    return uint32_t(mSets.size() - 1); // minus the 0 prefix
}

uint32_t cloth::SwFabric::getNumIndices() const
{
    return 2 * mOriginalNumRestvalues;
}

uint32_t cloth::SwFabric::getNumParticles() const
{
    return mNumParticles;
}

uint32_t cloth::SwFabric::getNumTethers() const
{
    return uint32_t(mTethers.size());
}

// scales rest lengths in place (including padding entries, which stay -inf-like)
void cloth::SwFabric::scaleRestvalues(float scale)
{
    RestvalueContainer::Iterator rIt, rEnd = mRestvalues.end();
    for(rIt = mRestvalues.begin(); rIt != rEnd; ++rIt)
        *rIt *= scale;
}

// tether lengths are scaled lazily via a single factor, not per element
void cloth::SwFabric::scaleTetherLengths(float scale)
{
    mTetherLengthScale *= scale;
}
// ======================================================================
// File: src/SwFabric.h
// ======================================================================
/*
 * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
 *
 * NVIDIA CORPORATION and its licensors retain all intellectual property
 * and proprietary rights in and to this software, related documentation
 * and any modifications thereto. Any use, reproduction, disclosure or
 * distribution of this software and related documentation without an express
 * license agreement from NVIDIA CORPORATION is strictly prohibited.
 */

// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.

#pragma once

#include "Allocator.h"
#include "Fabric.h"
#include "Types.h"
#include "Range.h"
#include "PxVec4.h"

namespace nvidia
{

namespace cloth
{

class SwFactory;

// one tether constraint: anchor particle index and rest length
struct SwTether
{
    SwTether(uint16_t, float);
    uint16_t mAnchor;
    float mLength;
};

// CPU fabric: constraint topology shared between cloth instances
class SwFabric : public UserAllocated, public Fabric
{
  public:
#if PX_WINDOWS_FAMILY
    typedef AlignedVector<float, 32>::Type RestvalueContainer; // avx
#else
    typedef AlignedVector<float, 16>::Type RestvalueContainer;
#endif

    SwFabric(SwFactory& factory, uint32_t numParticles, Range<const uint32_t> phases, Range<const uint32_t> sets,
             Range<const float> restvalues, Range<const uint32_t> indices, Range<const uint32_t> anchors,
             Range<const float> tetherLengths, uint32_t id);

    // NOTE(review): declared but presumably not defined (class holds a reference
    // member, so assignment cannot be implemented) — declaration suppresses the
    // implicit operator; confirm no definition exists elsewhere.
    SwFabric& operator=(const SwFabric&);

    virtual ~SwFabric();

    virtual Factory& getFactory() const;

    virtual uint32_t getNumPhases() const;
    virtual uint32_t getNumRestvalues() const;

    virtual uint32_t getNumSets() const;
    virtual uint32_t getNumIndices() const;

    virtual uint32_t getNumParticles() const;

    virtual uint32_t getNumTethers() const;

    virtual void scaleRestvalues(float);
    virtual void scaleTetherLengths(float);

  public:
    SwFactory& mFactory;

    uint32_t mNumParticles;

    Vector<uint32_t>::Type mPhases; // index of set to use
    Vector<uint32_t>::Type mSets;   // offset of first restvalue, with 0 prefix

    RestvalueContainer mRestvalues;  // rest values (edge length), SIMD-padded
    Vector<uint16_t>::Type mIndices; // particle index pairs

    Vector<SwTether>::Type mTethers;
    float mTetherLengthScale; // lazy scale factor applied on extraction

    uint32_t mId;

    uint32_t mOriginalNumRestvalues; // count before SIMD padding

} PX_ALIGN_SUFFIX(16);
}
}

// ======================================================================
// File: src/SwFactory.cpp
// ======================================================================
/*
 * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
 *
 * NVIDIA CORPORATION and its licensors retain all intellectual property
 * and proprietary rights in and to this software, related documentation
 * and any modifications thereto. Any use, reproduction, disclosure or
 * distribution of this software and related documentation without an express
 * license agreement from NVIDIA CORPORATION is strictly prohibited.
 */

// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+ +#include "SwFactory.h" +#include "SwFabric.h" +#include "SwCloth.h" +#include "SwSolver.h" +#include "ClothImpl.h" +#include <string.h> // for memcpy +#include "PsIntrinsics.h" + +using namespace nvidia; +using namespace nvidia; + +namespace nvidia +{ +namespace cloth +{ +// defined in Factory.cpp +uint32_t getNextFabricId(); +} +} + +cloth::SwFactory::SwFactory() : Factory(CPU) +{ +} + +cloth::SwFactory::~SwFactory() +{ +} + +cloth::Fabric* cloth::SwFactory::createFabric(uint32_t numParticles, Range<const uint32_t> phases, + Range<const uint32_t> sets, Range<const float> restvalues, + Range<const uint32_t> indices, Range<const uint32_t> anchors, + Range<const float> tetherLengths) +{ + return new SwFabric(*this, numParticles, phases, sets, restvalues, indices, anchors, tetherLengths, + getNextFabricId()); +} + +cloth::Cloth* cloth::SwFactory::createCloth(Range<const PxVec4> particles, Fabric& fabric) +{ + return new SwClothImpl(*this, fabric, particles); +} + +cloth::Solver* cloth::SwFactory::createSolver(profile::PxProfileZone* profiler, PxTaskManager* taskMgr) +{ +#ifdef PX_PHYSX_GPU_EXPORTS + // SwSolver not defined in PhysXGpu project + PX_UNUSED(profiler); + PX_UNUSED(taskMgr); + return 0; +#else + return new SwSolver(profiler, taskMgr); +#endif +} + +cloth::Cloth* cloth::SwFactory::clone(const Cloth& cloth) +{ + if(cloth.getFactory().getPlatform() != Factory::CPU) + return cloth.clone(*this); // forward to CuCloth + + // copy construct + return new SwClothImpl(*this, static_cast<const SwClothImpl&>(cloth)); +} + +void cloth::SwFactory::extractFabricData(const Fabric& fabric, Range<uint32_t> phases, Range<uint32_t> sets, + Range<float> restvalues, Range<uint32_t> indices, Range<uint32_t> anchors, + Range<float> tetherLengths) const +{ + const SwFabric& swFabric = static_cast<const SwFabric&>(fabric); + + PX_ASSERT(phases.empty() || phases.size() == swFabric.getNumPhases()); + PX_ASSERT(restvalues.empty() || restvalues.size() == 
swFabric.getNumRestvalues()); + PX_ASSERT(sets.empty() || sets.size() == swFabric.getNumSets()); + PX_ASSERT(indices.empty() || indices.size() == swFabric.getNumIndices()); + PX_ASSERT(anchors.empty() || anchors.size() == swFabric.getNumTethers()); + PX_ASSERT(tetherLengths.empty() || tetherLengths.size() == swFabric.getNumTethers()); + + for(uint32_t i = 0; !phases.empty(); ++i, phases.popFront()) + phases.front() = swFabric.mPhases[i]; + + const uint32_t* sEnd = swFabric.mSets.end(), *sIt; + const float* rBegin = swFabric.mRestvalues.begin(), *rIt = rBegin; + const uint16_t* iIt = swFabric.mIndices.begin(); + + uint32_t* sDst = sets.begin(); + float* rDst = restvalues.begin(); + uint32_t* iDst = indices.begin(); + + uint32_t numConstraints = 0; + for(sIt = swFabric.mSets.begin(); ++sIt != sEnd;) + { + const float* rEnd = rBegin + *sIt; + for(; rIt != rEnd; ++rIt) + { + uint16_t i0 = *iIt++; + uint16_t i1 = *iIt++; + + if(PxMax(i0, i1) >= swFabric.mNumParticles) + continue; + + if(!restvalues.empty()) + *rDst++ = *rIt; + + if(!indices.empty()) + { + *iDst++ = i0; + *iDst++ = i1; + } + + ++numConstraints; + } + + if(!sets.empty()) + *sDst++ = numConstraints; + } + + for(uint32_t i = 0; !anchors.empty(); ++i, anchors.popFront()) + anchors.front() = swFabric.mTethers[i].mAnchor; + + for(uint32_t i = 0; !tetherLengths.empty(); ++i, tetherLengths.popFront()) + tetherLengths.front() = swFabric.mTethers[i].mLength * swFabric.mTetherLengthScale; +} + +void cloth::SwFactory::extractCollisionData(const Cloth& cloth, Range<PxVec4> spheres, Range<uint32_t> capsules, + Range<PxVec4> planes, Range<uint32_t> convexes, Range<PxVec3> triangles) const +{ + PX_ASSERT(&cloth.getFactory() == this); + + const SwCloth& swCloth = static_cast<const SwClothImpl&>(cloth).mCloth; + + PX_ASSERT(spheres.empty() || spheres.size() == swCloth.mStartCollisionSpheres.size()); + PX_ASSERT(capsules.empty() || capsules.size() == swCloth.mCapsuleIndices.size() * 2); + PX_ASSERT(planes.empty() || 
planes.size() == swCloth.mStartCollisionPlanes.size()); + PX_ASSERT(convexes.empty() || convexes.size() == swCloth.mConvexMasks.size()); + PX_ASSERT(triangles.empty() || triangles.size() == swCloth.mStartCollisionTriangles.size()); + + if(!swCloth.mStartCollisionSpheres.empty() && !spheres.empty()) + memcpy(spheres.begin(), &swCloth.mStartCollisionSpheres.front(), + swCloth.mStartCollisionSpheres.size() * sizeof(PxVec4)); + + if(!swCloth.mCapsuleIndices.empty() && !capsules.empty()) + memcpy(capsules.begin(), &swCloth.mCapsuleIndices.front(), swCloth.mCapsuleIndices.size() * sizeof(IndexPair)); + + if(!swCloth.mStartCollisionPlanes.empty() && !planes.empty()) + memcpy(planes.begin(), &swCloth.mStartCollisionPlanes.front(), + swCloth.mStartCollisionPlanes.size() * sizeof(PxVec4)); + + if(!swCloth.mConvexMasks.empty() && !convexes.empty()) + memcpy(convexes.begin(), &swCloth.mConvexMasks.front(), swCloth.mConvexMasks.size() * sizeof(uint32_t)); + + if(!swCloth.mStartCollisionTriangles.empty() && !triangles.empty()) + memcpy(triangles.begin(), &swCloth.mStartCollisionTriangles.front(), + swCloth.mStartCollisionTriangles.size() * sizeof(PxVec3)); +} + +void cloth::SwFactory::extractMotionConstraints(const Cloth& cloth, Range<PxVec4> destConstraints) const +{ + PX_ASSERT(&cloth.getFactory() == this); + + const SwCloth& swCloth = static_cast<const SwClothImpl&>(cloth).mCloth; + + Vec4fAlignedVector const& srcConstraints = !swCloth.mMotionConstraints.mTarget.empty() + ? 
swCloth.mMotionConstraints.mTarget + : swCloth.mMotionConstraints.mStart; + + if(!srcConstraints.empty()) + { + // make sure dest array is big enough + PX_ASSERT(destConstraints.size() == srcConstraints.size()); + + memcpy(destConstraints.begin(), &srcConstraints.front(), srcConstraints.size() * sizeof(PxVec4)); + } +} + +void cloth::SwFactory::extractSeparationConstraints(const Cloth& cloth, Range<PxVec4> destConstraints) const +{ + PX_ASSERT(&cloth.getFactory() == this); + + const SwCloth& swCloth = static_cast<const SwClothImpl&>(cloth).mCloth; + + Vec4fAlignedVector const& srcConstraints = !swCloth.mSeparationConstraints.mTarget.empty() + ? swCloth.mSeparationConstraints.mTarget + : swCloth.mSeparationConstraints.mStart; + + if(!srcConstraints.empty()) + { + // make sure dest array is big enough + PX_ASSERT(destConstraints.size() == srcConstraints.size()); + + memcpy(destConstraints.begin(), &srcConstraints.front(), srcConstraints.size() * sizeof(PxVec4)); + } +} + +void cloth::SwFactory::extractParticleAccelerations(const Cloth& cloth, Range<PxVec4> destAccelerations) const +{ + PX_ASSERT(&cloth.getFactory() == this); + + const SwCloth& swCloth = static_cast<const SwClothImpl&>(cloth).mCloth; + + if(!swCloth.mParticleAccelerations.empty()) + { + // make sure dest array is big enough + PX_ASSERT(destAccelerations.size() == swCloth.mParticleAccelerations.size()); + + memcpy(destAccelerations.begin(), &swCloth.mParticleAccelerations.front(), + swCloth.mParticleAccelerations.size() * sizeof(PxVec4)); + } +} + +void cloth::SwFactory::extractVirtualParticles(const Cloth& cloth, Range<uint32_t[4]> indices, Range<PxVec3> weights) const +{ + PX_ASSERT(this == &cloth.getFactory()); + + const SwCloth& swCloth = static_cast<const SwClothImpl&>(cloth).mCloth; + + uint32_t numIndices = cloth.getNumVirtualParticles(); + uint32_t numWeights = cloth.getNumVirtualParticleWeights(); + + PX_ASSERT(indices.size() == numIndices || indices.empty()); + PX_ASSERT(weights.size() == 
numWeights || weights.empty()); + + if(weights.size() == numWeights) + { + PxVec3* wDestIt = reinterpret_cast<PxVec3*>(weights.begin()); + + // convert weights from vec4 to vec3 + cloth::Vec4fAlignedVector::ConstIterator wIt = swCloth.mVirtualParticleWeights.begin(); + cloth::Vec4fAlignedVector::ConstIterator wEnd = wIt + numWeights; + + for(; wIt != wEnd; ++wIt, ++wDestIt) + *wDestIt = PxVec3(wIt->x, wIt->y, wIt->z); + + PX_ASSERT(wDestIt == weights.end()); + } + if(indices.size() == numIndices) + { + // convert indices + Vec4u* iDestIt = reinterpret_cast<Vec4u*>(indices.begin()); + Vector<Vec4us>::Type::ConstIterator iIt = swCloth.mVirtualParticleIndices.begin(); + Vector<Vec4us>::Type::ConstIterator iEnd = swCloth.mVirtualParticleIndices.end(); + + uint32_t numParticles = uint32_t(swCloth.mCurParticles.size()); + + for(; iIt != iEnd; ++iIt) + { + // skip dummy indices + if(iIt->x < numParticles) + // byte offset to element index + *iDestIt++ = Vec4u(*iIt); + } + + PX_ASSERT(&array(*iDestIt) == indices.end()); + } +} + +void cloth::SwFactory::extractSelfCollisionIndices(const Cloth& cloth, Range<uint32_t> destIndices) const +{ + const SwCloth& swCloth = static_cast<const SwClothImpl&>(cloth).mCloth; + PX_ASSERT(destIndices.size() == swCloth.mSelfCollisionIndices.size()); + intrinsics::memCopy(destIndices.begin(), swCloth.mSelfCollisionIndices.begin(), destIndices.size() * sizeof(uint32_t)); +} + +void cloth::SwFactory::extractRestPositions(const Cloth& cloth, Range<PxVec4> destRestPositions) const +{ + const SwCloth& swCloth = static_cast<const SwClothImpl&>(cloth).mCloth; + PX_ASSERT(destRestPositions.size() == swCloth.mRestPositions.size()); + intrinsics::memCopy(destRestPositions.begin(), swCloth.mRestPositions.begin(), destRestPositions.size() * sizeof(PxVec4)); +} diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwFactory.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwFactory.h new file mode 100644 index 00000000..a078add0 --- 
// ======================================================================
// File: src/SwFactory.h
// ======================================================================
/*
 * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
 *
 * NVIDIA CORPORATION and its licensors retain all intellectual property
 * and proprietary rights in and to this software, related documentation
 * and any modifications thereto. Any use, reproduction, disclosure or
 * distribution of this software and related documentation without an express
 * license agreement from NVIDIA CORPORATION is strictly prohibited.
 */

// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.

#pragma once

#include "Factory.h"
#include "Allocator.h"

namespace nvidia
{

namespace cloth
{

class SwFabric;
class SwCloth;
template <typename>
class ClothImpl;

// CPU implementation of the abstract cloth Factory interface
class SwFactory : public UserAllocated, public Factory
{
  public:
    typedef SwFabric FabricType;
    typedef ClothImpl<SwCloth> ImplType;

    SwFactory();
    virtual ~SwFactory();

    virtual Fabric* createFabric(uint32_t numParticles, Range<const uint32_t> phases, Range<const uint32_t> sets,
                                 Range<const float> restvalues, Range<const uint32_t> indices,
                                 Range<const uint32_t> anchors, Range<const float> tetherLengths);

    virtual Cloth* createCloth(Range<const PxVec4> particles, Fabric& fabric);

    virtual Solver* createSolver(profile::PxProfileZone*, PxTaskManager*);

    virtual Cloth* clone(const Cloth& cloth);

    // extraction functions copy internal data back into caller-provided ranges;
    // see SwFactory.cpp for per-field semantics (empty range == skip field)
    virtual void extractFabricData(const Fabric& fabric, Range<uint32_t> phases, Range<uint32_t> sets,
                                   Range<float> restvalues, Range<uint32_t> indices, Range<uint32_t> anchors,
                                   Range<float> tetherLengths) const;

    virtual void extractCollisionData(const Cloth& cloth, Range<PxVec4> spheres, Range<uint32_t> capsules,
                                      Range<PxVec4> planes, Range<uint32_t> convexes, Range<PxVec3> triangles) const;

    virtual void extractMotionConstraints(const Cloth& cloth, Range<PxVec4> destConstraints) const;

    virtual void extractSeparationConstraints(const Cloth& cloth, Range<PxVec4> destConstraints) const;

    virtual void extractParticleAccelerations(const Cloth& cloth, Range<PxVec4> destAccelerations) const;

    virtual void extractVirtualParticles(const Cloth& cloth, Range<uint32_t[4]> destIndices,
                                         Range<PxVec3> destWeights) const;

    virtual void extractSelfCollisionIndices(const Cloth& cloth, Range<uint32_t> destIndices) const;

    virtual void extractRestPositions(const Cloth& cloth, Range<PxVec4> destRestPositions) const;

  public:
    // all live fabrics created by this factory (registered/unregistered by SwFabric)
    Vector<SwFabric*>::Type mFabrics;
};
}
}

// ======================================================================
// File: src/SwInterCollision.cpp
// ======================================================================
/*
 * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
 *
 * NVIDIA CORPORATION and its licensors retain all intellectual property
 * and proprietary rights in and to this software, related documentation
 * and any modifications thereto. Any use, reproduction, disclosure or
 * distribution of this software and related documentation without an express
 * license agreement from NVIDIA CORPORATION is strictly prohibited.
 */

// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+ +#include "SwInterCollision.h" +#include "PsIntrinsics.h" +#include "SwCollision.h" //temp fix, needed by SwCollisionHelper implementations +#include "Simd4f.h" +#include "SwCollisionHelpers.h" +#include "BoundingBox.h" +#include "PsSort.h" +#include "PsIntrinsics.h" + +#pragma warning(disable:4127) + +using namespace nvidia; + +namespace +{ +typedef Simd4fFactory<detail::FourTuple> Simd4fConstant; + +const Simd4fConstant sMaskXYZ = simd4f(simd4i(~0, ~0, ~0, 0)); +const Simd4fConstant sMaskW = simd4f(simd4i(0, 0, 0, ~0)); +const Simd4fConstant sEpsilon = simd4f(FLT_EPSILON); +const Simd4fConstant sZeroW = simd4f(-FLT_MAX, -FLT_MAX, -FLT_MAX, 0.0f); + +// returns sorted indices, output needs to be at least 2*(last-first)+1024 +void radixSort(const uint32_t* first, const uint32_t* last, uint32_t* out) +{ + uint32_t n = uint32_t(last - first); + + uint32_t* buffer = out + 2 * n; + uint32_t* __restrict histograms[] = { buffer, buffer + 256, buffer + 512, buffer + 768 }; + + intrinsics::memZero(buffer, 1024 * sizeof(uint32_t)); + + // build 3 histograms in one pass + for(const uint32_t* __restrict it = first; it != last; ++it) + { + uint32_t key = *it; + ++histograms[0][0xff & key]; + ++histograms[1][0xff & (key >> 8)]; + ++histograms[2][0xff & (key >> 16)]; + ++histograms[3][key >> 24]; + } + + // convert histograms to offset tables in-place + uint32_t sums[4] = {}; + for(uint32_t i = 0; i < 256; ++i) + { + uint32_t temp0 = histograms[0][i] + sums[0]; + histograms[0][i] = sums[0], sums[0] = temp0; + + uint32_t temp1 = histograms[1][i] + sums[1]; + histograms[1][i] = sums[1], sums[1] = temp1; + + uint32_t temp2 = histograms[2][i] + sums[2]; + histograms[2][i] = sums[2], sums[2] = temp2; + + uint32_t temp3 = histograms[3][i] + sums[3]; + histograms[3][i] = sums[3], sums[3] = temp3; + } + + PX_ASSERT(sums[0] == n && sums[1] == n && sums[2] == n && sums[3] == n); + +#if PX_DEBUG + memset(out, 0xff, 2 * n * sizeof(uint32_t)); +#endif + + // sort 8 bits per pass + + 
uint32_t* __restrict indices[] = { out, out + n }; + + for(uint32_t i = 0; i != n; ++i) + indices[1][histograms[0][0xff & first[i]]++] = i; + + for(uint32_t i = 0, index; index = indices[1][i], i != n; ++i) + indices[0][histograms[1][0xff & (first[index] >> 8)]++] = index; + + for(uint32_t i = 0, index; index = indices[0][i], i != n; ++i) + indices[1][histograms[2][0xff & (first[index] >> 16)]++] = index; + + for(uint32_t i = 0, index; index = indices[1][i], i != n; ++i) + indices[0][histograms[3][first[index] >> 24]++] = index; +} + +template <typename Simd4f> +uint32_t longestAxis(const Simd4f& edgeLength) +{ + const float* e = array(edgeLength); + + if(e[0] > e[1]) + return uint32_t(e[0] > e[2] ? 0 : 2); + else + return uint32_t(e[1] > e[2] ? 1 : 2); +} +} + +template <typename Simd4f> +cloth::SwInterCollision<Simd4f>::SwInterCollision(const cloth::SwInterCollisionData* instances, uint32_t n, float colDist, + float stiffness, uint32_t iterations, InterCollisionFilter filter, + cloth::SwKernelAllocator& alloc, profile::PxProfileZone* zone) +: mInstances(instances) +, mNumInstances(n) +, mClothIndices(NULL) +, mParticleIndices(NULL) +, mNumParticles(0) +, mTotalParticles(0) +, mFilter(filter) +, mAllocator(alloc) +, mProfiler(zone) +{ + PX_ASSERT(mFilter); + + mCollisionDistance = simd4f(colDist, colDist, colDist, 0.0f); + mCollisionSquareDistance = mCollisionDistance * mCollisionDistance; + mStiffness = simd4f(stiffness); + mNumIterations = iterations; + + // calculate particle size + for(uint32_t i = 0; i < n; ++i) + mTotalParticles += instances[i].mNumParticles; +} + +template <typename Simd4f> +cloth::SwInterCollision<Simd4f>::~SwInterCollision() +{ +} + +namespace +{ +// multiple x by m leaving w component of x intact +template <typename Simd4f> +PX_INLINE Simd4f transform(const Simd4f m[4], const Simd4f& x) +{ + const Simd4f a = m[3] + splat<0>(x) * m[0] + splat<1>(x) * m[1] + splat<2>(x) * m[2]; + return select(sMaskXYZ, a, x); +} + +// rotate x by m 
// (continued) rotate x by m, leaving the w component of x intact
template <typename Simd4f>
PX_INLINE Simd4f rotate(const Simd4f m[4], const Simd4f& x)
{
    const Simd4f a = splat<0>(x) * m[0] + splat<1>(x) * m[1] + splat<2>(x) * m[2];
    return select(sMaskXYZ, a, x);
}

// orders cloth indices by the lower bound of their bounding box along one axis
template <typename Simd4f>
struct ClothSorter
{
    typedef cloth::BoundingBox<Simd4f> BoundingBox;

    ClothSorter(BoundingBox* bounds, uint32_t n, uint32_t axis) : mBounds(bounds), mNumBounds(n), mAxis(axis)
    {
    }

    bool operator()(uint32_t i, uint32_t j) const
    {
        PX_ASSERT(i < mNumBounds);
        PX_ASSERT(j < mNumBounds);

        return array(mBounds[i].mLower)[mAxis] < array(mBounds[j].mLower)[mAxis];
    }

    BoundingBox* mBounds;
    uint32_t mNumBounds;
    uint32_t mAxis;
};

// for the given cloth array this function calculates the set of particles
// which potentially interact, the potential colliders are returned with their
// cloth index and particle index in clothIndices and particleIndices, the
// function returns the number of potential colliders.
// Side effects: candidate particles (and their previous positions) are
// transformed to world space in-place, and overlapMasks[clothIndex] is filled
// with a bit mask of overlapping cloths.
template <typename Simd4f>
uint32_t calculatePotentialColliders(const cloth::SwInterCollisionData* cBegin, const cloth::SwInterCollisionData* cEnd,
                                     Simd4f colDist, uint16_t* clothIndices, uint32_t* particleIndices,
                                     cloth::BoundingBox<Simd4f>& bounds, uint32_t* overlapMasks,
                                     cloth::InterCollisionFilter filter, cloth::SwKernelAllocator& allocator)
{
    using namespace cloth;

    typedef BoundingBox<Simd4f> BoundingBox;

    uint32_t numParticles = 0;
    const uint32_t numCloths = uint32_t(cEnd - cBegin);

    // bounds of each cloth object in world space
    BoundingBox* const clothBounds = (BoundingBox*)(allocator.allocate(numCloths * sizeof(BoundingBox)));
    BoundingBox* const overlapBounds = (BoundingBox*)(allocator.allocate(numCloths * sizeof(BoundingBox)));

    // union of all cloth world bounds
    BoundingBox totalClothBounds = emptyBounds<Simd4f>();

    uint32_t* sortedIndices = (uint32_t*)allocator.allocate(numCloths * sizeof(uint32_t));

    for(uint32_t i = 0; i < numCloths; ++i)
    {
        const SwInterCollisionData& c = cBegin[i];

        // inflate local bounds by the collision distance, then move to world space
        PxBounds3 lcBounds = PxBounds3::centerExtents(c.mBoundsCenter, c.mBoundsHalfExtent + PxVec3(array(colDist)[0]));
        PX_ASSERT(!lcBounds.isEmpty());
        PxBounds3 cWorld = PxBounds3::transformFast(c.mGlobalPose, lcBounds);

        BoundingBox cBounds = {(Simd4f)simd4f(cWorld.minimum.x, cWorld.minimum.y, cWorld.minimum.z, 0.0f),
                               (Simd4f)simd4f(cWorld.maximum.x, cWorld.maximum.y, cWorld.maximum.z, 0.0f) };

        sortedIndices[i] = i;
        clothBounds[i] = cBounds;

        totalClothBounds = expandBounds(totalClothBounds, cBounds);
    }

    // sort indices by their minimum extent on the longest axis (sweep and prune)
    const uint32_t sweepAxis = longestAxis(totalClothBounds.mUpper - totalClothBounds.mLower);

    ClothSorter<Simd4f> predicate(clothBounds, numCloths, sweepAxis);
    nvidia::sort(sortedIndices, numCloths, predicate);

    for(uint32_t i = 0; i < numCloths; ++i)
    {
        PX_ASSERT(sortedIndices[i] < numCloths);

        const SwInterCollisionData& a = cBegin[sortedIndices[i]];

        // local bounds (inflated by collision distance)
        const Simd4f aCenter = load(reinterpret_cast<const float*>(&a.mBoundsCenter));
        const Simd4f aHalfExtent = load(reinterpret_cast<const float*>(&a.mBoundsHalfExtent)) + colDist;
        const BoundingBox aBounds = { aCenter - aHalfExtent, aCenter + aHalfExtent };

        const PxMat44 aToWorld(a.mGlobalPose);
        const PxTransform aToLocal(a.mGlobalPose.getInverse());

        const float axisMin = array(clothBounds[sortedIndices[i]].mLower)[sweepAxis];
        const float axisMax = array(clothBounds[sortedIndices[i]].mUpper)[sweepAxis];

        uint32_t overlapMask = 0;
        uint32_t numOverlaps = 0;

        // scan back to find first intersecting bounding box
        uint32_t startIndex = i;
        while(startIndex > 0 && array(clothBounds[sortedIndices[startIndex]].mUpper)[sweepAxis] > axisMin)
            --startIndex;

        // compute all overlapping bounds
        for(uint32_t j = startIndex; j < numCloths; ++j)
        {
            // ignore self-collision
            if(i == j)
                continue;

            // early out if no more cloths along axis intersect us
            if(array(clothBounds[sortedIndices[j]].mLower)[sweepAxis] > axisMax)
                break;

            const SwInterCollisionData& b = cBegin[sortedIndices[j]];

            // check if collision between these shapes is filtered
            if(!filter(a.mUserData, b.mUserData))
                continue;

            // set mask bit for this cloth
            // NOTE(review): a 32-bit mask limits this to 32 cloth instances — confirm
            overlapMask |= 1 << sortedIndices[j];

            // transform bounds from b local space to local space of a
            PxBounds3 lcBounds =
                PxBounds3::centerExtents(b.mBoundsCenter, b.mBoundsHalfExtent + PxVec3(array(colDist)[0]));
            PX_ASSERT(!lcBounds.isEmpty());
            PxBounds3 bLocal = PxBounds3::transformFast(aToLocal * b.mGlobalPose, lcBounds);

            BoundingBox bBounds = {(Simd4f)simd4f(bLocal.minimum.x, bLocal.minimum.y, bLocal.minimum.z, 0.0f),
                                   (Simd4f)simd4f(bLocal.maximum.x, bLocal.maximum.y, bLocal.maximum.z, 0.0f) };

            BoundingBox iBounds = intersectBounds(aBounds, bBounds);

            // setup bounding box w to make point containment test cheaper
            iBounds.mLower = (iBounds.mLower & sMaskXYZ) | ((Simd4f)sMaskW & simd4f(-FLT_MAX));
            iBounds.mUpper = (iBounds.mUpper & sMaskXYZ) | ((Simd4f)sMaskW & simd4f(FLT_MAX));

            if(!isEmptyBounds(iBounds))
                overlapBounds[numOverlaps++] = iBounds;
        }

        //----------------------------------------------------------------
        // cull all particles to overlapping bounds and transform particles to world space

        const uint32_t clothIndex = sortedIndices[i];
        overlapMasks[clothIndex] = overlapMask;

        Simd4f* pBegin = reinterpret_cast<Simd4f*>(a.mParticles);
        Simd4f* qBegin = reinterpret_cast<Simd4f*>(a.mPrevParticles);

        const Simd4f xform[4] = { load(reinterpret_cast<const float*>(&aToWorld.column0)),
                                  load(reinterpret_cast<const float*>(&aToWorld.column1)),
                                  load(reinterpret_cast<const float*>(&aToWorld.column2)),
                                  load(reinterpret_cast<const float*>(&aToWorld.column3)) };

        Simd4f impulseInvScale = recip(Simd4f(simd4f(cBegin[clothIndex].mImpulseScale)));

        for(uint32_t k = 0; k < a.mNumParticles; ++k)
        {
            // mIndices, when present, maps to the subset of particles to test
            Simd4f* pIt = a.mIndices ? pBegin + a.mIndices[k] : pBegin + k;
            Simd4f* qIt = a.mIndices ? qBegin + a.mIndices[k] : qBegin + k;

            const Simd4f p = *pIt;

            for(const BoundingBox* oIt = overlapBounds, *oEnd = overlapBounds + numOverlaps; oIt != oEnd; ++oIt)
            {
                // point in box test
                if(anyGreater(oIt->mLower, p) != 0)
                    continue;
                if(anyGreater(p, oIt->mUpper) != 0)
                    continue;

                // transform particle to world space in-place
                // (will be transformed back after collision)
                *pIt = transform(xform, p);

                // store the scaled impulse (pos - prevPos) rotated to world space
                Simd4f impulse = (p - *qIt) * impulseInvScale;
                *qIt = rotate(xform, impulse);

                // update world bounds
                bounds = expandBounds(bounds, pIt, pIt + 1);

                // add particle to output arrays
                clothIndices[numParticles] = uint16_t(clothIndex);
                particleIndices[numParticles] = uint32_t(pIt - pBegin);

                // output each particle only once
                ++numParticles;
                break;
            }
        }
    }

    allocator.deallocate(sortedIndices);
    allocator.deallocate(overlapBounds);
    allocator.deallocate(clothBounds);

    return numParticles;
}
}

// resolve the i-th potential collider back to its particle storage
template <typename Simd4f>
PX_INLINE Simd4f& cloth::SwInterCollision<Simd4f>::getParticle(uint32_t index)
{
    PX_ASSERT(index < mNumParticles);

    uint16_t clothIndex = mClothIndices[index];
    uint32_t particleIndex = mParticleIndices[index];

    PX_ASSERT(clothIndex < mNumInstances);

    return reinterpret_cast<Simd4f&>(mInstances[clothIndex].mParticles[particleIndex]);
}

template <typename Simd4f>
void cloth::SwInterCollision<Simd4f>::operator()()
{
    mNumTests = mNumCollisions = 0;

    mClothIndices = static_cast<uint16_t*>(mAllocator.allocate(sizeof(uint16_t) * mTotalParticles));
    mParticleIndices = static_cast<uint32_t*>(mAllocator.allocate(sizeof(uint32_t) * mTotalParticles));
    // NOTE(review): sizeof(uint32_t*) looks like a typo for sizeof(uint32_t);
    // it over-allocates on 64-bit platforms (wasteful but harmless) — confirm.
    mOverlapMasks = static_cast<uint32_t*>(mAllocator.allocate(sizeof(uint32_t*) * mNumInstances));

    for(uint32_t k = 0; k <
mNumIterations; ++k) + { + // world bounds of particles + BoundingBox<Simd4f> bounds = emptyBounds<Simd4f>(); + + // calculate potentially colliding set + { +#if PX_PROFILE + ProfileZone zone("cloth::SwInterCollision::BroadPhase", mProfiler); +#endif + + mNumParticles = + calculatePotentialColliders(mInstances, mInstances + mNumInstances, mCollisionDistance, mClothIndices, + mParticleIndices, bounds, mOverlapMasks, mFilter, mAllocator); + } + + // collide + if(mNumParticles) + { +#if PX_PROFILE + ProfileZone zone("cloth::SwInterCollision::Collide", mProfiler); +#endif + + Simd4f lowerBound = bounds.mLower; + Simd4f edgeLength = max(bounds.mUpper - lowerBound, sEpsilon); + + // sweep along longest axis + uint32_t sweepAxis = longestAxis(edgeLength); + uint32_t hashAxis0 = (sweepAxis + 1) % 3; + uint32_t hashAxis1 = (sweepAxis + 2) % 3; + + // reserve 0, 127, and 65535 for sentinel + Simd4f cellSize = max(mCollisionDistance, simd4f(1.0f / 253) * edgeLength); + array(cellSize)[sweepAxis] = array(edgeLength)[sweepAxis] / 65533; + + Simd4f one = simd4f(_1); + Simd4f gridSize = simd4f(254.0f); + array(gridSize)[sweepAxis] = 65534.0f; + + Simd4f gridScale = recipT<1>(cellSize); + Simd4f gridBias = -lowerBound * gridScale + simd4f(_1); + + void* buffer = mAllocator.allocate(getBufferSize(mNumParticles)); + + uint32_t* __restrict sortedIndices = reinterpret_cast<uint32_t*>(buffer); + uint32_t* __restrict sortedKeys = sortedIndices + mNumParticles; + uint32_t* __restrict keys = PxMax(sortedKeys + mNumParticles, sortedIndices + 2 * mNumParticles + 1024); + + typedef typename Simd4fToSimd4i<Simd4f>::Type Simd4i; + + // create keys + for(uint32_t i = 0; i < mNumParticles; ++i) + { + // grid coordinate + Simd4f indexf = getParticle(i) * gridScale + gridBias; + + // need to clamp index because shape collision potentially + // pushes particles outside of their original bounds + Simd4i indexi = intFloor(max(one, min(indexf, gridSize))); + + const int32_t* ptr = 
simdi::array(indexi); + keys[i] = uint32_t(ptr[sweepAxis] | (ptr[hashAxis0] << 16) | (ptr[hashAxis1] << 24)); + } + + // compute sorted keys indices + radixSort(keys, keys + mNumParticles, sortedIndices); + + // snoop histogram: offset of first index with 8 msb > 1 (0 is sentinel) + uint32_t firstColumnSize = sortedIndices[2 * mNumParticles + 769]; + + // sort keys + for(uint32_t i = 0; i < mNumParticles; ++i) + sortedKeys[i] = keys[sortedIndices[i]]; + sortedKeys[mNumParticles] = uint32_t(-1); // sentinel + + // calculate the number of buckets we need to search forward + const Simd4i data = intFloor(gridScale * mCollisionDistance); + uint32_t collisionDistance = uint32_t(2 + simdi::array(data)[sweepAxis]); + + // collide particles + collideParticles(sortedKeys, firstColumnSize, sortedIndices, mNumParticles, collisionDistance); + + mAllocator.deallocate(buffer); + } + + /* + // verify against brute force (disable collision response when testing) + uint32_t numCollisions = mNumCollisions; + mNumCollisions = 0; + + for(uint32_t i = 0; i < mNumParticles; ++i) + for(uint32_t j = i+1; j < mNumParticles; ++j) + if (mOverlapMasks[mClothIndices[i]] & (1 << mClothIndices[j])) + collideParticles(getParticle(i), getParticle(j)); + + static uint32_t iter = 0; ++iter; + if(numCollisions != mNumCollisions) + printf("%u: %u != %u\n", iter, numCollisions, mNumCollisions); + */ + + // transform back to local space + { +#if PX_PROFILE + ProfileZone zone("cloth::SwInterCollision::PostTransform", mProfiler); +#endif + Simd4f toLocal[4], impulseScale; + uint16_t lastCloth = uint16_t(0xffff); + + for(uint32_t i = 0; i < mNumParticles; ++i) + { + uint16_t clothIndex = mClothIndices[i]; + const SwInterCollisionData* instance = mInstances + clothIndex; + + // todo: could pre-compute these inverses + if(clothIndex != lastCloth) + { + const PxMat44 xform(instance->mGlobalPose.getInverse()); + + toLocal[0] = load(reinterpret_cast<const float*>(&xform.column0)); + toLocal[1] = 
load(reinterpret_cast<const float*>(&xform.column1)); + toLocal[2] = load(reinterpret_cast<const float*>(&xform.column2)); + toLocal[3] = load(reinterpret_cast<const float*>(&xform.column3)); + + impulseScale = simd4f(instance->mImpulseScale); + + lastCloth = mClothIndices[i]; + } + + uint32_t particleIndex = mParticleIndices[i]; + Simd4f& particle = reinterpret_cast<Simd4f&>(instance->mParticles[particleIndex]); + Simd4f& impulse = reinterpret_cast<Simd4f&>(instance->mPrevParticles[particleIndex]); + + particle = transform(toLocal, particle); + // avoid w becoming negative due to numerical inaccuracies + impulse = max(sZeroW, particle - rotate(toLocal, Simd4f(impulse * impulseScale))); + } + } + } + + mAllocator.deallocate(mOverlapMasks); + mAllocator.deallocate(mParticleIndices); + mAllocator.deallocate(mClothIndices); +} + +template <typename Simd4f> +size_t cloth::SwInterCollision<Simd4f>::estimateTemporaryMemory(SwInterCollisionData* cloths, uint32_t n) +{ + // count total particles + uint32_t numParticles = 0; + for(uint32_t i = 0; i < n; ++i) + numParticles += cloths[i].mNumParticles; + + uint32_t boundsSize = 2 * n * sizeof(BoundingBox<Simd4f>) + n * sizeof(uint32_t); + uint32_t clothIndicesSize = numParticles * sizeof(uint16_t); + uint32_t particleIndicesSize = numParticles * sizeof(uint32_t); + uint32_t masksSize = n * sizeof(uint32_t); + + return boundsSize + clothIndicesSize + particleIndicesSize + masksSize + getBufferSize(numParticles); +} + +template <typename Simd4f> +size_t cloth::SwInterCollision<Simd4f>::getBufferSize(uint32_t numParticles) +{ + uint32_t keysSize = numParticles * sizeof(uint32_t); + uint32_t indicesSize = numParticles * sizeof(uint32_t); + uint32_t histogramSize = 1024 * sizeof(uint32_t); + + return keysSize + indicesSize + PxMax(indicesSize + histogramSize, keysSize); +} + +template <typename Simd4f> +void cloth::SwInterCollision<Simd4f>::collideParticle(uint32_t index) +{ + uint16_t clothIndex = mClothIndices[index]; + + if((1 
<< clothIndex) & ~mClothMask) + return; + + const SwInterCollisionData* instance = mInstances + clothIndex; + + uint32_t particleIndex = mParticleIndices[index]; + Simd4f& particle = reinterpret_cast<Simd4f&>(instance->mParticles[particleIndex]); + + Simd4f diff = particle - mParticle; + Simd4f distSqr = dot3(diff, diff); + +#if PX_DEBUG + ++mNumTests; +#endif + + if(allGreater(distSqr, mCollisionSquareDistance)) + return; + + Simd4f w0 = splat<3>(mParticle); + Simd4f w1 = splat<3>(particle); + + Simd4f ratio = mCollisionDistance * rsqrtT<1>(distSqr); + Simd4f scale = mStiffness * recipT<1>(sEpsilon + w0 + w1); + Simd4f delta = (scale * (diff - diff * ratio)) & sMaskXYZ; + + mParticle = mParticle + delta * w0; + particle = particle - delta * w1; + + Simd4f& impulse = reinterpret_cast<Simd4f&>(instance->mPrevParticles[particleIndex]); + + mImpulse = mImpulse + delta * w0; + impulse = impulse - delta * w1; + +#if PX_DEBUG || PX_PROFILE + ++mNumCollisions; +#endif +} + +template <typename Simd4f> +void cloth::SwInterCollision<Simd4f>::collideParticles(const uint32_t* keys, uint32_t firstColumnSize, + const uint32_t* indices, uint32_t numParticles, + uint32_t collisionDistance) +{ + const uint32_t bucketMask = uint16_t(-1); + + const uint32_t keyOffsets[] = { 0, 0x00010000, 0x00ff0000, 0x01000000, 0x01010000 }; + + const uint32_t* __restrict kFirst[5]; + const uint32_t* __restrict kLast[5]; + + { + // optimization: scan forward iterator starting points once instead of 9 times + const uint32_t* __restrict kIt = keys; + + uint32_t key = *kIt; + uint32_t firstKey = key - PxMin(collisionDistance, key & bucketMask); + uint32_t lastKey = PxMin(key + collisionDistance, key | bucketMask); + + kFirst[0] = kIt; + while(*kIt < lastKey) + ++kIt; + kLast[0] = kIt; + + for(uint32_t k = 1; k < 5; ++k) + { + for(uint32_t n = firstKey + keyOffsets[k]; *kIt < n;) + ++kIt; + kFirst[k] = kIt; + + for(uint32_t n = lastKey + keyOffsets[k]; *kIt < n;) + ++kIt; + kLast[k] = kIt; + + // jump 
forward once to second column + kIt = keys + firstColumnSize; + firstColumnSize = 0; + } + } + + const uint32_t* __restrict iIt = indices; + const uint32_t* __restrict iEnd = indices + numParticles; + + const uint32_t* __restrict jIt; + const uint32_t* __restrict jEnd; + + for(; iIt != iEnd; ++iIt, ++kFirst[0]) + { + // load current particle once outside of inner loop + uint32_t index = *iIt; + PX_ASSERT(index < mNumParticles); + mClothIndex = mClothIndices[index]; + PX_ASSERT(mClothIndex < mNumInstances); + mClothMask = mOverlapMasks[mClothIndex]; + + const SwInterCollisionData* instance = mInstances + mClothIndex; + + mParticleIndex = mParticleIndices[index]; + mParticle = reinterpret_cast<const Simd4f&>(instance->mParticles[mParticleIndex]); + mImpulse = reinterpret_cast<const Simd4f&>(instance->mPrevParticles[mParticleIndex]); + + uint32_t key = *kFirst[0]; + + // range of keys we need to check against for this particle + uint32_t firstKey = key - PxMin(collisionDistance, key & bucketMask); + uint32_t lastKey = PxMin(key + collisionDistance, key | bucketMask); + + // scan forward end point + while(*kLast[0] < lastKey) + ++kLast[0]; + + // process potential colliders of same cell + jEnd = indices + (kLast[0] - keys); + for(jIt = iIt + 1; jIt != jEnd; ++jIt) + collideParticle(*jIt); + + // process neighbor cells + for(uint32_t k = 1; k < 5; ++k) + { + // scan forward start point + for(uint32_t n = firstKey + keyOffsets[k]; *kFirst[k] < n;) + ++kFirst[k]; + + // scan forward end point + for(uint32_t n = lastKey + keyOffsets[k]; *kLast[k] < n;) + ++kLast[k]; + + // process potential colliders + jEnd = indices + (kLast[k] - keys); + for(jIt = indices + (kFirst[k] - keys); jIt != jEnd; ++jIt) + collideParticle(*jIt); + } + + // write back particle and impulse + reinterpret_cast<Simd4f&>(instance->mParticles[mParticleIndex]) = mParticle; + reinterpret_cast<Simd4f&>(instance->mPrevParticles[mParticleIndex]) = mImpulse; + } +} + +// explicit template instantiation +#if 
NVMATH_SIMD +template class cloth::SwInterCollision<Simd4f>; +#endif +#if NVMATH_SCALAR +template class cloth::SwInterCollision<Scalar4f>; +#endif diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwInterCollision.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwInterCollision.h new file mode 100644 index 00000000..ffc62eb1 --- /dev/null +++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwInterCollision.h @@ -0,0 +1,140 @@ +/* + * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. + */ + +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 

#pragma once

#include "Types.h"

#include "StackAllocator.h"

#include "Simd4i.h"

#include "PxMat44.h"
#include "PxTransform.h"
#include "PxBounds3.h"

namespace physx
{
    namespace profile
    {
        class PxProfileZone;
    }
}

namespace nvidia
{
namespace cloth
{

class SwCloth;
struct SwClothData;

// 16-byte-aligned stack allocator used for all per-frame scratch memory
typedef StackAllocator<16> SwKernelAllocator;

// returns false if collision between the two cloths should be skipped
typedef bool (*InterCollisionFilter)(void* cloth0, void* cloth1);

// Per-cloth view handed to SwInterCollision: raw particle arrays, an optional
// index subset, and the cloth's world pose and (local-space) bounds.
struct SwInterCollisionData
{
    SwInterCollisionData()
    {
    }
    SwInterCollisionData(PxVec4* particles, PxVec4* prevParticles, uint32_t numParticles, uint32_t* indices,
                         const PxTransform& globalPose, const PxVec3& boundsCenter, const PxVec3& boundsHalfExtents,
                         float impulseScale, void* userData)
    : mParticles(particles)
    , mPrevParticles(prevParticles)
    , mNumParticles(numParticles)
    , mIndices(indices)
    , mGlobalPose(globalPose)
    , mBoundsCenter(boundsCenter)
    , mBoundsHalfExtent(boundsHalfExtents)
    , mImpulseScale(impulseScale)
    , mUserData(userData)
    {
    }

    PxVec4* mParticles;       // current positions, w = inverse mass
    PxVec4* mPrevParticles;   // previous positions (used to derive impulses)
    uint32_t mNumParticles;   // number of entries considered for collision
    uint32_t* mIndices;       // optional subset of particle indices; NULL = all
    PxTransform mGlobalPose;  // cloth local-to-world transform
    PxVec3 mBoundsCenter;     // particle bounds center (cloth local space)
    PxVec3 mBoundsHalfExtent; // particle bounds half extent (cloth local space)
    float mImpulseScale;      // scales impulses exchanged with this cloth
    void* mUserData;          // passed to the InterCollisionFilter callback
};

// Collides particles of multiple cloth instances against each other using a
// swept spatial hash; implementation in SwInterCollision.cpp (templated on
// the SIMD vector type).
template <typename Simd4f>
class SwInterCollision
{

  public:
    SwInterCollision(const SwInterCollisionData* cloths, uint32_t n, float colDist, float stiffness, uint32_t iterations,
                     InterCollisionFilter filter, cloth::SwKernelAllocator& alloc, nvidia::profile::PxProfileZone* zone);

    ~SwInterCollision();

    // runs the configured number of inter-collision iterations
    void operator()();

    // upper bound on scratch memory operator() will request from the allocator
    static size_t estimateTemporaryMemory(SwInterCollisionData* cloths, uint32_t n);

  private:
    SwInterCollision& operator=(const SwInterCollision&); // not implemented

    static size_t getBufferSize(uint32_t);

    void collideParticles(const uint32_t* keys, uint32_t firstColumnSize, const uint32_t* sortedIndices,
                          uint32_t numParticles, uint32_t collisionDistance);

    Simd4f& getParticle(uint32_t index);

    // better wrap these in a struct
    void collideParticle(uint32_t index);

    // currently processed particle, cached across the inner collision loop
    Simd4f mParticle;
    Simd4f mImpulse;

    Simd4f mCollisionDistance;
    Simd4f mCollisionSquareDistance;
    Simd4f mStiffness;

    uint16_t mClothIndex;
    uint32_t mClothMask;   // bitmask of cloths overlapping the current cloth
    uint32_t mParticleIndex;

    uint32_t mNumIterations;

    const SwInterCollisionData* mInstances;
    uint32_t mNumInstances;

    // culled particle set built each iteration by the broad phase
    uint16_t* mClothIndices;
    uint32_t* mParticleIndices;
    uint32_t mNumParticles;
    uint32_t* mOverlapMasks;

    uint32_t mTotalParticles;

    InterCollisionFilter mFilter;

    SwKernelAllocator& mAllocator;

    profile::PxProfileZone* mProfiler;

  public:
    mutable uint32_t mNumTests;
    mutable uint32_t mNumCollisions;
};

} // namespace cloth

} // namespace nvidia
diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwSelfCollision.cpp b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwSelfCollision.cpp
new file mode 100644
index 00000000..939543f4
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwSelfCollision.cpp
@@ -0,0 +1,404 @@
/*
 * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
 *
 * NVIDIA CORPORATION and its licensors retain all intellectual property
 * and proprietary rights in and to this software, related documentation
 * and any modifications thereto. Any use, reproduction, disclosure or
 * distribution of this software and related documentation without an express
 * license agreement from NVIDIA CORPORATION is strictly prohibited.
 */

// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+ +#include "SwSelfCollision.h" +#include "SwCloth.h" +#include "SwClothData.h" +#include "PsIntrinsics.h" +#include "SwCollision.h" //temp fix, needed by SwCollisionHelper implementaitons +#include "Simd4f.h" +#include "SwCollisionHelpers.h" + +#pragma warning(disable:4127) + +using namespace nvidia; +using namespace nvidia; + +namespace +{ +typedef Simd4fFactory<detail::FourTuple> Simd4fConstant; + +const Simd4fConstant sMaskXYZ = simd4f(simd4i(~0, ~0, ~0, 0)); +const Simd4fConstant sEpsilon = simd4f(FLT_EPSILON); + +// returns sorted indices, output needs to be at least 2*(last-first)+1024 +void radixSort(const uint32_t* first, const uint32_t* last, uint16_t* out) +{ + uint16_t n = uint16_t(last - first); + + uint16_t* buffer = out + 2 * n; + uint16_t* __restrict histograms[] = { buffer, buffer + 256, buffer + 512, buffer + 768 }; + + intrinsics::memZero(buffer, 1024 * sizeof(uint16_t)); + + // build 3 histograms in one pass + for(const uint32_t* __restrict it = first; it != last; ++it) + { + uint32_t key = *it; + ++histograms[0][0xff & key]; + ++histograms[1][0xff & (key >> 8)]; + ++histograms[2][0xff & (key >> 16)]; + ++histograms[3][key >> 24]; + } + + // convert histograms to offset tables in-place + uint16_t sums[4] = {}; + for(uint32_t i = 0; i < 256; ++i) + { + uint16_t temp0 = uint16_t(histograms[0][i] + sums[0]); + histograms[0][i] = sums[0], sums[0] = temp0; + + uint16_t temp1 = uint16_t(histograms[1][i] + sums[1]); + histograms[1][i] = sums[1], sums[1] = temp1; + + uint16_t temp2 = uint16_t(histograms[2][i] + sums[2]); + histograms[2][i] = sums[2], sums[2] = temp2; + + uint16_t temp3 = uint16_t(histograms[3][i] + sums[3]); + histograms[3][i] = sums[3], sums[3] = temp3; + } + + PX_ASSERT(sums[0] == n && sums[1] == n && sums[2] == n && sums[3] == n); + +#if PX_DEBUG + memset(out, 0xff, 2 * n * sizeof(uint16_t)); +#endif + + // sort 8 bits per pass + + uint16_t* __restrict indices[] = { out, out + n }; + + for(uint16_t i = 0; i != n; ++i) + 
indices[1][histograms[0][0xff & first[i]]++] = i; + + for(uint16_t i = 0, index; index = indices[1][i], i != n; ++i) + indices[0][histograms[1][0xff & (first[index] >> 8)]++] = index; + + for(uint16_t i = 0, index; index = indices[0][i], i != n; ++i) + indices[1][histograms[2][0xff & (first[index] >> 16)]++] = index; + + for(uint16_t i = 0, index; index = indices[1][i], i != n; ++i) + indices[0][histograms[3][first[index] >> 24]++] = index; +} + +template <typename Simd4f> +uint32_t longestAxis(const Simd4f& edgeLength) +{ + const float* e = array(edgeLength); + + if(e[0] > e[1]) + return uint32_t(e[0] > e[2] ? 0 : 2); + else + return uint32_t(e[1] > e[2] ? 1 : 2); +} + +bool isSelfCollisionEnabled(const cloth::SwClothData& cloth) +{ + return PxMin(cloth.mSelfCollisionDistance, cloth.mSelfCollisionStiffness) > 0.0f; +} + +bool isSelfCollisionEnabled(const cloth::SwCloth& cloth) +{ + return PxMin(cloth.mSelfCollisionDistance, -cloth.mSelfCollisionLogStiffness) > 0.0f; +} + +inline uint32_t align2(uint32_t x) +{ + return (x + 1) & ~1; +} + +} // anonymous namespace + +template <typename Simd4f> +cloth::SwSelfCollision<Simd4f>::SwSelfCollision(cloth::SwClothData& clothData, cloth::SwKernelAllocator& alloc) +: mClothData(clothData), mAllocator(alloc) +{ + mCollisionDistance = simd4f(mClothData.mSelfCollisionDistance); + mCollisionSquareDistance = mCollisionDistance * mCollisionDistance; + mStiffness = (Simd4f)sMaskXYZ & simd4f(mClothData.mSelfCollisionStiffness); +} + +template <typename Simd4f> +cloth::SwSelfCollision<Simd4f>::~SwSelfCollision() +{ +} + +template <typename Simd4f> +void cloth::SwSelfCollision<Simd4f>::operator()() +{ + mNumTests = mNumCollisions = 0; + + if(!isSelfCollisionEnabled(mClothData)) + return; + + Simd4f lowerBound = load(mClothData.mCurBounds); + Simd4f edgeLength = max(load(mClothData.mCurBounds + 3) - lowerBound, sEpsilon); + + // sweep along longest axis + uint32_t sweepAxis = longestAxis(edgeLength); + uint32_t hashAxis0 = (sweepAxis + 
1) % 3; + uint32_t hashAxis1 = (sweepAxis + 2) % 3; + + // reserve 0, 127, and 65535 for sentinel + Simd4f cellSize = max(mCollisionDistance, simd4f(1.0f / 253) * edgeLength); + array(cellSize)[sweepAxis] = array(edgeLength)[sweepAxis] / 65533; + + Simd4f one = simd4f(_1); + Simd4f gridSize = simd4f(254.0f); + array(gridSize)[sweepAxis] = 65534.0f; + + Simd4f gridScale = recipT<1>(cellSize); + Simd4f gridBias = -lowerBound * gridScale + simd4f(_1); + + uint32_t numIndices = mClothData.mNumSelfCollisionIndices; + void* buffer = mAllocator.allocate(getBufferSize(numIndices)); + + const uint32_t* __restrict indices = mClothData.mSelfCollisionIndices; + uint32_t* __restrict keys = reinterpret_cast<uint32_t*>(buffer); + uint16_t* __restrict sortedIndices = reinterpret_cast<uint16_t*>(keys + numIndices); + uint32_t* __restrict sortedKeys = reinterpret_cast<uint32_t*>(sortedIndices + align2(numIndices)); + + const Simd4f* particles = reinterpret_cast<const Simd4f*>(mClothData.mCurParticles); + + // create keys + for(uint32_t i = 0; i < numIndices; ++i) + { + uint32_t index = indices ? 
indices[i] : i; + + // grid coordinate + Simd4f keyf = particles[index] * gridScale + gridBias; + + // need to clamp index because shape collision potentially + // pushes particles outside of their original bounds + Simd4i keyi = intFloor(max(one, min(keyf, gridSize))); + + const int32_t* ptr = simdi::array(keyi); + keys[i] = uint32_t(ptr[sweepAxis] | (ptr[hashAxis0] << 16) | (ptr[hashAxis1] << 24)); + } + + // compute sorted keys indices + radixSort(keys, keys + numIndices, sortedIndices); + + // snoop histogram: offset of first index with 8 msb > 1 (0 is sentinel) + uint16_t firstColumnSize = sortedIndices[2 * numIndices + 769]; + + // sort keys + for(uint32_t i = 0; i < numIndices; ++i) + sortedKeys[i] = keys[sortedIndices[i]]; + sortedKeys[numIndices] = uint32_t(-1); // sentinel + + if(indices) + { + // sort indices (into no-longer-needed keys array) + const uint16_t* __restrict permutation = sortedIndices; + sortedIndices = reinterpret_cast<uint16_t*>(keys); + for(uint32_t i = 0; i < numIndices; ++i) + sortedIndices[i] = uint16_t(indices[permutation[i]]); + } + + // calculate the number of buckets we need to search forward + const Simd4i data = intFloor(gridScale * mCollisionDistance); + uint32_t collisionDistance = 2 + (uint32_t)simdi::array(data)[sweepAxis]; + + // collide particles + if(mClothData.mRestPositions) + collideParticles<true>(sortedKeys, firstColumnSize, sortedIndices, collisionDistance); + else + collideParticles<false>(sortedKeys, firstColumnSize, sortedIndices, collisionDistance); + + mAllocator.deallocate(buffer); + + // verify against brute force (disable collision response when testing) + /* + uint32_t numCollisions = mNumCollisions; + mNumCollisions = 0; + + Simd4f* qarticles = reinterpret_cast< + Simd4f*>(mClothData.mCurParticles); + for(uint32_t i = 0; i < numIndices; ++i) + { + uint32_t indexI = indices ? indices[i] : i; + for(uint32_t j = i+1; j < numIndices; ++j) + { + uint32_t indexJ = indices ? 
indices[j] : j; + collideParticles(qarticles[indexI], qarticles[indexJ]); + } + } + + static uint32_t iter = 0; ++iter; + if(numCollisions != mNumCollisions) + printf("%u: %u != %u\n", iter, numCollisions, mNumCollisions); + */ +} + +template <typename Simd4f> +size_t cloth::SwSelfCollision<Simd4f>::estimateTemporaryMemory(const SwCloth& cloth) +{ + uint32_t numIndices = + cloth.mSelfCollisionIndices.empty() ? cloth.mCurParticles.size() : cloth.mSelfCollisionIndices.size(); + return isSelfCollisionEnabled(cloth) ? getBufferSize(numIndices) : 0; +} + +template <typename Simd4f> +size_t cloth::SwSelfCollision<Simd4f>::getBufferSize(uint32_t numIndices) +{ + uint32_t keysSize = numIndices * sizeof(uint32_t); + uint32_t indicesSize = align2(numIndices) * sizeof(uint16_t); + uint32_t radixSize = (numIndices + 1024) * sizeof(uint16_t); + return keysSize + indicesSize + PxMax(radixSize, keysSize + uint32_t(sizeof(uint32_t))); +} + +template <typename Simd4f> +template <bool useRestParticles> +void cloth::SwSelfCollision<Simd4f>::collideParticles(Simd4f& pos0, Simd4f& pos1, const Simd4f& pos0rest, + const Simd4f& pos1rest) +{ + Simd4f diff = pos1 - pos0; + Simd4f distSqr = dot3(diff, diff); + +#if PX_DEBUG + ++mNumTests; +#endif + + if(allGreater(distSqr, mCollisionSquareDistance)) + return; + + if(useRestParticles) + { + // calculate distance in rest configuration, if less than collision + // distance then ignore collision between particles in deformed config + Simd4f restDiff = pos1rest - pos0rest; + Simd4f restDistSqr = dot3(restDiff, restDiff); + + if(allGreater(mCollisionSquareDistance, restDistSqr)) + return; + } + + Simd4f w0 = splat<3>(pos0); + Simd4f w1 = splat<3>(pos1); + + Simd4f ratio = mCollisionDistance * rsqrt(distSqr); + Simd4f scale = mStiffness * recip(sEpsilon + w0 + w1); + Simd4f delta = (scale * (diff - diff * ratio)) & sMaskXYZ; + + pos0 = pos0 + delta * w0; + pos1 = pos1 - delta * w1; + +#if PX_DEBUG || PX_PROFILE + ++mNumCollisions; +#endif +} + 

// Narrow phase over the sorted key array: for each particle, tests candidates
// in the same grid cell and in four neighbor cells (keyOffsets) along the two
// hash axes, scanning forward along the sweep axis by 'collisionDistance'
// cells. kFirst/kLast are five forward-only cursors into the key array.
template <typename Simd4f>
template <bool useRestParticles>
void cloth::SwSelfCollision<Simd4f>::collideParticles(const uint32_t* keys, uint16_t firstColumnSize,
                                                      const uint16_t* indices, uint32_t collisionDistance)
{
    Simd4f* __restrict particles = reinterpret_cast<Simd4f*>(mClothData.mCurParticles);
    // when rest positions are unused, alias them to the current particles so
    // the pairwise test can be compiled without branching on the pointer
    Simd4f* __restrict restParticles =
        useRestParticles ? reinterpret_cast<Simd4f*>(mClothData.mRestPositions) : particles;

    // low 16 key bits hold the sweep-axis cell
    const uint32_t bucketMask = uint16_t(-1);

    // cell offsets: same column, +1/-1/+256/+257 in the two hash axes
    const uint32_t keyOffsets[] = { 0, 0x00010000, 0x00ff0000, 0x01000000, 0x01010000 };

    const uint32_t* __restrict kFirst[5];
    const uint32_t* __restrict kLast[5];

    {
        // optimization: scan forward iterator starting points once instead of 9 times
        const uint32_t* __restrict kIt = keys;

        uint32_t key = *kIt;
        uint32_t firstKey = key - PxMin(collisionDistance, key & bucketMask);
        uint32_t lastKey = PxMin(key + collisionDistance, key | bucketMask);

        kFirst[0] = kIt;
        while(*kIt < lastKey)
            ++kIt;
        kLast[0] = kIt;

        for(uint32_t k = 1; k < 5; ++k)
        {
            for(uint32_t n = firstKey + keyOffsets[k]; *kIt < n;)
                ++kIt;
            kFirst[k] = kIt;

            for(uint32_t n = lastKey + keyOffsets[k]; *kIt < n;)
                ++kIt;
            kLast[k] = kIt;

            // jump forward once to second column
            kIt = keys + firstColumnSize;
            firstColumnSize = 0;
        }
    }

    const uint16_t* __restrict iIt = indices;
    const uint16_t* __restrict iEnd = indices + mClothData.mNumSelfCollisionIndices;

    const uint16_t* __restrict jIt;
    const uint16_t* __restrict jEnd;

    for(; iIt != iEnd; ++iIt, ++kFirst[0])
    {
        PX_ASSERT(*iIt < mClothData.mNumParticles);

        // load current particle once outside of inner loop
        Simd4f particle = particles[*iIt];
        Simd4f restParticle = restParticles[*iIt];

        uint32_t key = *kFirst[0];

        // range of keys we need to check against for this particle
        uint32_t firstKey = key - PxMin(collisionDistance, key & bucketMask);
        uint32_t lastKey = PxMin(key + collisionDistance, key | bucketMask);

        // scan forward end point
        while(*kLast[0] < lastKey)
            ++kLast[0];

        // process potential colliders of same cell
        jEnd = indices + (kLast[0] - keys);
        for(jIt = iIt + 1; jIt != jEnd; ++jIt)
            collideParticles<useRestParticles>(particle, particles[*jIt], restParticle, restParticles[*jIt]);

        // process neighbor cells
        for(uint32_t k = 1; k < 5; ++k)
        {
            // scan forward start point
            for(uint32_t n = firstKey + keyOffsets[k]; *kFirst[k] < n;)
                ++kFirst[k];

            // scan forward end point
            for(uint32_t n = lastKey + keyOffsets[k]; *kLast[k] < n;)
                ++kLast[k];

            // process potential colliders
            jEnd = indices + (kLast[k] - keys);
            for(jIt = indices + (kFirst[k] - keys); jIt != jEnd; ++jIt)
                collideParticles<useRestParticles>(particle, particles[*jIt], restParticle, restParticles[*jIt]);
        }

        // store current particle
        particles[*iIt] = particle;
    }
}

// explicit template instantiation
#if NVMATH_SIMD
template class cloth::SwSelfCollision<Simd4f>;
#endif
#if NVMATH_SCALAR
template class cloth::SwSelfCollision<Scalar4f>;
#endif
diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwSelfCollision.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwSelfCollision.h
new file mode 100644
index 00000000..fa023e56
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwSelfCollision.h
@@ -0,0 +1,68 @@
/*
 * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
 *
 * NVIDIA CORPORATION and its licensors retain all intellectual property
 * and proprietary rights in and to this software, related documentation
 * and any modifications thereto. Any use, reproduction, disclosure or
 * distribution of this software and related documentation without an express
 * license agreement from NVIDIA CORPORATION is strictly prohibited.
 */

// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+ +#pragma once + +#include "Types.h" +#include "StackAllocator.h" +#include "Simd4i.h" + +namespace nvidia +{ + +namespace cloth +{ + +class SwCloth; +struct SwClothData; + +typedef StackAllocator<16> SwKernelAllocator; + +template <typename Simd4f> +class SwSelfCollision +{ + typedef typename Simd4fToSimd4i<Simd4f>::Type Simd4i; + + public: + SwSelfCollision(SwClothData& clothData, SwKernelAllocator& alloc); + ~SwSelfCollision(); + + void operator()(); + + static size_t estimateTemporaryMemory(const SwCloth&); + + private: + SwSelfCollision& operator=(const SwSelfCollision&); // not implemented + static size_t getBufferSize(uint32_t); + + template <bool useRestParticles> + void collideParticles(Simd4f&, Simd4f&, const Simd4f&, const Simd4f&); + + template <bool useRestParticles> + void collideParticles(const uint32_t*, uint16_t, const uint16_t*, uint32_t); + + Simd4f mCollisionDistance; + Simd4f mCollisionSquareDistance; + Simd4f mStiffness; + + SwClothData& mClothData; + SwKernelAllocator& mAllocator; + + public: + mutable uint32_t mNumTests; + mutable uint32_t mNumCollisions; +}; + +} // namespace cloth + +} // namespace nvidia diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwSolver.cpp b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwSolver.cpp new file mode 100644 index 00000000..35cb1bde --- /dev/null +++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwSolver.cpp @@ -0,0 +1,398 @@ +/* + * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. + */ + +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. 
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.

#include "SwSolver.h"
#include "SwCloth.h"
#include "ClothImpl.h"
#include "SwFabric.h"
#include "SwFactory.h"
#include "SwClothData.h"
#include "SwSolverKernel.h"
#include "SwInterCollision.h"
#include "IterationState.h"
#include "PxCpuDispatcher.h"
#include "PxProfileZone.h"
#include "PsFPU.h"
#include "PsSort.h"

namespace nvidia
{
namespace cloth
{
// NEON solver entry point, defined elsewhere
bool neonSolverKernel(SwCloth const&, SwClothData&, SwKernelAllocator&, IterationStateFactory&, profile::PxProfileZone*);
}
}

// vector type used for inter-collision, depending on build configuration
#if NVMATH_SIMD
typedef Simd4f Simd4fType;
#else
typedef Scalar4f Simd4fType;
#endif

using namespace nvidia;

cloth::SwSolver::SwSolver(nvidia::profile::PxProfileZone* profiler, PxTaskManager* taskMgr)
: mProfiler(profiler)
, mSimulateEventId(mProfiler ? mProfiler->getEventIdForName("cloth::SwSolver::simulate") : uint16_t(-1))
#if APEX_UE4
, mDt(0.0f)
#endif
, mInterCollisionDistance(0.0f)
, mInterCollisionStiffness(1.0f)
, mInterCollisionIterations(1)
, mInterCollisionScratchMem(NULL)
, mInterCollisionScratchMemSize(0)
{
    mStartSimulationTask.mSolver = this;
    mEndSimulationTask.mSolver = this;

    PX_UNUSED(taskMgr);
}

cloth::SwSolver::~SwSolver()
{
    if(mInterCollisionScratchMem)
        PX_FREE(mInterCollisionScratchMem);

    // all cloths must have been removed before the solver is destroyed
    PX_ASSERT(mCpuClothSimulationTasks.empty());
}

namespace
{
// orders simulation tasks by descending particle count (biggest cloth first)
template <typename T>
bool clothSizeGreater(const T& t0, const T& t1)
{
#if APEX_UE4
    return t0->mCloth->mCurParticles.size() > t1->mCloth->mCurParticles.size();
#else
    return t0.mCloth->mCurParticles.size() > t1.mCloth->mCurParticles.size();
#endif
}

template <typename T>
void sortTasks(nvidia::Array<T, nvidia::NonTrackingAllocator>& tasks)
{
    nvidia::sort(tasks.begin(), tasks.size(), &clothSizeGreater<T>);
}
}

// Registers a cloth with the solver; tasks are kept sorted by size so the
// largest cloths are simulated first.
void cloth::SwSolver::addCloth(Cloth* cloth)
{
    SwCloth& swCloth = static_cast<SwClothImpl&>(*cloth).mCloth;

#if APEX_UE4
    mCpuClothSimulationTasks.pushBack(new CpuClothSimulationTask(swCloth, *this));
#else
    mCpuClothSimulationTasks.pushBack(CpuClothSimulationTask(swCloth, mEndSimulationTask));
#endif

    sortTasks(mCpuClothSimulationTasks);
}

// Unregisters a cloth; no-op if the cloth was never added.
void cloth::SwSolver::removeCloth(Cloth* cloth)
{
    SwCloth& swCloth = static_cast<SwClothImpl&>(*cloth).mCloth;

    CpuClothSimulationTaskVector::Iterator tIt = mCpuClothSimulationTasks.begin();
    CpuClothSimulationTaskVector::Iterator tEnd = mCpuClothSimulationTasks.end();

    while (tIt != tEnd &&
#if APEX_UE4
        (*tIt)->mCloth != &swCloth
#else
        tIt->mCloth != &swCloth
#endif
        )
        ++tIt;

    if(tIt != tEnd)
    {
#if APEX_UE4
        delete *tIt;
#else
        deallocate(tIt->mScratchMemory);
#endif
        mCpuClothSimulationTasks.replaceWithLast(tIt);
        sortTasks(mCpuClothSimulationTasks);
    }
}

// Chains the start/end simulation tasks onto 'continuation' and returns the
// task to run; returns 'continuation' directly when there is nothing to do.
PxBaseTask& cloth::SwSolver::simulate(float dt, PxBaseTask& continuation)
{
    if (mCpuClothSimulationTasks.empty()
#if APEX_UE4
        || dt == 0.0f
#endif
        )
    {
        continuation.addReference();
        return continuation;
    }

    mEndSimulationTask.setContinuation(&continuation);
#if APEX_UE4
    mDt = dt;
#else
    mEndSimulationTask.mDt = dt;
#endif

    mStartSimulationTask.setContinuation(&mEndSimulationTask);

    mEndSimulationTask.removeReference();

    return mStartSimulationTask;
}

// Runs inter-cloth collision over all registered cloths (no-op unless both
// an iteration count and a positive collision distance are configured).
void cloth::SwSolver::interCollision()
{
    if(!mInterCollisionIterations || mInterCollisionDistance == 0.0f)
        return;

    float elasticity = 1.0f;

    // rebuild cloth instance array
    mInterCollisionInstances.resize(0);
    for(uint32_t i = 0; i < mCpuClothSimulationTasks.size(); ++i)
    {
#if APEX_UE4
        SwCloth* c = mCpuClothSimulationTasks[i]->mCloth;
        float invNumIterations = mCpuClothSimulationTasks[i]->mInvNumIterations;
#else
        SwCloth* c = mCpuClothSimulationTasks[i].mCloth;
        float invNumIterations = mCpuClothSimulationTasks[i].mInvNumIterations;
#endif

        // use the self-collision index subset when present, all particles otherwise
        mInterCollisionInstances.pushBack(SwInterCollisionData(
            c->mCurParticles.begin(), c->mPrevParticles.begin(),
            c->mSelfCollisionIndices.empty() ? c->mCurParticles.size() : c->mSelfCollisionIndices.size(),
            c->mSelfCollisionIndices.empty() ? NULL : &c->mSelfCollisionIndices[0], c->mTargetMotion,
            c->mParticleBoundsCenter, c->mParticleBoundsHalfExtent, elasticity * invNumIterations, c->mUserData));
    }

    const uint32_t requiredTempMemorySize = uint32_t(SwInterCollision<Simd4fType>::estimateTemporaryMemory(
        &mInterCollisionInstances[0], mInterCollisionInstances.size()));

    // realloc temp memory if necessary (grow-only cache)
    if(mInterCollisionScratchMemSize < requiredTempMemorySize)
    {
        if(mInterCollisionScratchMem)
            PX_FREE(mInterCollisionScratchMem);

        mInterCollisionScratchMem = PX_ALLOC(requiredTempMemorySize, "cloth::SwSolver::mInterCollisionScratchMem");
        mInterCollisionScratchMemSize = requiredTempMemorySize;
    }

    SwKernelAllocator allocator(mInterCollisionScratchMem, mInterCollisionScratchMemSize);

    // run inter-collision
    SwInterCollision<Simd4fType> collider(mInterCollisionInstances.begin(), mInterCollisionInstances.size(),
                                          mInterCollisionDistance, mInterCollisionStiffness, mInterCollisionIterations,
                                          mInterCollisionFilter, allocator, mProfiler);

    collider();
}

void cloth::SwSolver::beginFrame() const
{
    if(mProfiler)
        mProfiler->startEvent(mSimulateEventId, uint64_t(intptr_t(this)), uint32_t(intptr_t(this)));
}

void cloth::SwSolver::endFrame() const
{
    if(mProfiler)
        mProfiler->stopEvent(mSimulateEventId, uint64_t(intptr_t(this)), uint32_t(intptr_t(this)));
}

#if APEX_UE4
void cloth::SwSolver::simulate(void* task, float dt)
{
    if (task)
        static_cast<cloth::SwSolver::CpuClothSimulationTask*>(task)->simulate(dt);
}
#endif

void cloth::SwSolver::StartSimulationTask::runInternal()
{
    mSolver->beginFrame();

    CpuClothSimulationTaskVector::Iterator tIt = mSolver->mCpuClothSimulationTasks.begin();
    CpuClothSimulationTaskVector::Iterator tEnd = mSolver->mCpuClothSimulationTasks.end();

    for(; tIt != tEnd; ++tIt)
    {
#if APEX_UE4
+ if (!(*tIt)->mCloth->isSleeping()) + { + (*tIt)->setContinuation(mCont); + (*tIt)->removeReference(); + } +#else + if(!tIt->mCloth->isSleeping()) + { + tIt->setContinuation(mCont); + tIt->removeReference(); + } +#endif + } +} + +const char* cloth::SwSolver::StartSimulationTask::getName() const +{ + return "cloth.SwSolver.startSimulation"; +} + +void cloth::SwSolver::EndSimulationTask::runInternal() +{ + mSolver->interCollision(); + mSolver->endFrame(); +} + +const char* cloth::SwSolver::EndSimulationTask::getName() const +{ + return "cloth.SwSolver.endSimulation"; +} + +#if !APEX_UE4 +cloth::SwSolver::CpuClothSimulationTask::CpuClothSimulationTask(SwCloth& cloth, EndSimulationTask& continuation) +: mCloth(&cloth), mContinuation(&continuation), mScratchMemorySize(0), mScratchMemory(0), mInvNumIterations(0.0f) +{ +} +#endif + +#if APEX_UE4 +cloth::SwSolver::CpuClothSimulationTask::CpuClothSimulationTask(SwCloth& cloth, SwSolver& solver) + : mCloth(&cloth), mSolver(&solver), mScratchMemorySize(0), mScratchMemory(0), mInvNumIterations(0.0f) +{ + mCloth->mSimulationTask = this; +} + +cloth::SwSolver::CpuClothSimulationTask::~CpuClothSimulationTask() +{ + deallocate(mScratchMemory); + mCloth->mSimulationTask = NULL; +} + +void cloth::SwSolver::CpuClothSimulationTask::runInternal() +{ + simulate(mSolver->mDt); +} + + +void cloth::SwSolver::CpuClothSimulationTask::simulate(float dt) +{ + // check if we need to reallocate the temp memory buffer + // (number of shapes may have changed) + uint32_t requiredTempMemorySize = uint32_t(SwSolverKernel<Simd4fType>::estimateTemporaryMemory(*mCloth)); + + if (mScratchMemorySize < requiredTempMemorySize) + { + deallocate(mScratchMemory); + + mScratchMemory = allocate(requiredTempMemorySize); + mScratchMemorySize = requiredTempMemorySize; + } + + IterationStateFactory factory(*mCloth, dt); + mInvNumIterations = factory.mInvNumIterations; + + nvidia::SIMDGuard simdGuard; + + SwClothData data(*mCloth, mCloth->mFabric); + 
SwKernelAllocator allocator(mScratchMemory, uint32_t(mScratchMemorySize)); + nvidia::profile::PxProfileZone* profileZone = mSolver->mProfiler; + + // construct kernel functor and execute +#if PX_ANDROID + // if(!neonSolverKernel(cloth, data, allocator, factory, profileZone)) +#endif + SwSolverKernel<Simd4fType>(*mCloth, data, allocator, factory, profileZone)(); + + data.reconcile(*mCloth); // update cloth + + release(); +} + +#else + +void cloth::SwSolver::CpuClothSimulationTask::runInternal() +{ + // check if we need to reallocate the temp memory buffer + // (number of shapes may have changed) + uint32_t requiredTempMemorySize = uint32_t(SwSolverKernel<Simd4fType>::estimateTemporaryMemory(*mCloth)); + + if(mScratchMemorySize < requiredTempMemorySize) + { + deallocate(mScratchMemory); + + mScratchMemory = allocate(requiredTempMemorySize); + mScratchMemorySize = requiredTempMemorySize; + } + + if(mContinuation->mDt == 0.0f) + return; + + IterationStateFactory factory(*mCloth, mContinuation->mDt); + mInvNumIterations = factory.mInvNumIterations; + + nvidia::SIMDGuard simdGuard; + + SwClothData data(*mCloth, mCloth->mFabric); + SwKernelAllocator allocator(mScratchMemory, uint32_t(mScratchMemorySize)); + nvidia::profile::PxProfileZone* profileZone = mContinuation->mSolver->mProfiler; + + // construct kernel functor and execute +#if PX_ANDROID + // if(!neonSolverKernel(cloth, data, allocator, factory, profileZone)) +#endif + SwSolverKernel<Simd4fType>(*mCloth, data, allocator, factory, profileZone)(); + + data.reconcile(*mCloth); // update cloth +} +#endif + +const char* cloth::SwSolver::CpuClothSimulationTask::getName() const +{ + return "cloth.SwSolver.cpuClothSimulation"; +} + +void cloth::SwSolver::CpuClothSimulationTask::release() +{ + mCloth->mMotionConstraints.pop(); + mCloth->mSeparationConstraints.pop(); + + if (!mCloth->mTargetCollisionSpheres.empty()) + { + swap(mCloth->mStartCollisionSpheres, mCloth->mTargetCollisionSpheres); + 
mCloth->mTargetCollisionSpheres.resize(0); + } + + if (!mCloth->mTargetCollisionPlanes.empty()) + { + swap(mCloth->mStartCollisionPlanes, mCloth->mTargetCollisionPlanes); + mCloth->mTargetCollisionPlanes.resize(0); + } + + if (!mCloth->mTargetCollisionTriangles.empty()) + { + swap(mCloth->mStartCollisionTriangles, mCloth->mTargetCollisionTriangles); + mCloth->mTargetCollisionTriangles.resize(0); + } +#if !APEX_UE4 + mContinuation->removeReference(); +#endif +} + +#if APEX_UE4 +void(*const cloth::SwCloth::sSimulationFunction)(void*, float) = &cloth::SwSolver::simulate; +#endif
\ No newline at end of file diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwSolver.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwSolver.h new file mode 100644 index 00000000..472a5dba --- /dev/null +++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwSolver.h @@ -0,0 +1,173 @@ +/* + * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. + */ + +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#pragma once + +#include "Solver.h" +#include "Allocator.h" +#include "SwInterCollision.h" +#include "CmTask.h" + +namespace nvidia +{ +namespace cloth +{ + +class SwCloth; +class SwFactory; + +/// CPU/SSE based cloth solver +class SwSolver : public UserAllocated, public Solver +{ + struct StartSimulationTask : public Cm::Task + { + using PxLightCpuTask::mRefCount; + using PxLightCpuTask::mTm; + + virtual void runInternal(); + virtual const char* getName() const; + + SwSolver* mSolver; + }; + + struct EndSimulationTask : public Cm::Task + { + using PxLightCpuTask::mRefCount; + + virtual void runInternal(); + virtual const char* getName() const; + + SwSolver* mSolver; +#if !APEX_UE4 + float mDt; +#endif + }; + + struct CpuClothSimulationTask : public Cm::Task + { +#if APEX_UE4 + void* operator new(size_t n){ return allocate(n); } + void operator delete(void* ptr) { return deallocate(ptr); } + + CpuClothSimulationTask(SwCloth&, SwSolver&); + ~CpuClothSimulationTask(); + + void simulate(float dt); + + SwSolver* mSolver; +#else + CpuClothSimulationTask(SwCloth&, 
EndSimulationTask&); + + EndSimulationTask* mContinuation; +#endif + virtual void runInternal(); + virtual const char* getName() const; + virtual void release(); + + SwCloth* mCloth; + + uint32_t mScratchMemorySize; + void* mScratchMemory; + float mInvNumIterations; + }; + + public: + SwSolver(nvidia::profile::PxProfileZone*, PxTaskManager*); + virtual ~SwSolver(); + + virtual void addCloth(Cloth*); + virtual void removeCloth(Cloth*); + + virtual PxBaseTask& simulate(float dt, PxBaseTask&); + + virtual void setInterCollisionDistance(float distance) + { + mInterCollisionDistance = distance; + } + virtual float getInterCollisionDistance() const + { + return mInterCollisionDistance; + } + + virtual void setInterCollisionStiffness(float stiffness) + { + mInterCollisionStiffness = stiffness; + } + virtual float getInterCollisionStiffness() const + { + return mInterCollisionStiffness; + } + + virtual void setInterCollisionNbIterations(uint32_t nbIterations) + { + mInterCollisionIterations = nbIterations; + } + virtual uint32_t getInterCollisionNbIterations() const + { + return mInterCollisionIterations; + } + + virtual void setInterCollisionFilter(InterCollisionFilter filter) + { + mInterCollisionFilter = filter; + } + + virtual uint32_t getNumSharedPositions( const Cloth* ) const + { + return uint32_t(-1); + } + + virtual bool hasError() const + { + return false; + } + +#if APEX_UE4 + static void simulate(void*, float); +#endif + + private: + void beginFrame() const; + void endFrame() const; + + void interCollision(); + + private: + StartSimulationTask mStartSimulationTask; + +#if APEX_UE4 + typedef Vector<CpuClothSimulationTask*>::Type CpuClothSimulationTaskVector; + float mDt; +#else + typedef Vector<CpuClothSimulationTask>::Type CpuClothSimulationTaskVector; +#endif + + CpuClothSimulationTaskVector mCpuClothSimulationTasks; + + EndSimulationTask mEndSimulationTask; + + profile::PxProfileZone* mProfiler; + uint16_t mSimulateEventId; + + float mInterCollisionDistance; 
+ float mInterCollisionStiffness; + uint32_t mInterCollisionIterations; + InterCollisionFilter mInterCollisionFilter; + + void* mInterCollisionScratchMem; + uint32_t mInterCollisionScratchMemSize; + nvidia::Array<SwInterCollisionData> mInterCollisionInstances; + +}; +} +} diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwSolverKernel.cpp b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwSolverKernel.cpp new file mode 100644 index 00000000..29f3fdc3 --- /dev/null +++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwSolverKernel.cpp @@ -0,0 +1,695 @@ +/* + * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. + */ + +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 
+ +#include "SwSolverKernel.h" +#include "SwCloth.h" +#include "SwClothData.h" +#include "SwFabric.h" +#include "SwFactory.h" +#include "PointInterpolator.h" +#include "BoundingBox.h" +#include "Simd4i.h" + +#if defined(_MSC_VER) && _MSC_VER >= 1600 && PX_WINDOWS_FAMILY +#define PX_AVX 1 + +namespace avx +{ +// defined in SwSolveConstraints.cpp + +void initialize(); + +template <bool, uint32_t> +void solveConstraints(float* __restrict, const float* __restrict, const float* __restrict, const uint16_t* __restrict, + const __m128&); +} + +namespace +{ +uint32_t getAvxSupport() +{ +// Checking for AVX requires 3 things: +// 1) CPUID indicates that the OS uses XSAVE and XRSTORE +// 2) CPUID indicates support for AVX +// 3) XGETBV indicates registers are saved and restored on context switch + +#if _MSC_FULL_VER < 160040219 || !defined(_XCR_XFEATURE_ENABLED_MASK) + // need at least VC10 SP1 and compile on at least Win7 SP1 + return 0; +#else + int cpuInfo[4]; + __cpuid(cpuInfo, 1); + int avxFlags = 3 << 27; // checking 1) and 2) above + if((cpuInfo[2] & avxFlags) != avxFlags) + return 0; // xgetbv not enabled or no AVX support + + if((_xgetbv(_XCR_XFEATURE_ENABLED_MASK) & 0x6) != 0x6) + return 0; // OS does not save YMM registers + + avx::initialize(); + +#if _MSC_VER < 1700 + return 1; +#else + int fmaFlags = 1 << 12; + if((cpuInfo[2] & fmaFlags) != fmaFlags) + return 1; // no FMA3 support + + /* only using fma at the moment, don't lock out AMD's piledriver by requiring avx2 + __cpuid(cpuInfo, 7); + int avx2Flags = 1 << 5; + if((cpuInfo[1] & avx2Flags) != avx2Flags) + return 1; // no AVX2 support + */ + + return 2; +#endif // _MSC_VER +#endif // _MSC_FULL_VER +} + +const uint32_t sAvxSupport = getAvxSupport(); // 0: no AVX, 1: AVX, 2: AVX+FMA +} +#endif + +using namespace nvidia; + +namespace +{ +/* simd constants */ + +typedef Simd4fFactory<detail::FourTuple> Simd4fConstant; + +const Simd4fConstant sMaskW = simd4f(simd4i(0, 0, 0, ~0)); +const Simd4fConstant sMaskXY = 
simd4f(simd4i(~0, ~0, 0, 0)); +const Simd4fConstant sMaskXYZ = simd4f(simd4i(~0, ~0, ~0, 0)); +const Simd4fConstant sMaskYZW = simd4f(simd4i(0, ~0, ~0, ~0)); +const Simd4fConstant sEpsilon = simd4f(FLT_EPSILON); +const Simd4fConstant sMinusOneXYZOneW = simd4f(-1.0f, -1.0f, -1.0f, 1.0f); +const Simd4fConstant sFloatMaxW = simd4f(0.0f, 0.0f, 0.0f, FLT_MAX); +const Simd4fConstant sMinusFloatMaxXYZ = simd4f(-FLT_MAX, -FLT_MAX, -FLT_MAX, 0.0f); + +/* static worker functions */ + +/** + This function performs explicit Euler integration based on position, where + x_next = x_cur + (x_cur - x_prev) * dt_cur/dt_prev * damping + g * dt * dt + The g * dt * dt term is folded into accelIt. + */ + +template <typename Simd4f, typename AccelerationIterator> +void integrateParticles(Simd4f* __restrict curIt, Simd4f* __restrict curEnd, Simd4f* __restrict prevIt, Simd4f scale, + const AccelerationIterator& aIt, const Simd4f& prevBias) +{ + // local copy to avoid LHS + AccelerationIterator accelIt(aIt); + + for(; curIt != curEnd; ++curIt, ++prevIt, ++accelIt) + { + Simd4f current = *curIt; + Simd4f previous = *prevIt; + // if(current.w == 0) current.w = previous.w + current = select(current > sMinusFloatMaxXYZ, current, previous); + Simd4f finiteMass = splat<3>(previous) > sFloatMaxW; + Simd4f delta = (current - previous) * scale + *accelIt; + *curIt = current + (delta & finiteMass); + *prevIt = select(sMaskW, previous, current) + (prevBias & finiteMass); + } +} + +template <typename Simd4f, typename AccelerationIterator> +void integrateParticles(Simd4f* __restrict curIt, Simd4f* __restrict curEnd, Simd4f* __restrict prevIt, + const Simd4f (&prevMatrix)[3], const Simd4f (&curMatrix)[3], const AccelerationIterator& aIt, + const Simd4f& prevBias) +{ + // local copy to avoid LHS + AccelerationIterator accelIt(aIt); + + for(; curIt != curEnd; ++curIt, ++prevIt, ++accelIt) + { + Simd4f current = *curIt; + Simd4f previous = *prevIt; + // if(current.w == 0) current.w = previous.w + current = 
select(current > sMinusFloatMaxXYZ, current, previous); + Simd4f finiteMass = splat<3>(previous) > sFloatMaxW; + // curMatrix*current + prevMatrix*previous + accel + Simd4f delta = cloth::transform(curMatrix, cloth::transform(prevMatrix, *accelIt, previous), current); + *curIt = current + (delta & finiteMass); + *prevIt = select(sMaskW, previous, current) + (prevBias & finiteMass); + } +} + +template <typename Simd4f, typename ConstraintIterator> +void constrainMotion(Simd4f* __restrict curIt, const Simd4f* __restrict curEnd, const ConstraintIterator& spheres, + Simd4f scaleBiasStiffness) +{ + Simd4f scale = splat<0>(scaleBiasStiffness); + Simd4f bias = splat<1>(scaleBiasStiffness); + Simd4f stiffness = splat<3>(scaleBiasStiffness); + + // local copy of iterator to maintain alignment + ConstraintIterator sphIt = spheres; + + for(; curIt < curEnd; curIt += 4) + { + // todo: use msub where available + Simd4f curPos0 = curIt[0]; + Simd4f curPos1 = curIt[1]; + Simd4f curPos2 = curIt[2]; + Simd4f curPos3 = curIt[3]; + + Simd4f delta0 = *sphIt - (sMaskXYZ & curPos0); + ++sphIt; + Simd4f delta1 = *sphIt - (sMaskXYZ & curPos1); + ++sphIt; + Simd4f delta2 = *sphIt - (sMaskXYZ & curPos2); + ++sphIt; + Simd4f delta3 = *sphIt - (sMaskXYZ & curPos3); + ++sphIt; + + Simd4f deltaX = delta0, deltaY = delta1, deltaZ = delta2, deltaW = delta3; + transpose(deltaX, deltaY, deltaZ, deltaW); + + Simd4f sqrLength = sEpsilon + deltaX * deltaX + deltaY * deltaY + deltaZ * deltaZ; + Simd4f radius = max(simd4f(_0), deltaW * scale + bias); + + Simd4f slack = simd4f(_1) - radius * rsqrt(sqrLength); + + // if slack <= 0.0f then we don't want to affect particle + // and can skip if all particles are unaffected + Simd4f isPositive; + if(anyGreater(slack, simd4f(_0), isPositive)) + { + // set invMass to zero if radius is zero + curPos0 = curPos0 & (splat<0>(radius) > sMinusFloatMaxXYZ); + curPos1 = curPos1 & (splat<1>(radius) > sMinusFloatMaxXYZ); + curPos2 = curPos2 & (splat<2>(radius) > 
sMinusFloatMaxXYZ); + curPos3 = curPos3 & ((radius) > sMinusFloatMaxXYZ); + + slack = slack * stiffness & isPositive; + + curIt[0] = curPos0 + (delta0 & sMaskXYZ) * splat<0>(slack); + curIt[1] = curPos1 + (delta1 & sMaskXYZ) * splat<1>(slack); + curIt[2] = curPos2 + (delta2 & sMaskXYZ) * splat<2>(slack); + curIt[3] = curPos3 + (delta3 & sMaskXYZ) * splat<3>(slack); + } + } +} + +template <typename Simd4f, typename ConstraintIterator> +void constrainSeparation(Simd4f* __restrict curIt, const Simd4f* __restrict curEnd, const ConstraintIterator& spheres) +{ + // local copy of iterator to maintain alignment + ConstraintIterator sphIt = spheres; + + for(; curIt < curEnd; curIt += 4) + { + // todo: use msub where available + Simd4f curPos0 = curIt[0]; + Simd4f curPos1 = curIt[1]; + Simd4f curPos2 = curIt[2]; + Simd4f curPos3 = curIt[3]; + + Simd4f delta0 = *sphIt - (sMaskXYZ & curPos0); + ++sphIt; + Simd4f delta1 = *sphIt - (sMaskXYZ & curPos1); + ++sphIt; + Simd4f delta2 = *sphIt - (sMaskXYZ & curPos2); + ++sphIt; + Simd4f delta3 = *sphIt - (sMaskXYZ & curPos3); + ++sphIt; + + Simd4f deltaX = delta0, deltaY = delta1, deltaZ = delta2, deltaW = delta3; + transpose(deltaX, deltaY, deltaZ, deltaW); + + Simd4f sqrLength = sEpsilon + deltaX * deltaX + deltaY * deltaY + deltaZ * deltaZ; + + Simd4f slack = simd4f(_1) - deltaW * rsqrtT<1>(sqrLength); + + // if slack >= 0.0f then we don't want to affect particle + // and can skip if all particles are unaffected + Simd4f isNegative; + if(anyGreater(simd4f(_0), slack, isNegative)) + { + slack = slack & isNegative; + + curIt[0] = curPos0 + (delta0 & sMaskXYZ) * splat<0>(slack); + curIt[1] = curPos1 + (delta1 & sMaskXYZ) * splat<1>(slack); + curIt[2] = curPos2 + (delta2 & sMaskXYZ) * splat<2>(slack); + curIt[3] = curPos3 + (delta3 & sMaskXYZ) * splat<3>(slack); + } + } +} + +/** + traditional gauss-seidel internal constraint solver + */ +template <bool useMultiplier, typename Simd4f> +void solveConstraints(float* __restrict posIt, 
const float* __restrict rIt, const float* __restrict rEnd, + const uint16_t* __restrict iIt, Simd4f stiffness) +{ + Simd4f stretchLimit, compressionLimit, multiplier; + if(useMultiplier) + { + stretchLimit = splat<3>(stiffness); + compressionLimit = splat<2>(stiffness); + multiplier = splat<1>(stiffness); + } + stiffness = splat<0>(stiffness); + + for(; rIt != rEnd; rIt += 4, iIt += 8) + { + uint32_t p0i = iIt[0] * sizeof(PxVec4); + uint32_t p0j = iIt[1] * sizeof(PxVec4); + uint32_t p1i = iIt[2] * sizeof(PxVec4); + uint32_t p1j = iIt[3] * sizeof(PxVec4); + uint32_t p2i = iIt[4] * sizeof(PxVec4); + uint32_t p2j = iIt[5] * sizeof(PxVec4); + uint32_t p3i = iIt[6] * sizeof(PxVec4); + uint32_t p3j = iIt[7] * sizeof(PxVec4); + + Simd4f v0i = loadAligned(posIt, p0i); + Simd4f v0j = loadAligned(posIt, p0j); + Simd4f v1i = loadAligned(posIt, p1i); + Simd4f v1j = loadAligned(posIt, p1j); + Simd4f v2i = loadAligned(posIt, p2i); + Simd4f v2j = loadAligned(posIt, p2j); + Simd4f v3i = loadAligned(posIt, p3i); + Simd4f v3j = loadAligned(posIt, p3j); + + Simd4f h0ij = v0j + v0i * sMinusOneXYZOneW; + Simd4f h1ij = v1j + v1i * sMinusOneXYZOneW; + Simd4f h2ij = v2j + v2i * sMinusOneXYZOneW; + Simd4f h3ij = v3j + v3i * sMinusOneXYZOneW; + + Simd4f hxij = h0ij, hyij = h1ij, hzij = h2ij, vwij = h3ij; + transpose(hxij, hyij, hzij, vwij); + + Simd4f rij = loadAligned(rIt); + Simd4f e2ij = sEpsilon + hxij * hxij + hyij * hyij + hzij * hzij; + Simd4f erij = (simd4f(_1) - rij * rsqrt(e2ij)) & (rij > sEpsilon); // add parentheses for wiiu + + if(useMultiplier) + { + erij = erij - multiplier * max(compressionLimit, min(erij, stretchLimit)); + } + Simd4f exij = erij * stiffness * recip(sEpsilon + vwij); + + h0ij = h0ij * splat<0>(exij) & sMaskXYZ; + h1ij = h1ij * splat<1>(exij) & sMaskXYZ; + h2ij = h2ij * splat<2>(exij) & sMaskXYZ; + h3ij = h3ij * splat<3>(exij) & sMaskXYZ; + + storeAligned(posIt, p0i, v0i + h0ij * splat<3>(v0i)); + storeAligned(posIt, p0j, v0j - h0ij * splat<3>(v0j)); + 
storeAligned(posIt, p1i, v1i + h1ij * splat<3>(v1i)); + storeAligned(posIt, p1j, v1j - h1ij * splat<3>(v1j)); + storeAligned(posIt, p2i, v2i + h2ij * splat<3>(v2i)); + storeAligned(posIt, p2j, v2j - h2ij * splat<3>(v2j)); + storeAligned(posIt, p3i, v3i + h3ij * splat<3>(v3i)); + storeAligned(posIt, p3j, v3j - h3ij * splat<3>(v3j)); + } +} + +#if PX_WINDOWS_FAMILY +#include "sse2/SwSolveConstraints.h" +#endif + +// calculates upper bound of all position deltas +template <typename Simd4f> +Simd4f calculateMaxDelta(const Simd4f* prevIt, const Simd4f* curIt, const Simd4f* curEnd) +{ + Simd4f maxDelta(simd4f(_0)); + for(; curIt < curEnd; ++curIt, ++prevIt) + maxDelta = max(maxDelta, abs(*curIt - *prevIt)); + + return maxDelta & sMaskXYZ; +} + +} // anonymous namespace + +template <typename Simd4f> +cloth::SwSolverKernel<Simd4f>::SwSolverKernel(SwCloth const& cloth, SwClothData& clothData, SwKernelAllocator& allocator, + IterationStateFactory& factory, profile::PxProfileZone* profiler) +: mCloth(cloth) +, mClothData(clothData) +, mAllocator(allocator) +, mCollision(clothData, allocator, profiler) +, mSelfCollision(clothData, allocator) +, mState(factory.create<Simd4f>(cloth)) +, mProfiler(profiler) +{ + mClothData.verify(); +} + +template <typename Simd4f> +void cloth::SwSolverKernel<Simd4f>::operator()() +{ + simulateCloth(); +} + +template <typename Simd4f> +size_t cloth::SwSolverKernel<Simd4f>::estimateTemporaryMemory(const SwCloth& cloth) +{ + size_t collisionTempMemory = SwCollision<Simd4f>::estimateTemporaryMemory(cloth); + size_t selfCollisionTempMemory = SwSelfCollision<Simd4f>::estimateTemporaryMemory(cloth); + + size_t tempMemory = PxMax(collisionTempMemory, selfCollisionTempMemory); + size_t persistentMemory = SwCollision<Simd4f>::estimatePersistentMemory(cloth); + + // account for any allocator overhead (this could be exposed in the allocator) + size_t maxAllocs = 32; + size_t maxPerAllocationOverhead = 32; + size_t maxAllocatorOverhead = maxAllocs * 
maxPerAllocationOverhead; + + return maxAllocatorOverhead + persistentMemory + tempMemory; +} + +template <typename Simd4f> +template <typename AccelerationIterator> +void cloth::SwSolverKernel<Simd4f>::integrateParticles(AccelerationIterator& accelIt, const Simd4f& prevBias) +{ + Simd4f* curIt = reinterpret_cast<Simd4f*>(mClothData.mCurParticles); + Simd4f* curEnd = curIt + mClothData.mNumParticles; + Simd4f* prevIt = reinterpret_cast<Simd4f*>(mClothData.mPrevParticles); + + if(!mState.mIsTurning) + ::integrateParticles(curIt, curEnd, prevIt, mState.mPrevMatrix[0], accelIt, prevBias); + else + ::integrateParticles(curIt, curEnd, prevIt, mState.mPrevMatrix, mState.mCurMatrix, accelIt, prevBias); +} + +template <typename Simd4f> +void cloth::SwSolverKernel<Simd4f>::integrateParticles() +{ + ProfileZone zone("cloth::SwSolverKernel::integrateParticles", mProfiler); + + const Simd4f* startAccelIt = reinterpret_cast<const Simd4f*>(mClothData.mParticleAccelerations); + + // dt^2 (todo: should this be the smoothed dt used for gravity?) 
+ const Simd4f sqrIterDt = simd4f(sqr(mState.mIterDt)) & (Simd4f)sMaskXYZ; + + if(!startAccelIt) + { + // no per-particle accelerations, use a constant + ConstantIterator<Simd4f> accelIt(mState.mCurBias); + integrateParticles(accelIt, mState.mPrevBias); + } + else + { + // iterator implicitly scales by dt^2 and adds gravity + ScaleBiasIterator<Simd4f, const Simd4f*> accelIt(startAccelIt, sqrIterDt, mState.mCurBias); + integrateParticles(accelIt, mState.mPrevBias); + } + + zone.setValue(mState.mIsTurning); +} + +template <typename Simd4f> +void cloth::SwSolverKernel<Simd4f>::constrainTether() +{ + if(0.0f == mClothData.mTetherConstraintStiffness || !mClothData.mNumTethers) + return; + +#if PX_PROFILE + ProfileZone zone("cloth::SwSolverKernel::solveTethers", mProfiler); +#endif + + uint32_t numParticles = mClothData.mNumParticles; + uint32_t numTethers = mClothData.mNumTethers; + PX_ASSERT(0 == numTethers % numParticles); + + float* __restrict curIt = mClothData.mCurParticles; + const float* __restrict curFirst = curIt; + const float* __restrict curEnd = curIt + 4 * numParticles; + + typedef const SwTether* __restrict TetherIter; + TetherIter tFirst = mClothData.mTethers; + TetherIter tEnd = tFirst + numTethers; + + Simd4f stiffness = (Simd4f)sMaskXYZ & simd4f(numParticles * mClothData.mTetherConstraintStiffness / numTethers); + Simd4f scale = simd4f(mClothData.mTetherConstraintScale); + + for(; curIt != curEnd; curIt += 4, ++tFirst) + { + Simd4f position = loadAligned(curIt); + Simd4f offset = simd4f(_0); + + for(TetherIter tIt = tFirst; tIt < tEnd; tIt += numParticles) + { + PX_ASSERT(tIt->mAnchor < numParticles); + Simd4f anchor = loadAligned(curFirst, tIt->mAnchor * sizeof(PxVec4)); + Simd4f delta = anchor - position; + Simd4f sqrLength = sEpsilon + dot3(delta, delta); + + Simd4f tetherLength = load(&tIt->mLength); + tetherLength = splat<0>(tetherLength); + + Simd4f radius = tetherLength * scale; + Simd4f slack = simd4f(_1) - radius * rsqrt(sqrLength); + + offset 
= offset + delta * max(slack, simd4f(_0)); + } + + storeAligned(curIt, position + offset * stiffness); + } +} + +template <typename Simd4f> +void cloth::SwSolverKernel<Simd4f>::solveFabric() +{ + ProfileZone zone("cloth::SwSolverKernel::solveFabric", mProfiler); + + float* pIt = mClothData.mCurParticles; + + const PhaseConfig* cIt = mClothData.mConfigBegin; + const PhaseConfig* cEnd = mClothData.mConfigEnd; + + const uint32_t* pBegin = mClothData.mPhases; + const float* rBegin = mClothData.mRestvalues; + + const uint32_t* sBegin = mClothData.mSets; + const uint16_t* iBegin = mClothData.mIndices; + + uint32_t totalConstraints = 0; + + Simd4f stiffnessExponent = simd4f(mCloth.mStiffnessFrequency * mState.mIterDt); + + for(; cIt != cEnd; ++cIt) + { + const uint32_t* sIt = sBegin + pBegin[cIt->mPhaseIndex]; + const float* rIt = rBegin + sIt[0]; + const float* rEnd = rBegin + sIt[1]; + const uint16_t* iIt = iBegin + sIt[0] * 2; + + totalConstraints += uint32_t(rEnd - rIt); + + // (stiffness, multiplier, compressionLimit, stretchLimit) + Simd4f config = load(&cIt->mStiffness); + // stiffness specified as fraction of constraint error per-millisecond + Simd4f scaledConfig = simd4f(_1) - simdf::exp2(config * stiffnessExponent); + Simd4f stiffness = select(sMaskXY, scaledConfig, config); + + int neutralMultiplier = allEqual(sMaskYZW & stiffness, simd4f(_0)); + +#if PX_AVX + switch(sAvxSupport) + { + case 2: +#if _MSC_VER >= 1700 + neutralMultiplier ? avx::solveConstraints<false, 2>(pIt, rIt, rEnd, iIt, stiffness) + : avx::solveConstraints<true, 2>(pIt, rIt, rEnd, iIt, stiffness); + break; +#endif + case 1: + neutralMultiplier ? avx::solveConstraints<false, 1>(pIt, rIt, rEnd, iIt, stiffness) + : avx::solveConstraints<true, 1>(pIt, rIt, rEnd, iIt, stiffness); + break; + default: +#endif + neutralMultiplier ? 
solveConstraints<false>(pIt, rIt, rEnd, iIt, stiffness) + : solveConstraints<true>(pIt, rIt, rEnd, iIt, stiffness); +#if PX_AVX + break; + } +#endif + } + + zone.setValue(totalConstraints); +} + +template <typename Simd4f> +void cloth::SwSolverKernel<Simd4f>::constrainMotion() +{ + if(!mClothData.mStartMotionConstraints) + return; + +#if PX_PROFILE + ProfileZone zone("cloth::SwSolverKernel::constrainMotion", mProfiler); +#endif + + Simd4f* curIt = reinterpret_cast<Simd4f*>(mClothData.mCurParticles); + Simd4f* curEnd = curIt + mClothData.mNumParticles; + + const Simd4f* startIt = reinterpret_cast<const Simd4f*>(mClothData.mStartMotionConstraints); + const Simd4f* targetIt = reinterpret_cast<const Simd4f*>(mClothData.mTargetMotionConstraints); + + Simd4f scaleBias = load(&mCloth.mMotionConstraintScale); + Simd4f stiffness = simd4f(mClothData.mMotionConstraintStiffness); + Simd4f scaleBiasStiffness = select(sMaskXYZ, scaleBias, stiffness); + + if(!mClothData.mTargetMotionConstraints) + // no interpolation, use the start positions + return ::constrainMotion(curIt, curEnd, startIt, scaleBiasStiffness); + + if(mState.mRemainingIterations == 1) + // use the target positions on last iteration + return ::constrainMotion(curIt, curEnd, targetIt, scaleBiasStiffness); + + // otherwise use an interpolating iterator + LerpIterator<Simd4f, const Simd4f*> interpolator(startIt, targetIt, mState.getCurrentAlpha()); + ::constrainMotion(curIt, curEnd, interpolator, scaleBiasStiffness); +} + +template <typename Simd4f> +void cloth::SwSolverKernel<Simd4f>::constrainSeparation() +{ + if(!mClothData.mStartSeparationConstraints) + return; + +#if PX_PROFILE + ProfileZone zone("cloth::SwSolverKernel::constrainSeparation", mProfiler); +#endif + + Simd4f* curIt = reinterpret_cast<Simd4f*>(mClothData.mCurParticles); + Simd4f* curEnd = curIt + mClothData.mNumParticles; + + const Simd4f* startIt = reinterpret_cast<const Simd4f*>(mClothData.mStartSeparationConstraints); + const Simd4f* targetIt = 
reinterpret_cast<const Simd4f*>(mClothData.mTargetSeparationConstraints); + + if(!mClothData.mTargetSeparationConstraints) + // no interpolation, use the start positions + return ::constrainSeparation(curIt, curEnd, startIt); + + if(mState.mRemainingIterations == 1) + // use the target positions on last iteration + return ::constrainSeparation(curIt, curEnd, targetIt); + + // otherwise use an interpolating iterator + LerpIterator<Simd4f, const Simd4f*> interpolator(startIt, targetIt, mState.getCurrentAlpha()); + ::constrainSeparation(curIt, curEnd, interpolator); +} + +template <typename Simd4f> +void cloth::SwSolverKernel<Simd4f>::collideParticles() +{ + ProfileZone zone("cloth::SwSolverKernel::collideParticles", mProfiler); + + mCollision(mState); + + zone.setValue(mCollision.mNumCollisions); +} + +template <typename Simd4f> +void cloth::SwSolverKernel<Simd4f>::selfCollideParticles() +{ + ProfileZone zone("cloth::SwSolverKernel::selfCollideParticles", mProfiler); + + mSelfCollision(); + + zone.setValue(mSelfCollision.mNumCollisions); +} + +template <typename Simd4f> +void cloth::SwSolverKernel<Simd4f>::updateSleepState() +{ + ProfileZone zone("cloth::SwSolverKernel::updateSleepState", mProfiler); + + mClothData.mSleepTestCounter += PxMax(1u, uint32_t(mState.mIterDt * 1000)); + if(mClothData.mSleepTestCounter >= mCloth.mSleepTestInterval) + { + const Simd4f* prevIt = reinterpret_cast<Simd4f*>(mClothData.mPrevParticles); + const Simd4f* curIt = reinterpret_cast<Simd4f*>(mClothData.mCurParticles); + const Simd4f* curEnd = curIt + mClothData.mNumParticles; + + // calculate max particle delta since last iteration + Simd4f maxDelta = calculateMaxDelta(prevIt, curIt, curEnd); + + ++mClothData.mSleepPassCounter; + Simd4f threshold = simd4f(mCloth.mSleepThreshold * mState.mIterDt); + if(anyGreaterEqual(maxDelta, threshold)) + mClothData.mSleepPassCounter = 0; + + mClothData.mSleepTestCounter -= mCloth.mSleepTestInterval; + } + + 
zone.setValue(mClothData.mSleepPassCounter); +} + +template <typename Simd4f> +void cloth::SwSolverKernel<Simd4f>::iterateCloth() +{ + // note on invMass (stored in current/previous positions.w): + // integrateParticles() + // - if(current.w == 0) current.w = previous.w + // constraintMotion() + // - if(constraint.radius <= 0) current.w = 0 + // computeBounds() + // - if(current.w > 0) current.w = previous.w + // collideParticles() + // - if(collides) current.w *= 1/massScale + // after simulate() + // - previous.w: original invMass as set by user + // - current.w: zeroed by motion constraints and mass-scaled by collision + + // integrate positions + integrateParticles(); + + // motion constraints + constrainMotion(); + + // solve tether constraints + constrainTether(); + + // solve edge constraints + solveFabric(); + + // separation constraints + constrainSeparation(); + + // perform character collision + collideParticles(); + + // perform self collision + selfCollideParticles(); + + // test wake / sleep conditions + updateSleepState(); +} + +template <typename Simd4f> +void cloth::SwSolverKernel<Simd4f>::simulateCloth() +{ + while(mState.mRemainingIterations) + { + iterateCloth(); + mState.update(); + } +} + +// explicit template instantiation +#if NVMATH_SIMD +template class cloth::SwSolverKernel<Simd4f>; +#endif +#if NVMATH_SCALAR +template class cloth::SwSolverKernel<Scalar4f>; +#endif diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwSolverKernel.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwSolverKernel.h new file mode 100644 index 00000000..26b45a88 --- /dev/null +++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwSolverKernel.h @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. 
Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. + */ + +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#pragma once + +#include "IterationState.h" +#include "SwCollision.h" +#include "SwSelfCollision.h" + +namespace nvidia +{ +namespace cloth +{ + +class SwCloth; +struct SwClothData; + +template <typename Simd4f> +class SwSolverKernel +{ + public: + SwSolverKernel(SwCloth const&, SwClothData&, SwKernelAllocator&, IterationStateFactory&, nvidia::profile::PxProfileZone*); + + void operator()(); + + // returns a conservative estimate of the + // total memory requirements during a solve + static size_t estimateTemporaryMemory(const SwCloth& c); + + private: + void integrateParticles(); + void constrainTether(); + void solveFabric(); + void constrainMotion(); + void constrainSeparation(); + void collideParticles(); + void selfCollideParticles(); + void updateSleepState(); + + void iterateCloth(); + void simulateCloth(); + + SwCloth const& mCloth; + SwClothData& mClothData; + SwKernelAllocator& mAllocator; + + SwCollision<Simd4f> mCollision; + SwSelfCollision<Simd4f> mSelfCollision; + IterationState<Simd4f> mState; + + profile::PxProfileZone* mProfiler; + + private: + SwSolverKernel<Simd4f>& operator=(const SwSolverKernel<Simd4f>&); + template <typename AccelerationIterator> + void integrateParticles(AccelerationIterator& accelIt, const Simd4f&); +}; +} +} diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/TripletScheduler.cpp b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/TripletScheduler.cpp new file mode 100644 index 00000000..d077624e --- /dev/null +++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/TripletScheduler.cpp @@ -0,0 +1,229 @@ +/* + * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved. 
+ * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. + */ + +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#include "TripletScheduler.h" +#include "PxMath.h" +#include "PsFPU.h" +#include "PxMat33.h" +#include "PsVecMath.h" +#include "PsUtilities.h" + +using namespace nvidia; +using namespace physx::shdfnd::aos; + +cloth::TripletScheduler::TripletScheduler(Range<const uint32_t[4]> triplets) +: mTriplets(reinterpret_cast<const Vec4u*>(triplets.begin()), reinterpret_cast<const Vec4u*>(triplets.end())) +{ +} + +// SSE version +void cloth::TripletScheduler::simd(uint32_t numParticles, uint32_t simdWidth) +{ + if(mTriplets.empty()) + return; + + Vector<uint32_t>::Type mark(numParticles, uint32_t(-1)); + + uint32_t setIndex = 0, setSize = 0; + for(TripletIter tIt = mTriplets.begin(), tEnd = mTriplets.end(); tIt != tEnd; ++setIndex) + { + TripletIter tLast = tIt + PxMin(simdWidth, uint32_t(tEnd - tIt)); + TripletIter tSwap = tEnd; + + for(; tIt != tLast && tIt != tSwap; ++tIt, ++setSize) + { + // swap from tail until independent triplet found + while((mark[tIt->x] == setIndex || mark[tIt->y] == setIndex || mark[tIt->z] == setIndex) && tIt != --tSwap) + swap(*tIt, *tSwap); + + if(tIt == tSwap) + break; // no independent triplet found + + // mark vertices to be used in simdIndex + mark[tIt->x] = setIndex; + mark[tIt->y] = setIndex; + mark[tIt->z] = setIndex; + } + + if(tIt == tSwap) // remaining triplets depend on current set + { + if(setSize > simdWidth) // trim set to multiple of simdWidth + { + uint32_t overflow = setSize % simdWidth; + setSize -= overflow; + tIt -= 
overflow; + } + mSetSizes.pushBack(setSize); + setSize = 0; + } + } +} + +namespace +{ +struct TripletSet +{ + TripletSet() : mMark(0xFFFFFFFF) + { + mNumReplays[0] = mNumReplays[1] = mNumReplays[2] = 1; + memset(mNumConflicts[0], 0, 32); + memset(mNumConflicts[1], 0, 32); + memset(mNumConflicts[2], 0, 32); + } + + uint32_t mMark; // triplet index + uint8_t mNumReplays[3]; + uint8_t mNumConflicts[3][32]; +}; + +/* +struct GreaterSum +{ + typedef cloth::Vector<uint32_t>::Type Container; + + GreaterSum(const Container& cont) + : mContainer(cont) + {} + + bool operator()(const cloth::Vec4u& a, const cloth::Vec4u& b) const + { + return mContainer[a.x] + mContainer[a.y] + mContainer[a.z] + > mContainer[b.x] + mContainer[b.y] + mContainer[b.z]; + } + + const Container& mContainer; +}; +*/ + +// calculate the inclusive prefix sum, equivalent of std::partial_sum +template <typename T> +void prefixSum(const T* first, const T* last, T* dest) +{ + if(first == last) + return; + else + { + *(dest++) = *(first++); + + for(; first != last; ++first, ++dest) + *dest = *(dest - 1) + *first; + } +} +} + +// CUDA version +void cloth::TripletScheduler::warp(uint32_t numParticles, uint32_t warpWidth) +{ + // PX_ASSERT(warpWidth == 32 || warpWidth == 16); + + if(mTriplets.empty()) + return; + + TripletIter tIt, tEnd = mTriplets.end(); + uint32_t tripletIndex; + + // count number of triplets per particle + Vector<uint32_t>::Type adjacentCount(numParticles + 1, uint32_t(0)); + for(tIt = mTriplets.begin(); tIt != tEnd; ++tIt) + for(int i = 0; i < 3; ++i) + ++adjacentCount[(*tIt)[i]]; + + /* neither of those were really improving number of batches: + // run simd version to pre-sort particles + simd(numParticles, blockWidth); mSetSizes.resize(0); + // sort according to triplet degree (estimated by sum of adjacentCount) + std::sort(mTriplets.begin(), tEnd, GreaterSum(adjacentCount)); + */ + + uint32_t maxTripletCount = *maxElement(adjacentCount.begin(), adjacentCount.end()); + + // compute in 
place prefix sum (inclusive) + prefixSum(adjacentCount.begin(), adjacentCount.end(), adjacentCount.begin()); + + // initialize adjacencies (for each particle, collect touching triplets) + // also converts partial sum in adjacentCount from inclusive to exclusive + Vector<uint32_t>::Type adjacencies(adjacentCount.back()); + for(tIt = mTriplets.begin(), tripletIndex = 0; tIt != tEnd; ++tIt, ++tripletIndex) + for(int i = 0; i < 3; ++i) + adjacencies[--adjacentCount[(*tIt)[i]]] = tripletIndex; + + uint32_t warpMask = warpWidth - 1; + + uint32_t numSets = maxTripletCount; // start with minimum number of sets + Vector<TripletSet>::Type sets(numSets); + Vector<uint32_t>::Type setIndices(mTriplets.size(), uint32_t(-1)); + mSetSizes.resize(numSets); + + // color triplets (assign to sets) + Vector<uint32_t>::Type::ConstIterator aBegin = adjacencies.begin(), aIt, aEnd; + for(tIt = mTriplets.begin(), tripletIndex = 0; tIt != tEnd; ++tIt, ++tripletIndex) + { + // mark sets of adjacent triplets + for(int i = 0; i < 3; ++i) + { + uint32_t particleIndex = (*tIt)[i]; + aIt = aBegin + adjacentCount[particleIndex]; + aEnd = aBegin + adjacentCount[particleIndex + 1]; + for(uint32_t setIndex; aIt != aEnd; ++aIt) + if(numSets > (setIndex = setIndices[*aIt])) + sets[setIndex].mMark = tripletIndex; + } + + // find valid set with smallest number of bank conflicts + uint32_t bestIndex = numSets; + uint32_t minReplays = 4; + for(uint32_t setIndex = 0; setIndex < numSets && minReplays; ++setIndex) + { + const TripletSet& set = sets[setIndex]; + + if(set.mMark == tripletIndex) + continue; // triplet collision + + uint32_t numReplays = 0; + for(uint32_t i = 0; i < 3; ++i) + numReplays += set.mNumReplays[i] == set.mNumConflicts[i][warpMask & (*tIt)[i]]; + + if(minReplays > numReplays) + minReplays = numReplays, bestIndex = setIndex; + } + + // add new set if none found + if(bestIndex == numSets) + { + sets.pushBack(TripletSet()); + mSetSizes.pushBack(0); + ++numSets; + } + + // increment bank 
conflicts or reset if warp filled + TripletSet& set = sets[bestIndex]; + if(++mSetSizes[bestIndex] & warpMask) + for(uint32_t i = 0; i < 3; ++i) + set.mNumReplays[i] = PxMax(set.mNumReplays[i], ++set.mNumConflicts[i][warpMask & (*tIt)[i]]); + else + set = TripletSet(); + + setIndices[tripletIndex] = bestIndex; + } + + // reorder triplets + Vector<uint32_t>::Type setOffsets(mSetSizes.size()); + prefixSum(mSetSizes.begin(), mSetSizes.end(), setOffsets.begin()); + + Vector<Vec4u>::Type triplets(mTriplets.size()); + Vector<uint32_t>::Type::ConstIterator iIt = setIndices.begin(); + for(tIt = mTriplets.begin(), tripletIndex = 0; tIt != tEnd; ++tIt, ++iIt) + triplets[--setOffsets[*iIt]] = *tIt; + + mTriplets.swap(triplets); +} diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/TripletScheduler.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/TripletScheduler.h new file mode 100644 index 00000000..836c9784 --- /dev/null +++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/TripletScheduler.h @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. + */ + +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 
+ +#pragma once + +#include "Types.h" +#include "Range.h" +#include "Allocator.h" +#include "Vec4T.h" + +namespace nvidia +{ + +namespace cloth +{ + +struct TripletScheduler +{ + typedef Vector<Vec4u>::Type::ConstIterator ConstTripletIter; + typedef Vector<Vec4u>::Type::Iterator TripletIter; + + TripletScheduler(Range<const uint32_t[4]>); + void simd(uint32_t numParticles, uint32_t simdWidth); + void warp(uint32_t numParticles, uint32_t warpWidth); + + Vector<Vec4u>::Type mTriplets; + Vector<uint32_t>::Type mSetSizes; +}; +} +} diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/Vec4T.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/Vec4T.h new file mode 100644 index 00000000..c82b9629 --- /dev/null +++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/Vec4T.h @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. + */ + +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 
+ +#pragma once + +#include "Types.h" + +namespace nvidia +{ + +namespace cloth +{ + +template <typename T> +struct Vec4T +{ + Vec4T() + { + } + + Vec4T(T a, T b, T c, T d) : x(a), y(b), z(c), w(d) + { + } + + template <typename S> + Vec4T(const Vec4T<S>& other) + { + x = T(other.x); + y = T(other.y); + z = T(other.z); + w = T(other.w); + } + + template <typename Index> + T& operator[](Index i) + { + return reinterpret_cast<T*>(this)[i]; + } + + template <typename Index> + const T& operator[](Index i) const + { + return reinterpret_cast<const T*>(this)[i]; + } + + T x, y, z, w; +}; + +template <typename T> +Vec4T<T> operator*(const Vec4T<T>& vec, T scalar) +{ + return Vec4T<T>(vec.x * scalar, vec.y * scalar, vec.z * scalar, vec.w * scalar); +} + +template <typename T> +Vec4T<T> operator/(const Vec4T<T>& vec, T scalar) +{ + return Vec4T<T>(vec.x / scalar, vec.y / scalar, vec.z / scalar, vec.w / scalar); +} + +template <typename T> +T (&array(Vec4T<T>& vec))[4] +{ + return reinterpret_cast<T(&)[4]>(vec); +} + +template <typename T> +const T (&array(const Vec4T<T>& vec))[4] +{ + return reinterpret_cast<const T(&)[4]>(vec); +} + +typedef Vec4T<uint32_t> Vec4u; +typedef Vec4T<uint16_t> Vec4us; + +} // namespace cloth + +} // namespace nvidia diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/avx/SwSolveConstraints.cpp b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/avx/SwSolveConstraints.cpp new file mode 100644 index 00000000..b9a6ab35 --- /dev/null +++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/avx/SwSolveConstraints.cpp @@ -0,0 +1,916 @@ +/* + * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. 
Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. + */ + +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#pragma warning(push) +#pragma warning(disable : 4668) //'symbol' is not defined as a preprocessor macro, replacing with '0' for 'directives' +#pragma warning(disable : 4987) // nonstandard extension used: 'throw (...)' +#include <intrin.h> +#pragma warning(pop) + +#pragma warning(disable : 4127) // conditional expression is constant + +typedef unsigned __int16 uint16_t; +typedef unsigned __int32 uint32_t; + +namespace avx +{ +__m128 sMaskYZW; +__m256 sOne, sEpsilon, sMinusOneXYZOneW, sMaskXY; + +void initialize() +{ + sMaskYZW = _mm_castsi128_ps(_mm_setr_epi32(0, ~0, ~0, ~0)); + sOne = _mm256_set1_ps(1.0f); + sEpsilon = _mm256_set1_ps(1.192092896e-07f); + sMinusOneXYZOneW = _mm256_setr_ps(-1.0f, -1.0f, -1.0f, 1.0f, -1.0f, -1.0f, -1.0f, 1.0f); + sMaskXY = _mm256_castsi256_ps(_mm256_setr_epi32(~0, ~0, 0, 0, ~0, ~0, 0, 0)); +} + +template <uint32_t> +__m256 fmadd_ps(__m256 a, __m256 b, __m256 c) +{ + return _mm256_add_ps(_mm256_mul_ps(a, b), c); +} +template <uint32_t> +__m256 fnmadd_ps(__m256 a, __m256 b, __m256 c) +{ + return _mm256_sub_ps(c, _mm256_mul_ps(a, b)); +} +#if _MSC_VER >= 1700 +template <> +__m256 fmadd_ps<2>(__m256 a, __m256 b, __m256 c) +{ + return _mm256_fmadd_ps(a, b, c); +} +template <> +__m256 fnmadd_ps<2>(__m256 a, __m256 b, __m256 c) +{ + return _mm256_fnmadd_ps(a, b, c); +} +#endif + +// roughly same perf as SSE2 intrinsics, the asm version below is about 10% faster +template <bool useMultiplier, uint32_t avx> +void solveConstraints(float* __restrict posIt, const float* __restrict rIt, const float* __restrict rEnd, + const uint16_t* __restrict iIt, const __m128& stiffnessRef) +{ + __m256 stiffness, stretchLimit, 
compressionLimit, multiplier; + + if(useMultiplier) + { + stiffness = _mm256_broadcast_ps(&stiffnessRef); + stretchLimit = _mm256_permute_ps(stiffness, 0xff); + compressionLimit = _mm256_permute_ps(stiffness, 0xaa); + multiplier = _mm256_permute_ps(stiffness, 0x55); + stiffness = _mm256_permute_ps(stiffness, 0x00); + } + else + { + stiffness = _mm256_broadcast_ss((const float*)&stiffnessRef); + } + + for(; rIt < rEnd; rIt += 8, iIt += 16) + { + float* p0i = posIt + iIt[0] * 4; + float* p4i = posIt + iIt[8] * 4; + float* p0j = posIt + iIt[1] * 4; + float* p4j = posIt + iIt[9] * 4; + float* p1i = posIt + iIt[2] * 4; + float* p5i = posIt + iIt[10] * 4; + float* p1j = posIt + iIt[3] * 4; + float* p5j = posIt + iIt[11] * 4; + + __m128 v0i = _mm_load_ps(p0i); + __m128 v4i = _mm_load_ps(p4i); + __m128 v0j = _mm_load_ps(p0j); + __m128 v4j = _mm_load_ps(p4j); + __m128 v1i = _mm_load_ps(p1i); + __m128 v5i = _mm_load_ps(p5i); + __m128 v1j = _mm_load_ps(p1j); + __m128 v5j = _mm_load_ps(p5j); + + __m256 v04i = _mm256_insertf128_ps(_mm256_castps128_ps256(v0i), v4i, 1); + __m256 v04j = _mm256_insertf128_ps(_mm256_castps128_ps256(v0j), v4j, 1); + __m256 v15i = _mm256_insertf128_ps(_mm256_castps128_ps256(v1i), v5i, 1); + __m256 v15j = _mm256_insertf128_ps(_mm256_castps128_ps256(v1j), v5j, 1); + + __m256 h04ij = fmadd_ps<avx>(sMinusOneXYZOneW, v04i, v04j); + __m256 h15ij = fmadd_ps<avx>(sMinusOneXYZOneW, v15i, v15j); + + float* p2i = posIt + iIt[4] * 4; + float* p6i = posIt + iIt[12] * 4; + float* p2j = posIt + iIt[5] * 4; + float* p6j = posIt + iIt[13] * 4; + float* p3i = posIt + iIt[6] * 4; + float* p7i = posIt + iIt[14] * 4; + float* p3j = posIt + iIt[7] * 4; + float* p7j = posIt + iIt[15] * 4; + + __m128 v2i = _mm_load_ps(p2i); + __m128 v6i = _mm_load_ps(p6i); + __m128 v2j = _mm_load_ps(p2j); + __m128 v6j = _mm_load_ps(p6j); + __m128 v3i = _mm_load_ps(p3i); + __m128 v7i = _mm_load_ps(p7i); + __m128 v3j = _mm_load_ps(p3j); + __m128 v7j = _mm_load_ps(p7j); + + __m256 v26i = 
_mm256_insertf128_ps(_mm256_castps128_ps256(v2i), v6i, 1); + __m256 v26j = _mm256_insertf128_ps(_mm256_castps128_ps256(v2j), v6j, 1); + __m256 v37i = _mm256_insertf128_ps(_mm256_castps128_ps256(v3i), v7i, 1); + __m256 v37j = _mm256_insertf128_ps(_mm256_castps128_ps256(v3j), v7j, 1); + + __m256 h26ij = fmadd_ps<avx>(sMinusOneXYZOneW, v26i, v26j); + __m256 h37ij = fmadd_ps<avx>(sMinusOneXYZOneW, v37i, v37j); + + __m256 a = _mm256_unpacklo_ps(h04ij, h26ij); + __m256 b = _mm256_unpackhi_ps(h04ij, h26ij); + __m256 c = _mm256_unpacklo_ps(h15ij, h37ij); + __m256 d = _mm256_unpackhi_ps(h15ij, h37ij); + + __m256 hxij = _mm256_unpacklo_ps(a, c); + __m256 hyij = _mm256_unpackhi_ps(a, c); + __m256 hzij = _mm256_unpacklo_ps(b, d); + __m256 vwij = _mm256_unpackhi_ps(b, d); + + __m256 e2ij = fmadd_ps<avx>(hxij, hxij, fmadd_ps<avx>(hyij, hyij, fmadd_ps<avx>(hzij, hzij, sEpsilon))); + + __m256 rij = _mm256_load_ps(rIt); + __m256 mask = _mm256_cmp_ps(rij, sEpsilon, _CMP_GT_OQ); + __m256 erij = _mm256_and_ps(fnmadd_ps<avx>(rij, _mm256_rsqrt_ps(e2ij), sOne), mask); + + if(useMultiplier) + { + erij = fnmadd_ps<avx>(multiplier, _mm256_max_ps(compressionLimit, _mm256_min_ps(erij, stretchLimit)), erij); + } + + __m256 exij = _mm256_mul_ps(erij, _mm256_mul_ps(stiffness, _mm256_rcp_ps(_mm256_add_ps(sEpsilon, vwij)))); + + // replace these two instructions with _mm_maskstore_ps below? 
+ __m256 exlo = _mm256_and_ps(sMaskXY, exij); + __m256 exhi = _mm256_andnot_ps(sMaskXY, exij); + + __m256 f04ij = _mm256_mul_ps(h04ij, _mm256_permute_ps(exlo, 0xc0)); + __m256 u04i = fmadd_ps<avx>(f04ij, _mm256_permute_ps(v04i, 0xff), v04i); + __m256 u04j = fnmadd_ps<avx>(f04ij, _mm256_permute_ps(v04j, 0xff), v04j); + + _mm_store_ps(p0i, _mm256_extractf128_ps(u04i, 0)); + _mm_store_ps(p0j, _mm256_extractf128_ps(u04j, 0)); + _mm_store_ps(p4i, _mm256_extractf128_ps(u04i, 1)); + _mm_store_ps(p4j, _mm256_extractf128_ps(u04j, 1)); + + __m256 f15ij = _mm256_mul_ps(h15ij, _mm256_permute_ps(exlo, 0xd5)); + __m256 u15i = fmadd_ps<avx>(f15ij, _mm256_permute_ps(v15i, 0xff), v15i); + __m256 u15j = fnmadd_ps<avx>(f15ij, _mm256_permute_ps(v15j, 0xff), v15j); + + _mm_store_ps(p1i, _mm256_extractf128_ps(u15i, 0)); + _mm_store_ps(p1j, _mm256_extractf128_ps(u15j, 0)); + _mm_store_ps(p5i, _mm256_extractf128_ps(u15i, 1)); + _mm_store_ps(p5j, _mm256_extractf128_ps(u15j, 1)); + + __m256 f26ij = _mm256_mul_ps(h26ij, _mm256_permute_ps(exhi, 0x2a)); + __m256 u26i = fmadd_ps<avx>(f26ij, _mm256_permute_ps(v26i, 0xff), v26i); + __m256 u26j = fnmadd_ps<avx>(f26ij, _mm256_permute_ps(v26j, 0xff), v26j); + + _mm_store_ps(p2i, _mm256_extractf128_ps(u26i, 0)); + _mm_store_ps(p2j, _mm256_extractf128_ps(u26j, 0)); + _mm_store_ps(p6i, _mm256_extractf128_ps(u26i, 1)); + _mm_store_ps(p6j, _mm256_extractf128_ps(u26j, 1)); + + __m256 f37ij = _mm256_mul_ps(h37ij, _mm256_permute_ps(exhi, 0x3f)); + __m256 u37i = fmadd_ps<avx>(f37ij, _mm256_permute_ps(v37i, 0xff), v37i); + __m256 u37j = fnmadd_ps<avx>(f37ij, _mm256_permute_ps(v37j, 0xff), v37j); + + _mm_store_ps(p3i, _mm256_extractf128_ps(u37i, 0)); + _mm_store_ps(p3j, _mm256_extractf128_ps(u37j, 0)); + _mm_store_ps(p7i, _mm256_extractf128_ps(u37i, 1)); + _mm_store_ps(p7j, _mm256_extractf128_ps(u37j, 1)); + } + + _mm256_zeroupper(); +} + +#ifdef _M_IX86 + +// clang-format:disable + +/* full template specializations of above functions in assembler */ + +// AVX 
without useMultiplier +template <> +void solveConstraints<false, 1>(float* __restrict posIt, const float* __restrict rIt, + const float* __restrict rEnd, const uint16_t* __restrict iIt, const __m128& stiffnessRef) +{ + __m256 stiffness = _mm256_broadcast_ss((const float*)&stiffnessRef); + + __m256 vtmp[8], htmp[4]; + float* ptmp[16]; + + __asm + { + mov edx, rIt + mov esi, rEnd + + cmp edx, esi + jae forEnd + + mov eax, iIt + mov ecx, posIt + +forBegin: + movzx edi, WORD PTR [eax ] __asm shl edi, 4 __asm mov [ptmp ], edi __asm vmovaps xmm0, XMMWORD PTR [edi + ecx] // v0i + movzx edi, WORD PTR [eax+16] __asm shl edi, 4 __asm mov [ptmp+ 4], edi __asm vmovaps xmm1, XMMWORD PTR [edi + ecx] // v4i + movzx edi, WORD PTR [eax+ 2] __asm shl edi, 4 __asm mov [ptmp+ 8], edi __asm vmovaps xmm2, XMMWORD PTR [edi + ecx] // v0j + movzx edi, WORD PTR [eax+18] __asm shl edi, 4 __asm mov [ptmp+12], edi __asm vmovaps xmm3, XMMWORD PTR [edi + ecx] // v4j + movzx edi, WORD PTR [eax+ 4] __asm shl edi, 4 __asm mov [ptmp+16], edi __asm vmovaps xmm4, XMMWORD PTR [edi + ecx] // v1i + movzx edi, WORD PTR [eax+20] __asm shl edi, 4 __asm mov [ptmp+20], edi __asm vmovaps xmm5, XMMWORD PTR [edi + ecx] // v5i + movzx edi, WORD PTR [eax+ 6] __asm shl edi, 4 __asm mov [ptmp+24], edi __asm vmovaps xmm6, XMMWORD PTR [edi + ecx] // v1j + movzx edi, WORD PTR [eax+22] __asm shl edi, 4 __asm mov [ptmp+28], edi __asm vmovaps xmm7, XMMWORD PTR [edi + ecx] // v5j + + vinsertf128 ymm0, ymm0, xmm1, 1 __asm vmovaps YMMWORD PTR [vtmp ], ymm0 // v04i + vinsertf128 ymm2, ymm2, xmm3, 1 __asm vmovaps YMMWORD PTR [vtmp+ 32], ymm2 // v04j + vinsertf128 ymm4, ymm4, xmm5, 1 __asm vmovaps YMMWORD PTR [vtmp+ 64], ymm4 // v15i + vinsertf128 ymm6, ymm6, xmm7, 1 __asm vmovaps YMMWORD PTR [vtmp+ 96], ymm6 // v15j + + vmovaps ymm7, sMinusOneXYZOneW + vmulps ymm2, ymm2, ymm7 __asm vaddps ymm0, ymm0, ymm2 __asm vmovaps YMMWORD PTR [htmp ], ymm0 // h04ij + vmulps ymm6, ymm6, ymm7 __asm vaddps ymm4, ymm4, ymm6 __asm vmovaps 
YMMWORD PTR [htmp+32], ymm4 // h15ij + + movzx edi, WORD PTR [eax+ 8] __asm shl edi, 4 __asm mov [ptmp+32], edi __asm vmovaps xmm0, XMMWORD PTR [edi + ecx] // v2i + movzx edi, WORD PTR [eax+24] __asm shl edi, 4 __asm mov [ptmp+36], edi __asm vmovaps xmm1, XMMWORD PTR [edi + ecx] // v6i + movzx edi, WORD PTR [eax+10] __asm shl edi, 4 __asm mov [ptmp+40], edi __asm vmovaps xmm2, XMMWORD PTR [edi + ecx] // v2j + movzx edi, WORD PTR [eax+26] __asm shl edi, 4 __asm mov [ptmp+44], edi __asm vmovaps xmm3, XMMWORD PTR [edi + ecx] // v6j + movzx edi, WORD PTR [eax+12] __asm shl edi, 4 __asm mov [ptmp+48], edi __asm vmovaps xmm4, XMMWORD PTR [edi + ecx] // v3i + movzx edi, WORD PTR [eax+28] __asm shl edi, 4 __asm mov [ptmp+52], edi __asm vmovaps xmm5, XMMWORD PTR [edi + ecx] // v7i + movzx edi, WORD PTR [eax+14] __asm shl edi, 4 __asm mov [ptmp+56], edi __asm vmovaps xmm6, XMMWORD PTR [edi + ecx] // v3j + movzx edi, WORD PTR [eax+30] __asm shl edi, 4 __asm mov [ptmp+60], edi __asm vmovaps xmm7, XMMWORD PTR [edi + ecx] // v7j + + vinsertf128 ymm0, ymm0, xmm1, 1 __asm vmovaps YMMWORD PTR [vtmp+128], ymm0 // v26i + vinsertf128 ymm2, ymm2, xmm3, 1 __asm vmovaps YMMWORD PTR [vtmp+160], ymm2 // v26j + vinsertf128 ymm4, ymm4, xmm5, 1 __asm vmovaps YMMWORD PTR [vtmp+192], ymm4 // v37i + vinsertf128 ymm6, ymm6, xmm7, 1 __asm vmovaps YMMWORD PTR [vtmp+224], ymm6 // v37j + + vmovaps ymm7, sMinusOneXYZOneW + vmulps ymm2, ymm2, ymm7 __asm vaddps ymm2, ymm0, ymm2 __asm vmovaps YMMWORD PTR [htmp+64], ymm2 // h26ij + vmulps ymm6, ymm6, ymm7 __asm vaddps ymm6, ymm4, ymm6 __asm vmovaps YMMWORD PTR [htmp+96], ymm6 // h37ij + + vmovaps ymm0, YMMWORD PTR [htmp ] // h04ij + vmovaps ymm4, YMMWORD PTR [htmp+32] // h15ij + + vunpcklps ymm1, ymm0, ymm2 // a + vunpckhps ymm3, ymm0, ymm2 // b + vunpcklps ymm5, ymm4, ymm6 // c + vunpckhps ymm7, ymm4, ymm6 // d + + vunpcklps ymm0, ymm1, ymm5 // hxij + vunpckhps ymm2, ymm1, ymm5 // hyij + vunpcklps ymm4, ymm3, ymm7 // hzij + vunpckhps ymm6, ymm3, ymm7 // 
vwij + + vmovaps ymm7, sEpsilon + vmovaps ymm5, sOne + vmovaps ymm3, stiffness + vmovaps ymm1, YMMWORD PTR [edx] // rij + + vmulps ymm0, ymm0, ymm0 __asm vaddps ymm0, ymm0, ymm7 // e2ij + vmulps ymm2, ymm2, ymm2 __asm vaddps ymm0, ymm0, ymm2 + vmulps ymm4, ymm4, ymm4 __asm vaddps ymm0, ymm0, ymm4 + + vcmpgt_oqps ymm2, ymm1, ymm7 // mask + vrsqrtps ymm0, ymm0 __asm vmulps ymm0, ymm0, ymm1 // erij + vsubps ymm5, ymm5, ymm0 __asm vandps ymm5, ymm5, ymm2 + vaddps ymm6, ymm6, ymm7 __asm vrcpps ymm6, ymm6 + + vmulps ymm6, ymm6, ymm3 __asm vmulps ymm6, ymm6, ymm5 // exij + + vmovaps ymm7, sMaskXY + vandps ymm7, ymm7, ymm6 // exlo + vxorps ymm6, ymm6, ymm7 // exhi + + vmovaps ymm4, YMMWORD PTR [htmp ] // h04ij + vmovaps ymm0, YMMWORD PTR [vtmp ] // v04i + vmovaps ymm1, YMMWORD PTR [vtmp+ 32] // v04j + + vpermilps ymm5, ymm7, 0xc0 __asm vmulps ymm4, ymm4, ymm5 // f04ij + vpermilps ymm2, ymm0, 0xff __asm vmulps ymm2, ymm2, ymm4 __asm vsubps ymm0, ymm0, ymm2 // u04i + vpermilps ymm3, ymm1, 0xff __asm vmulps ymm3, ymm3, ymm4 __asm vaddps ymm1, ymm1, ymm3 // u04j + + vextractf128 xmm2, ymm0, 1 + vextractf128 xmm3, ymm1, 1 + + mov edi, [ptmp ] __asm vmovaps XMMWORD PTR [edi + ecx], xmm0 // v0i + mov edi, [ptmp+ 8] __asm vmovaps XMMWORD PTR [edi + ecx], xmm1 // v0j + mov edi, [ptmp+ 4] __asm vmovaps XMMWORD PTR [edi + ecx], xmm2 // v4i + mov edi, [ptmp+12] __asm vmovaps XMMWORD PTR [edi + ecx], xmm3 // v4j + + vmovaps ymm4, YMMWORD PTR [htmp+ 32] // h15ij + vmovaps ymm0, YMMWORD PTR [vtmp+ 64] // v15i + vmovaps ymm1, YMMWORD PTR [vtmp+ 96] // v15j + + vpermilps ymm5, ymm7, 0xd5 __asm vmulps ymm4, ymm4, ymm5 // f15ij + vpermilps ymm2, ymm0, 0xff __asm vmulps ymm2, ymm2, ymm4 __asm vsubps ymm0, ymm0, ymm2 // u15i + vpermilps ymm3, ymm1, 0xff __asm vmulps ymm3, ymm3, ymm4 __asm vaddps ymm1, ymm1, ymm3 // u15j + + vextractf128 xmm2, ymm0, 1 + vextractf128 xmm3, ymm1, 1 + + mov edi, [ptmp+16] __asm vmovaps XMMWORD PTR [edi + ecx], xmm0 // v1i + mov edi, [ptmp+24] __asm vmovaps XMMWORD 
PTR [edi + ecx], xmm1 // v1j + mov edi, [ptmp+20] __asm vmovaps XMMWORD PTR [edi + ecx], xmm2 // v5i + mov edi, [ptmp+28] __asm vmovaps XMMWORD PTR [edi + ecx], xmm3 // v5j + + vmovaps ymm4, YMMWORD PTR [htmp+ 64] // h26ij + vmovaps ymm0, YMMWORD PTR [vtmp+128] // v26i + vmovaps ymm1, YMMWORD PTR [vtmp+160] // v26j + + vpermilps ymm5, ymm6, 0x2a __asm vmulps ymm4, ymm4, ymm5 // f26ij + vpermilps ymm2, ymm0, 0xff __asm vmulps ymm2, ymm2, ymm4 __asm vsubps ymm0, ymm0, ymm2 // u26i + vpermilps ymm3, ymm1, 0xff __asm vmulps ymm3, ymm3, ymm4 __asm vaddps ymm1, ymm1, ymm3 // u26j + + vextractf128 xmm2, ymm0, 1 + vextractf128 xmm3, ymm1, 1 + + mov edi, [ptmp+32] __asm vmovaps XMMWORD PTR [edi + ecx], xmm0 // v2i + mov edi, [ptmp+40] __asm vmovaps XMMWORD PTR [edi + ecx], xmm1 // v2j + mov edi, [ptmp+36] __asm vmovaps XMMWORD PTR [edi + ecx], xmm2 // v6i + mov edi, [ptmp+44] __asm vmovaps XMMWORD PTR [edi + ecx], xmm3 // v6j + + vmovaps ymm4, YMMWORD PTR [htmp+ 96] // h37ij + vmovaps ymm0, YMMWORD PTR [vtmp+192] // v37i + vmovaps ymm1, YMMWORD PTR [vtmp+224] // v37j + + vpermilps ymm5, ymm6, 0x3f __asm vmulps ymm4, ymm4, ymm5 // f37ij + vpermilps ymm2, ymm0, 0xff __asm vmulps ymm2, ymm2, ymm4 __asm vsubps ymm0, ymm0, ymm2 // u37i + vpermilps ymm3, ymm1, 0xff __asm vmulps ymm3, ymm3, ymm4 __asm vaddps ymm1, ymm1, ymm3 // u37j + + vextractf128 xmm2, ymm0, 1 + vextractf128 xmm3, ymm1, 1 + + mov edi, [ptmp+48] __asm vmovaps XMMWORD PTR [edi + ecx], xmm0 // v3i + mov edi, [ptmp+56] __asm vmovaps XMMWORD PTR [edi + ecx], xmm1 // v3j + mov edi, [ptmp+52] __asm vmovaps XMMWORD PTR [edi + ecx], xmm2 // v7i + mov edi, [ptmp+60] __asm vmovaps XMMWORD PTR [edi + ecx], xmm3 // v7j + + add eax, 32 + add edx, 32 + + cmp edx, esi + jb forBegin +forEnd: + } + + _mm256_zeroupper(); +} + +// AVX with useMultiplier +template <> +void solveConstraints<true, 1>(float* __restrict posIt, const float* __restrict rIt, + const float* __restrict rEnd, const uint16_t* __restrict iIt, const __m128& 
stiffnessRef) +{ + __m256 stiffness = _mm256_broadcast_ps(&stiffnessRef); + __m256 stretchLimit = _mm256_permute_ps(stiffness, 0xff); + __m256 compressionLimit = _mm256_permute_ps(stiffness, 0xaa); + __m256 multiplier = _mm256_permute_ps(stiffness, 0x55); + stiffness = _mm256_permute_ps(stiffness, 0x00); + + __m256 vtmp[8], htmp[4]; + float* ptmp[16]; + + __asm + { + mov edx, rIt + mov esi, rEnd + + cmp edx, esi + jae forEnd + + mov eax, iIt + mov ecx, posIt + +forBegin: + movzx edi, WORD PTR [eax ] __asm shl edi, 4 __asm mov [ptmp ], edi __asm vmovaps xmm0, XMMWORD PTR [edi + ecx] // v0i + movzx edi, WORD PTR [eax+16] __asm shl edi, 4 __asm mov [ptmp+ 4], edi __asm vmovaps xmm1, XMMWORD PTR [edi + ecx] // v4i + movzx edi, WORD PTR [eax+ 2] __asm shl edi, 4 __asm mov [ptmp+ 8], edi __asm vmovaps xmm2, XMMWORD PTR [edi + ecx] // v0j + movzx edi, WORD PTR [eax+18] __asm shl edi, 4 __asm mov [ptmp+12], edi __asm vmovaps xmm3, XMMWORD PTR [edi + ecx] // v4j + movzx edi, WORD PTR [eax+ 4] __asm shl edi, 4 __asm mov [ptmp+16], edi __asm vmovaps xmm4, XMMWORD PTR [edi + ecx] // v1i + movzx edi, WORD PTR [eax+20] __asm shl edi, 4 __asm mov [ptmp+20], edi __asm vmovaps xmm5, XMMWORD PTR [edi + ecx] // v5i + movzx edi, WORD PTR [eax+ 6] __asm shl edi, 4 __asm mov [ptmp+24], edi __asm vmovaps xmm6, XMMWORD PTR [edi + ecx] // v1j + movzx edi, WORD PTR [eax+22] __asm shl edi, 4 __asm mov [ptmp+28], edi __asm vmovaps xmm7, XMMWORD PTR [edi + ecx] // v5j + + vinsertf128 ymm0, ymm0, xmm1, 1 __asm vmovaps YMMWORD PTR [vtmp ], ymm0 // v04i + vinsertf128 ymm2, ymm2, xmm3, 1 __asm vmovaps YMMWORD PTR [vtmp+ 32], ymm2 // v04j + vinsertf128 ymm4, ymm4, xmm5, 1 __asm vmovaps YMMWORD PTR [vtmp+ 64], ymm4 // v15i + vinsertf128 ymm6, ymm6, xmm7, 1 __asm vmovaps YMMWORD PTR [vtmp+ 96], ymm6 // v15j + + vmovaps ymm7, sMinusOneXYZOneW + vmulps ymm2, ymm2, ymm7 __asm vaddps ymm0, ymm0, ymm2 __asm vmovaps YMMWORD PTR [htmp ], ymm0 // h04ij + vmulps ymm6, ymm6, ymm7 __asm vaddps ymm4, ymm4, ymm6 
__asm vmovaps YMMWORD PTR [htmp+32], ymm4 // h15ij + + movzx edi, WORD PTR [eax+ 8] __asm shl edi, 4 __asm mov [ptmp+32], edi __asm vmovaps xmm0, XMMWORD PTR [edi + ecx] // v2i + movzx edi, WORD PTR [eax+24] __asm shl edi, 4 __asm mov [ptmp+36], edi __asm vmovaps xmm1, XMMWORD PTR [edi + ecx] // v6i + movzx edi, WORD PTR [eax+10] __asm shl edi, 4 __asm mov [ptmp+40], edi __asm vmovaps xmm2, XMMWORD PTR [edi + ecx] // v2j + movzx edi, WORD PTR [eax+26] __asm shl edi, 4 __asm mov [ptmp+44], edi __asm vmovaps xmm3, XMMWORD PTR [edi + ecx] // v6j + movzx edi, WORD PTR [eax+12] __asm shl edi, 4 __asm mov [ptmp+48], edi __asm vmovaps xmm4, XMMWORD PTR [edi + ecx] // v3i + movzx edi, WORD PTR [eax+28] __asm shl edi, 4 __asm mov [ptmp+52], edi __asm vmovaps xmm5, XMMWORD PTR [edi + ecx] // v7i + movzx edi, WORD PTR [eax+14] __asm shl edi, 4 __asm mov [ptmp+56], edi __asm vmovaps xmm6, XMMWORD PTR [edi + ecx] // v3j + movzx edi, WORD PTR [eax+30] __asm shl edi, 4 __asm mov [ptmp+60], edi __asm vmovaps xmm7, XMMWORD PTR [edi + ecx] // v7j + + vinsertf128 ymm0, ymm0, xmm1, 1 __asm vmovaps YMMWORD PTR [vtmp+128], ymm0 // v26i + vinsertf128 ymm2, ymm2, xmm3, 1 __asm vmovaps YMMWORD PTR [vtmp+160], ymm2 // v26j + vinsertf128 ymm4, ymm4, xmm5, 1 __asm vmovaps YMMWORD PTR [vtmp+192], ymm4 // v37i + vinsertf128 ymm6, ymm6, xmm7, 1 __asm vmovaps YMMWORD PTR [vtmp+224], ymm6 // v37j + + vmovaps ymm7, sMinusOneXYZOneW + vmulps ymm2, ymm2, ymm7 __asm vaddps ymm2, ymm0, ymm2 __asm vmovaps YMMWORD PTR [htmp+64], ymm2 // h26ij + vmulps ymm6, ymm6, ymm7 __asm vaddps ymm6, ymm4, ymm6 __asm vmovaps YMMWORD PTR [htmp+96], ymm6 // h37ij + + vmovaps ymm0, YMMWORD PTR [htmp ] // h04ij + vmovaps ymm4, YMMWORD PTR [htmp+32] // h15ij + + vunpcklps ymm1, ymm0, ymm2 // a + vunpckhps ymm3, ymm0, ymm2 // b + vunpcklps ymm5, ymm4, ymm6 // c + vunpckhps ymm7, ymm4, ymm6 // d + + vunpcklps ymm0, ymm1, ymm5 // hxij + vunpckhps ymm2, ymm1, ymm5 // hyij + vunpcklps ymm4, ymm3, ymm7 // hzij + vunpckhps ymm6, 
ymm3, ymm7 // vwij + + vmovaps ymm7, sEpsilon + vmovaps ymm5, sOne + vmovaps ymm3, stiffness + vmovaps ymm1, YMMWORD PTR [edx] // rij + + vmulps ymm0, ymm0, ymm0 __asm vaddps ymm0, ymm0, ymm7 // e2ij + vmulps ymm2, ymm2, ymm2 __asm vaddps ymm0, ymm0, ymm2 + vmulps ymm4, ymm4, ymm4 __asm vaddps ymm0, ymm0, ymm4 + + vcmpgt_oqps ymm2, ymm1, ymm7 // mask + vrsqrtps ymm0, ymm0 __asm vmulps ymm0, ymm0, ymm1 // erij + vsubps ymm5, ymm5, ymm0 __asm vandps ymm5, ymm5, ymm2 + vaddps ymm6, ymm6, ymm7 __asm vrcpps ymm6, ymm6 + + vmovaps ymm0, stretchLimit // multiplier block + vmovaps ymm1, compressionLimit + vmovaps ymm2, multiplier + vminps ymm0, ymm0, ymm5 + vmaxps ymm1, ymm1, ymm0 + vmulps ymm2, ymm2, ymm1 + vsubps ymm5, ymm5, ymm2 + + vmulps ymm6, ymm6, ymm3 __asm vmulps ymm6, ymm6, ymm5 // exij + + vmovaps ymm7, sMaskXY + vandps ymm7, ymm7, ymm6 // exlo + vxorps ymm6, ymm6, ymm7 // exhi + + vmovaps ymm4, YMMWORD PTR [htmp ] // h04ij + vmovaps ymm0, YMMWORD PTR [vtmp ] // v04i + vmovaps ymm1, YMMWORD PTR [vtmp+ 32] // v04j + + vpermilps ymm5, ymm7, 0xc0 __asm vmulps ymm4, ymm4, ymm5 // f04ij + vpermilps ymm2, ymm0, 0xff __asm vmulps ymm2, ymm2, ymm4 __asm vsubps ymm0, ymm0, ymm2 // u04i + vpermilps ymm3, ymm1, 0xff __asm vmulps ymm3, ymm3, ymm4 __asm vaddps ymm1, ymm1, ymm3 // u04j + + vextractf128 xmm2, ymm0, 1 + vextractf128 xmm3, ymm1, 1 + + mov edi, [ptmp ] __asm vmovaps XMMWORD PTR [edi + ecx], xmm0 // v0i + mov edi, [ptmp+ 8] __asm vmovaps XMMWORD PTR [edi + ecx], xmm1 // v0j + mov edi, [ptmp+ 4] __asm vmovaps XMMWORD PTR [edi + ecx], xmm2 // v4i + mov edi, [ptmp+12] __asm vmovaps XMMWORD PTR [edi + ecx], xmm3 // v4j + + vmovaps ymm4, YMMWORD PTR [htmp+ 32] // h15ij + vmovaps ymm0, YMMWORD PTR [vtmp+ 64] // v15i + vmovaps ymm1, YMMWORD PTR [vtmp+ 96] // v15j + + vpermilps ymm5, ymm7, 0xd5 __asm vmulps ymm4, ymm4, ymm5 // f15ij + vpermilps ymm2, ymm0, 0xff __asm vmulps ymm2, ymm2, ymm4 __asm vsubps ymm0, ymm0, ymm2 // u15i + vpermilps ymm3, ymm1, 0xff __asm vmulps 
ymm3, ymm3, ymm4 __asm vaddps ymm1, ymm1, ymm3 // u15j + + vextractf128 xmm2, ymm0, 1 + vextractf128 xmm3, ymm1, 1 + + mov edi, [ptmp+16] __asm vmovaps XMMWORD PTR [edi + ecx], xmm0 // v1i + mov edi, [ptmp+24] __asm vmovaps XMMWORD PTR [edi + ecx], xmm1 // v1j + mov edi, [ptmp+20] __asm vmovaps XMMWORD PTR [edi + ecx], xmm2 // v5i + mov edi, [ptmp+28] __asm vmovaps XMMWORD PTR [edi + ecx], xmm3 // v5j + + vmovaps ymm4, YMMWORD PTR [htmp+ 64] // h26ij + vmovaps ymm0, YMMWORD PTR [vtmp+128] // v26i + vmovaps ymm1, YMMWORD PTR [vtmp+160] // v26j + + vpermilps ymm5, ymm6, 0x2a __asm vmulps ymm4, ymm4, ymm5 // f26ij + vpermilps ymm2, ymm0, 0xff __asm vmulps ymm2, ymm2, ymm4 __asm vsubps ymm0, ymm0, ymm2 // u26i + vpermilps ymm3, ymm1, 0xff __asm vmulps ymm3, ymm3, ymm4 __asm vaddps ymm1, ymm1, ymm3 // u26j + + vextractf128 xmm2, ymm0, 1 + vextractf128 xmm3, ymm1, 1 + + mov edi, [ptmp+32] __asm vmovaps XMMWORD PTR [edi + ecx], xmm0 // v2i + mov edi, [ptmp+40] __asm vmovaps XMMWORD PTR [edi + ecx], xmm1 // v2j + mov edi, [ptmp+36] __asm vmovaps XMMWORD PTR [edi + ecx], xmm2 // v6i + mov edi, [ptmp+44] __asm vmovaps XMMWORD PTR [edi + ecx], xmm3 // v6j + + vmovaps ymm4, YMMWORD PTR [htmp+ 96] // h37ij + vmovaps ymm0, YMMWORD PTR [vtmp+192] // v37i + vmovaps ymm1, YMMWORD PTR [vtmp+224] // v37j + + vpermilps ymm5, ymm6, 0x3f __asm vmulps ymm4, ymm4, ymm5 // f37ij + vpermilps ymm2, ymm0, 0xff __asm vmulps ymm2, ymm2, ymm4 __asm vsubps ymm0, ymm0, ymm2 // u37i + vpermilps ymm3, ymm1, 0xff __asm vmulps ymm3, ymm3, ymm4 __asm vaddps ymm1, ymm1, ymm3 // u37j + + vextractf128 xmm2, ymm0, 1 + vextractf128 xmm3, ymm1, 1 + + mov edi, [ptmp+48] __asm vmovaps XMMWORD PTR [edi + ecx], xmm0 // v3i + mov edi, [ptmp+56] __asm vmovaps XMMWORD PTR [edi + ecx], xmm1 // v3j + mov edi, [ptmp+52] __asm vmovaps XMMWORD PTR [edi + ecx], xmm2 // v7i + mov edi, [ptmp+60] __asm vmovaps XMMWORD PTR [edi + ecx], xmm3 // v7j + + add eax, 32 + add edx, 32 + + cmp edx, esi + jb forBegin +forEnd: + } + + 
_mm256_zeroupper();
}

#if _MSC_VER >= 1700
// AVX2 without useMultiplier
//
// Solves 8 distance constraints per loop iteration using AVX2/FMA via MSVC
// 32-bit inline assembly (hence the _M_IX86-only compilation of this file).
//
// posIt        - particle array; 16-byte entries (xyz position, w is a
//                per-particle weight — presumably inverse mass, TODO confirm)
// rIt/rEnd     - per-constraint rest lengths; 8 floats consumed per iteration
// iIt          - uint16_t particle index pairs (i,j interleaved); 16 indices
//                (= 32 bytes) consumed per iteration
// stiffnessRef - lane 0 holds the stiffness, broadcast to all 8 lanes
template <>
void solveConstraints<false, 2>(float* __restrict posIt, const float* __restrict rIt,
                                const float* __restrict rEnd, const uint16_t* __restrict iIt, const __m128& stiffnessRef)
{
    // No multiplier: only the scalar stiffness is needed in all 8 lanes.
    __m256 stiffness = _mm256_broadcast_ss((const float*)&stiffnessRef);

    // Stack spill areas for the asm block:
    // vtmp - the 16 gathered particles packed as 8 ymm pairs
    // htmp - the 4 edge-vector registers (hij)
    // ptmp - byte offsets of the 16 gathered particles, reused for scatter-back
    __m256 vtmp[8], htmp[4];
    float* ptmp[16];

    __asm
    {
        mov edx, rIt
        mov esi, rEnd

        cmp edx, esi
        jae forEnd

        mov eax, iIt
        mov ecx, posIt

forBegin:
        // Gather the first 8 particles: each uint16 index is scaled by 16
        // (particle stride) and remembered in ptmp for the scatter-back below.
        movzx edi, WORD PTR [eax   ] __asm shl edi, 4 __asm mov [ptmp   ], edi __asm vmovaps xmm0, XMMWORD PTR [edi + ecx] // v0i
        movzx edi, WORD PTR [eax+16] __asm shl edi, 4 __asm mov [ptmp+ 4], edi __asm vmovaps xmm1, XMMWORD PTR [edi + ecx] // v4i
        movzx edi, WORD PTR [eax+ 2] __asm shl edi, 4 __asm mov [ptmp+ 8], edi __asm vmovaps xmm2, XMMWORD PTR [edi + ecx] // v0j
        movzx edi, WORD PTR [eax+18] __asm shl edi, 4 __asm mov [ptmp+12], edi __asm vmovaps xmm3, XMMWORD PTR [edi + ecx] // v4j
        movzx edi, WORD PTR [eax+ 4] __asm shl edi, 4 __asm mov [ptmp+16], edi __asm vmovaps xmm4, XMMWORD PTR [edi + ecx] // v1i
        movzx edi, WORD PTR [eax+20] __asm shl edi, 4 __asm mov [ptmp+20], edi __asm vmovaps xmm5, XMMWORD PTR [edi + ecx] // v5i
        movzx edi, WORD PTR [eax+ 6] __asm shl edi, 4 __asm mov [ptmp+24], edi __asm vmovaps xmm6, XMMWORD PTR [edi + ecx] // v1j
        movzx edi, WORD PTR [eax+22] __asm shl edi, 4 __asm mov [ptmp+28], edi __asm vmovaps xmm7, XMMWORD PTR [edi + ecx] // v5j

        // Pair constraints 0/4, 1/5 into single ymm registers (low/high lanes).
        vinsertf128 ymm0, ymm0, xmm1, 1 __asm vmovaps YMMWORD PTR [vtmp    ], ymm0 // v04i
        vinsertf128 ymm2, ymm2, xmm3, 1 __asm vmovaps YMMWORD PTR [vtmp+ 32], ymm2 // v04j
        vinsertf128 ymm4, ymm4, xmm5, 1 __asm vmovaps YMMWORD PTR [vtmp+ 64], ymm4 // v15i
        vinsertf128 ymm6, ymm6, xmm7, 1 __asm vmovaps YMMWORD PTR [vtmp+ 96], ymm6 // v15j

        // h = vi - vj in xyz, vi.w + vj.w in w (sMinusOneXYZOneW negates xyz only).
        vmovaps ymm7, sMinusOneXYZOneW
        vfmadd213ps ymm2, ymm7, ymm0 __asm vmovaps YMMWORD PTR [htmp   ], ymm2 // h04ij
        vfmadd213ps ymm6, ymm7, ymm4 __asm vmovaps YMMWORD PTR [htmp+32], ymm6 // h15ij

        // Gather the second 8 particles (constraints 2/6, 3/7), same scheme.
        movzx edi, WORD PTR [eax+ 8] __asm shl edi, 4 __asm mov [ptmp+32], edi __asm vmovaps xmm0, XMMWORD PTR [edi + ecx] // v2i
        movzx edi, WORD PTR [eax+24] __asm shl edi, 4 __asm mov [ptmp+36], edi __asm vmovaps xmm1, XMMWORD PTR [edi + ecx] // v6i
        movzx edi, WORD PTR [eax+10] __asm shl edi, 4 __asm mov [ptmp+40], edi __asm vmovaps xmm2, XMMWORD PTR [edi + ecx] // v2j
        movzx edi, WORD PTR [eax+26] __asm shl edi, 4 __asm mov [ptmp+44], edi __asm vmovaps xmm3, XMMWORD PTR [edi + ecx] // v6j
        movzx edi, WORD PTR [eax+12] __asm shl edi, 4 __asm mov [ptmp+48], edi __asm vmovaps xmm4, XMMWORD PTR [edi + ecx] // v3i
        movzx edi, WORD PTR [eax+28] __asm shl edi, 4 __asm mov [ptmp+52], edi __asm vmovaps xmm5, XMMWORD PTR [edi + ecx] // v7i
        movzx edi, WORD PTR [eax+14] __asm shl edi, 4 __asm mov [ptmp+56], edi __asm vmovaps xmm6, XMMWORD PTR [edi + ecx] // v3j
        movzx edi, WORD PTR [eax+30] __asm shl edi, 4 __asm mov [ptmp+60], edi __asm vmovaps xmm7, XMMWORD PTR [edi + ecx] // v7j

        vinsertf128 ymm0, ymm0, xmm1, 1 __asm vmovaps YMMWORD PTR [vtmp+128], ymm0 // v26i
        vinsertf128 ymm2, ymm2, xmm3, 1 __asm vmovaps YMMWORD PTR [vtmp+160], ymm2 // v26j
        vinsertf128 ymm4, ymm4, xmm5, 1 __asm vmovaps YMMWORD PTR [vtmp+192], ymm4 // v37i
        vinsertf128 ymm6, ymm6, xmm7, 1 __asm vmovaps YMMWORD PTR [vtmp+224], ymm6 // v37j

        vmovaps ymm7, sMinusOneXYZOneW
        vfmadd213ps ymm2, ymm7, ymm0 __asm vmovaps YMMWORD PTR [htmp+64], ymm2 // h26ij
        vfmadd213ps ymm6, ymm7, ymm4 __asm vmovaps YMMWORD PTR [htmp+96], ymm6 // h37ij

        vmovaps ymm0, YMMWORD PTR [htmp   ] // h04ij
        vmovaps ymm4, YMMWORD PTR [htmp+32] // h15ij

        // 4x8 transpose: AoS edge vectors -> SoA lanes hx/hy/hz/vw
        // across the 8 constraints.
        vunpcklps ymm1, ymm0, ymm2 // a
        vunpckhps ymm3, ymm0, ymm2 // b
        vunpcklps ymm5, ymm4, ymm6 // c
        vunpckhps ymm7, ymm4, ymm6 // d

        vunpcklps ymm0, ymm1, ymm5 // hxij
        vunpckhps ymm2, ymm1, ymm5 // hyij
        vunpcklps ymm4, ymm3, ymm7 // hzij
        vunpckhps ymm6, ymm3, ymm7 // vwij

        vmovaps ymm7, sEpsilon
        vmovaps ymm5, sOne
        vmovaps ymm3, stiffness
        vmovaps ymm1, YMMWORD PTR [edx] // rij

        // Squared edge length + epsilon (epsilon keeps rsqrt finite).
        vfmadd213ps ymm4, ymm4, ymm7 // e2ij
        vfmadd213ps ymm2, ymm2, ymm4
        vfmadd213ps ymm0, ymm0, ymm2

        // erij = 1 - rij/|h|, zeroed where rij <= epsilon (mask).
        vcmpgt_oqps ymm2, ymm1, ymm7 // mask
        vrsqrtps ymm0, ymm0 __asm vfnmadd231ps ymm5, ymm0, ymm1 // erij
        vandps ymm5, ymm5, ymm2
        vaddps ymm6, ymm6, ymm7 __asm vrcpps ymm6, ymm6

        // exij = stiffness * erij / (wi + wj + epsilon)
        vmulps ymm6, ymm6, ymm3 __asm vmulps ymm6, ymm6, ymm5 // exij

        // Split the 8 correction scales into two registers for per-pair splats.
        vmovaps ymm7, sMaskXY
        vandps ymm7, ymm7, ymm6 // exlo
        vxorps ymm6, ymm6, ymm7 // exhi

        // Apply corrections: f = h * splat(ex); ui = vi - vi.w*f; uj = vj + vj.w*f
        vmovaps ymm4, YMMWORD PTR [htmp    ] // h04ij
        vmovaps ymm0, YMMWORD PTR [vtmp    ] // v04i
        vmovaps ymm1, YMMWORD PTR [vtmp+ 32] // v04j

        vpermilps ymm5, ymm7, 0xc0 __asm vmulps ymm4, ymm4, ymm5 // f04ij
        vpermilps ymm2, ymm0, 0xff __asm vmulps ymm2, ymm2, ymm4 __asm vsubps ymm0, ymm0, ymm2 // u04i
        vpermilps ymm3, ymm1, 0xff __asm vmulps ymm3, ymm3, ymm4 __asm vaddps ymm1, ymm1, ymm3 // u04j

        vextractf128 xmm2, ymm0, 1
        vextractf128 xmm3, ymm1, 1

        mov edi, [ptmp   ] __asm vmovaps XMMWORD PTR [edi + ecx], xmm0 // v0i
        mov edi, [ptmp+ 8] __asm vmovaps XMMWORD PTR [edi + ecx], xmm1 // v0j
        mov edi, [ptmp+ 4] __asm vmovaps XMMWORD PTR [edi + ecx], xmm2 // v4i
        mov edi, [ptmp+12] __asm vmovaps XMMWORD PTR [edi + ecx], xmm3 // v4j

        vmovaps ymm4, YMMWORD PTR [htmp+ 32] // h15ij
        vmovaps ymm0, YMMWORD PTR [vtmp+ 64] // v15i
        vmovaps ymm1, YMMWORD PTR [vtmp+ 96] // v15j

        vpermilps ymm5, ymm7, 0xd5 __asm vmulps ymm4, ymm4, ymm5 // f15ij
        vpermilps ymm2, ymm0, 0xff __asm vmulps ymm2, ymm2, ymm4 __asm vsubps ymm0, ymm0, ymm2 // u15i
        vpermilps ymm3, ymm1, 0xff __asm vmulps ymm3, ymm3, ymm4 __asm vaddps ymm1, ymm1, ymm3 // u15j

        vextractf128 xmm2, ymm0, 1
        vextractf128 xmm3, ymm1, 1

        mov edi, [ptmp+16] __asm vmovaps XMMWORD PTR [edi + ecx], xmm0 // v1i
        mov edi, [ptmp+24] __asm vmovaps XMMWORD PTR [edi + ecx], xmm1 // v1j
        mov edi, [ptmp+20] __asm vmovaps XMMWORD PTR [edi + ecx], xmm2 // v5i
        mov edi, [ptmp+28] __asm vmovaps XMMWORD PTR [edi + ecx], xmm3 // v5j

        vmovaps ymm4, YMMWORD PTR [htmp+ 64] // h26ij
        vmovaps ymm0, YMMWORD PTR [vtmp+128] // v26i
        vmovaps ymm1, YMMWORD PTR [vtmp+160] // v26j

        vpermilps ymm5, ymm6, 0x2a __asm vmulps ymm4, ymm4, ymm5 // f26ij
        vpermilps ymm2, ymm0, 0xff __asm vmulps ymm2, ymm2, ymm4 __asm vsubps ymm0, ymm0, ymm2 // u26i
        vpermilps ymm3, ymm1, 0xff __asm vmulps ymm3, ymm3, ymm4 __asm vaddps ymm1, ymm1, ymm3 // u26j

        vextractf128 xmm2, ymm0, 1
        vextractf128 xmm3, ymm1, 1

        mov edi, [ptmp+32] __asm vmovaps XMMWORD PTR [edi + ecx], xmm0 // v2i
        mov edi, [ptmp+40] __asm vmovaps XMMWORD PTR [edi + ecx], xmm1 // v2j
        mov edi, [ptmp+36] __asm vmovaps XMMWORD PTR [edi + ecx], xmm2 // v6i
        mov edi, [ptmp+44] __asm vmovaps XMMWORD PTR [edi + ecx], xmm3 // v6j

        vmovaps ymm4, YMMWORD PTR [htmp+ 96] // h37ij
        vmovaps ymm0, YMMWORD PTR [vtmp+192] // v37i
        vmovaps ymm1, YMMWORD PTR [vtmp+224] // v37j

        vpermilps ymm5, ymm6, 0x3f __asm vmulps ymm4, ymm4, ymm5 // f37ij
        vpermilps ymm2, ymm0, 0xff __asm vmulps ymm2, ymm2, ymm4 __asm vsubps ymm0, ymm0, ymm2 // u37i
        vpermilps ymm3, ymm1, 0xff __asm vmulps ymm3, ymm3, ymm4 __asm vaddps ymm1, ymm1, ymm3 // u37j

        vextractf128 xmm2, ymm0, 1
        vextractf128 xmm3, ymm1, 1

        mov edi, [ptmp+48] __asm vmovaps XMMWORD PTR [edi + ecx], xmm0 // v3i
        mov edi, [ptmp+56] __asm vmovaps XMMWORD PTR [edi + ecx], xmm1 // v3j
        mov edi, [ptmp+52] __asm vmovaps XMMWORD PTR [edi + ecx], xmm2 // v7i
        mov edi, [ptmp+60] __asm vmovaps XMMWORD PTR [edi + ecx], xmm3 // v7j

        // Advance: 16 uint16 indices and 8 rest-length floats per iteration.
        add eax, 32
        add edx, 32

        cmp edx, esi
        jb forBegin
forEnd:
    }

    // Clear upper ymm state before returning to (potentially) SSE code.
    _mm256_zeroupper();
}

// AVX2 with useMultiplier
//
// Same 8-constraints-per-iteration kernel as above, but erij is additionally
// clamped between compressionLimit and stretchLimit and scaled back by
// 'multiplier' before being applied (vfnmadd231ps in the multiplier block).
// stiffnessRef lanes: x = stiffness, y = multiplier, z = compressionLimit,
// w = stretchLimit (see the permutes below).
template <>
void solveConstraints<true, 2>(float* __restrict posIt, const float* __restrict rIt,
                               const float* __restrict rEnd, const uint16_t* __restrict iIt, const __m128& stiffnessRef)
{
    __m256 stiffness = _mm256_broadcast_ps(&stiffnessRef);
    __m256 stretchLimit = _mm256_permute_ps(stiffness, 0xff);     // lane w
    __m256 compressionLimit = _mm256_permute_ps(stiffness, 0xaa); // lane z
    __m256 multiplier = _mm256_permute_ps(stiffness, 0x55);       // lane y
    stiffness = _mm256_permute_ps(stiffness, 0x00);               // lane x

    // Stack spill areas; see the <false, 2> kernel above for the layout.
    __m256 vtmp[8], htmp[4];
    float* ptmp[16];

    __asm
    {
        mov edx, rIt
        mov esi, rEnd

        cmp edx, esi
        jae forEnd

        mov eax, iIt
        mov ecx, posIt

forBegin:
        // Gather first 8 particles (index * 16 = byte offset, saved in ptmp).
        movzx edi, WORD PTR [eax   ] __asm shl edi, 4 __asm mov [ptmp   ], edi __asm vmovaps xmm0, XMMWORD PTR [edi + ecx] // v0i
        movzx edi, WORD PTR [eax+16] __asm shl edi, 4 __asm mov [ptmp+ 4], edi __asm vmovaps xmm1, XMMWORD PTR [edi + ecx] // v4i
        movzx edi, WORD PTR [eax+ 2] __asm shl edi, 4 __asm mov [ptmp+ 8], edi __asm vmovaps xmm2, XMMWORD PTR [edi + ecx] // v0j
        movzx edi, WORD PTR [eax+18] __asm shl edi, 4 __asm mov [ptmp+12], edi __asm vmovaps xmm3, XMMWORD PTR [edi + ecx] // v4j
        movzx edi, WORD PTR [eax+ 4] __asm shl edi, 4 __asm mov [ptmp+16], edi __asm vmovaps xmm4, XMMWORD PTR [edi + ecx] // v1i
        movzx edi, WORD PTR [eax+20] __asm shl edi, 4 __asm mov [ptmp+20], edi __asm vmovaps xmm5, XMMWORD PTR [edi + ecx] // v5i
        movzx edi, WORD PTR [eax+ 6] __asm shl edi, 4 __asm mov [ptmp+24], edi __asm vmovaps xmm6, XMMWORD PTR [edi + ecx] // v1j
        movzx edi, WORD PTR [eax+22] __asm shl edi, 4 __asm mov [ptmp+28], edi __asm vmovaps xmm7, XMMWORD PTR [edi + ecx] // v5j

        vinsertf128 ymm0, ymm0, xmm1, 1 __asm vmovaps YMMWORD PTR [vtmp    ], ymm0 // v04i
        vinsertf128 ymm2, ymm2, xmm3, 1 __asm vmovaps YMMWORD PTR [vtmp+ 32], ymm2 // v04j
        vinsertf128 ymm4, ymm4, xmm5, 1 __asm vmovaps YMMWORD PTR [vtmp+ 64], ymm4 // v15i
        vinsertf128 ymm6, ymm6, xmm7, 1 __asm vmovaps YMMWORD PTR [vtmp+ 96], ymm6 // v15j

        // h = vi - vj in xyz, vi.w + vj.w in w.
        vmovaps ymm7, sMinusOneXYZOneW
        vfmadd213ps ymm2, ymm7, ymm0 __asm vmovaps YMMWORD PTR [htmp   ], ymm2 // h04ij
        vfmadd213ps ymm6, ymm7, ymm4 __asm vmovaps YMMWORD PTR [htmp+32], ymm6 // h15ij

        // Gather the second 8 particles.
        movzx edi, WORD PTR [eax+ 8] __asm shl edi, 4 __asm mov [ptmp+32], edi __asm vmovaps xmm0, XMMWORD PTR [edi + ecx] // v2i
        movzx edi, WORD PTR [eax+24] __asm shl edi, 4 __asm mov [ptmp+36], edi __asm vmovaps xmm1, XMMWORD PTR [edi + ecx] // v6i
        movzx edi, WORD PTR [eax+10] __asm shl edi, 4 __asm mov [ptmp+40], edi __asm vmovaps xmm2, XMMWORD PTR [edi + ecx] // v2j
        movzx edi, WORD PTR [eax+26] __asm shl edi, 4 __asm mov [ptmp+44], edi __asm vmovaps xmm3, XMMWORD PTR [edi + ecx] // v6j
        movzx edi, WORD PTR [eax+12] __asm shl edi, 4 __asm mov [ptmp+48], edi __asm vmovaps xmm4, XMMWORD PTR [edi + ecx] // v3i
        movzx edi, WORD PTR [eax+28] __asm shl edi, 4 __asm mov [ptmp+52], edi __asm vmovaps xmm5, XMMWORD PTR [edi + ecx] // v7i
        movzx edi, WORD PTR [eax+14] __asm shl edi, 4 __asm mov [ptmp+56], edi __asm vmovaps xmm6, XMMWORD PTR [edi + ecx] // v3j
        movzx edi, WORD PTR [eax+30] __asm shl edi, 4 __asm mov [ptmp+60], edi __asm vmovaps xmm7, XMMWORD PTR [edi + ecx] // v7j

        vinsertf128 ymm0, ymm0, xmm1, 1 __asm vmovaps YMMWORD PTR [vtmp+128], ymm0 // v26i
        vinsertf128 ymm2, ymm2, xmm3, 1 __asm vmovaps YMMWORD PTR [vtmp+160], ymm2 // v26j
        vinsertf128 ymm4, ymm4, xmm5, 1 __asm vmovaps YMMWORD PTR [vtmp+192], ymm4 // v37i
        vinsertf128 ymm6, ymm6, xmm7, 1 __asm vmovaps YMMWORD PTR [vtmp+224], ymm6 // v37j

        vmovaps ymm7, sMinusOneXYZOneW
        vfmadd213ps ymm2, ymm7, ymm0 __asm vmovaps YMMWORD PTR [htmp+64], ymm2 // h26ij
        vfmadd213ps ymm6, ymm7, ymm4 __asm vmovaps YMMWORD PTR [htmp+96], ymm6 // h37ij

        vmovaps ymm0, YMMWORD PTR [htmp   ] // h04ij
        vmovaps ymm4, YMMWORD PTR [htmp+32] // h15ij

        // 4x8 transpose to SoA lanes.
        vunpcklps ymm1, ymm0, ymm2 // a
        vunpckhps ymm3, ymm0, ymm2 // b
        vunpcklps ymm5, ymm4, ymm6 // c
        vunpckhps ymm7, ymm4, ymm6 // d

        vunpcklps ymm0, ymm1, ymm5 // hxij
        vunpckhps ymm2, ymm1, ymm5 // hyij
        vunpcklps ymm4, ymm3, ymm7 // hzij
        vunpckhps ymm6, ymm3, ymm7 // vwij

        vmovaps ymm7, sEpsilon
        vmovaps ymm5, sOne
        vmovaps ymm3, stiffness
        vmovaps ymm1, YMMWORD PTR [edx] // rij

        // Squared edge length + epsilon via chained FMAs.
        vfmadd213ps ymm4, ymm4, ymm7 // e2ij
        vfmadd213ps ymm2, ymm2, ymm4
        vfmadd213ps ymm0, ymm0, ymm2

        // erij = 1 - rij/|h|, masked where rij <= epsilon.
        vcmpgt_oqps ymm2, ymm1, ymm7 // mask
        vrsqrtps ymm0, ymm0 __asm vfnmadd231ps ymm5, ymm0, ymm1 // erij
        vandps ymm5, ymm5, ymm2
        vaddps ymm6, ymm6, ymm7 __asm vrcpps ymm6, ymm6

        // erij -= multiplier * clamp(erij, compressionLimit, stretchLimit)
        vmovaps ymm0, stretchLimit // multiplier block
        vmovaps ymm1, compressionLimit
        vmovaps ymm2, multiplier
        vminps ymm0, ymm0, ymm5
        vmaxps ymm1, ymm1, ymm0
        vfnmadd231ps ymm5, ymm1, ymm2

        vmulps ymm6, ymm6, ymm3 __asm vmulps ymm6, ymm6, ymm5 // exij

        vmovaps ymm7, sMaskXY
        vandps ymm7, ymm7, ymm6 // exlo
        vxorps ymm6, ymm6, ymm7 // exhi

        // Apply corrections per pair; FMA forms of the <false, 2> sequence.
        vmovaps ymm4, YMMWORD PTR [htmp    ] // h04ij
        vmovaps ymm0, YMMWORD PTR [vtmp    ] // v04i
        vmovaps ymm1, YMMWORD PTR [vtmp+ 32] // v04j

        vpermilps ymm5, ymm7, 0xc0 __asm vmulps ymm4, ymm4, ymm5 // f04ij
        vpermilps ymm2, ymm0, 0xff __asm vfnmadd231ps ymm0, ymm2, ymm4 // u04i
        vpermilps ymm3, ymm1, 0xff __asm vfmadd231ps ymm1, ymm3, ymm4 // u04j

        vextractf128 xmm2, ymm0, 1
        vextractf128 xmm3, ymm1, 1

        mov edi, [ptmp   ] __asm vmovaps XMMWORD PTR [edi + ecx], xmm0 // v0i
        mov edi, [ptmp+ 8] __asm vmovaps XMMWORD PTR [edi + ecx], xmm1 // v0j
        mov edi, [ptmp+ 4] __asm vmovaps XMMWORD PTR [edi + ecx], xmm2 // v4i
        mov edi, [ptmp+12] __asm vmovaps XMMWORD PTR [edi + ecx], xmm3 // v4j

        vmovaps ymm4, YMMWORD PTR [htmp+ 32] // h15ij
        vmovaps ymm0, YMMWORD PTR [vtmp+ 64] // v15i
        vmovaps ymm1, YMMWORD PTR [vtmp+ 96] // v15j

        vpermilps ymm5, ymm7, 0xd5 __asm vmulps ymm4, ymm4, ymm5 // f15ij
        vpermilps ymm2, ymm0, 0xff __asm vfnmadd231ps ymm0, ymm2, ymm4 // u15i
        vpermilps ymm3, ymm1, 0xff __asm vfmadd231ps ymm1, ymm3, ymm4 // u15j

        vextractf128 xmm2, ymm0, 1
        vextractf128 xmm3, ymm1, 1

        mov edi, [ptmp+16] __asm vmovaps XMMWORD PTR [edi + ecx], xmm0 // v1i
        mov edi, [ptmp+24] __asm vmovaps XMMWORD PTR [edi + ecx], xmm1 // v1j
        mov edi, [ptmp+20] __asm vmovaps XMMWORD PTR [edi + ecx], xmm2 // v5i
        mov edi, [ptmp+28] __asm vmovaps XMMWORD PTR [edi + ecx], xmm3 // v5j

        vmovaps ymm4, YMMWORD PTR [htmp+ 64] // h26ij
        vmovaps ymm0, YMMWORD PTR [vtmp+128] // v26i
        vmovaps ymm1, YMMWORD PTR [vtmp+160] // v26j

        vpermilps ymm5, ymm6, 0x2a __asm vmulps ymm4, ymm4, ymm5 // f26ij
        vpermilps ymm2, ymm0, 0xff __asm vfnmadd231ps ymm0, ymm2, ymm4 // u26i
        vpermilps ymm3, ymm1, 0xff __asm vfmadd231ps ymm1, ymm3, ymm4 // u26j

        vextractf128 xmm2, ymm0, 1
        vextractf128 xmm3, ymm1, 1

        mov edi, [ptmp+32] __asm vmovaps XMMWORD PTR [edi + ecx], xmm0 // v2i
        mov edi, [ptmp+40] __asm vmovaps XMMWORD PTR [edi + ecx], xmm1 // v2j
        mov edi, [ptmp+36] __asm vmovaps XMMWORD PTR [edi + ecx], xmm2 // v6i
        mov edi, [ptmp+44] __asm vmovaps XMMWORD PTR [edi + ecx], xmm3 // v6j

        vmovaps ymm4, YMMWORD PTR [htmp+ 96] // h37ij
        vmovaps ymm0, YMMWORD PTR [vtmp+192] // v37i
        vmovaps ymm1, YMMWORD PTR [vtmp+224] // v37j

        vpermilps ymm5, ymm6, 0x3f __asm vmulps ymm4, ymm4, ymm5 // f37ij
        vpermilps ymm2, ymm0, 0xff __asm vfnmadd231ps ymm0, ymm2, ymm4 // u37i
        vpermilps ymm3, ymm1, 0xff __asm vfmadd231ps ymm1, ymm3, ymm4 // u37j

        vextractf128 xmm2, ymm0, 1
        vextractf128 xmm3, ymm1, 1

        mov edi, [ptmp+48] __asm vmovaps XMMWORD PTR [edi + ecx], xmm0 // v3i
        mov edi, [ptmp+56] __asm vmovaps XMMWORD PTR [edi + ecx], xmm1 // v3j
        mov edi, [ptmp+52] __asm vmovaps XMMWORD PTR [edi + ecx], xmm2 // v7i
        mov edi, [ptmp+60] __asm vmovaps XMMWORD PTR [edi + ecx], xmm3 // v7j

        add eax, 32
        add edx, 32

        cmp edx, esi
        jb forBegin
forEnd:
    }

    _mm256_zeroupper();
}
#endif // _MSC_VER >= 1700

// clang-format on

#else // _M_IX86

// Non-x86 (or non-MSVC-inline-asm) builds: instantiate the generic C++
// template definitions instead of the hand-written assembly above.
template void solveConstraints<false, 1>(float* __restrict, const float* __restrict, const float* __restrict,
                                         const uint16_t* __restrict, const __m128&);

template void solveConstraints<true, 1>(float* __restrict, const float* __restrict, const float* __restrict,
                                        const uint16_t* __restrict, const __m128&);

template void solveConstraints<false, 2>(float* __restrict, const float* __restrict, const float* __restrict,
                                         const uint16_t* __restrict, const __m128&);

template void solveConstraints<true, 2>(float* __restrict, const float* __restrict, const float* __restrict,
                                        const uint16_t* __restrict, const __m128&);

#endif // _M_IX86
+ +} // namespace avx diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/neon/NeonCollision.cpp b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/neon/NeonCollision.cpp new file mode 100644 index 00000000..01f1fb50 --- /dev/null +++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/neon/NeonCollision.cpp @@ -0,0 +1,18 @@ +/* + * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. + */ + +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#ifndef __ARM_NEON__ +#error This file needs to be compiled with NEON support! +#endif + +#include "SwCollision.cpp" diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/neon/NeonSelfCollision.cpp b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/neon/NeonSelfCollision.cpp new file mode 100644 index 00000000..d272bb6d --- /dev/null +++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/neon/NeonSelfCollision.cpp @@ -0,0 +1,18 @@ +/* + * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. + */ + +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 

#ifndef __ARM_NEON__
#error This file needs to be compiled with NEON support!
#endif

#include "SwSelfCollision.cpp"
diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/neon/NeonSolverKernel.cpp b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/neon/NeonSolverKernel.cpp
new file mode 100644
index 00000000..068c900a
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/neon/NeonSolverKernel.cpp
@@ -0,0 +1,33 @@
/*
 * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
 *
 * NVIDIA CORPORATION and its licensors retain all intellectual property
 * and proprietary rights in and to this software, related documentation
 * and any modifications thereto. Any use, reproduction, disclosure or
 * distribution of this software and related documentation without an express
 * license agreement from NVIDIA CORPORATION is strictly prohibited.
 */

// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.

#ifndef __ARM_NEON__
#error This file needs to be compiled with NEON support!
#endif

#include "SwSolverKernel.cpp"

#include <cpu-features.h>

namespace nvidia
{
namespace cloth
{
// Runs the NEON-specialized software solver kernel if, and only if, the
// Android CPU feature bits report NEON support at runtime.
// Returns true when the kernel was executed, false otherwise (caller is
// expected to fall back to a non-NEON path).
bool neonSolverKernel(SwCloth const& cloth, SwClothData& data, SwKernelAllocator& allocator,
                      IterationStateFactory& factory, PxProfileZone* profileZone)
{
	// Precedence note: '&' binds tighter than '&&', so this is
	// (FEATURE_NEON & features) && (run kernel, yield true) — intended.
	// The comma expression discards the functor call's result and makes
	// the whole right-hand side evaluate to true.
	return ANDROID_CPU_ARM_FEATURE_NEON & android_getCpuFeatures() &&
	       (SwSolverKernel<Simd4f>(cloth, data, allocator, factory, profileZone)(), true);
}
}
}
diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/neon/Simd4f.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/neon/Simd4f.h
new file mode 100644
index 00000000..0c0b884c
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/neon/Simd4f.h
@@ -0,0 +1,500 @@
/*
 * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
 *
 * NVIDIA CORPORATION and its licensors retain all intellectual property
 * and proprietary rights in and to this software, related documentation
 * and any modifications thereto. Any use, reproduction, disclosure or
 * distribution of this software and related documentation without an express
 * license agreement from NVIDIA CORPORATION is strictly prohibited.
 */

// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.

#pragma once

// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
// factory implementation
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

// Broadcast a single scalar to all four lanes.
template <>
inline Simd4fFactory<const float&>::operator Simd4f() const
{
	return vdupq_n_f32(reinterpret_cast<const float32_t&>(v));
}

// Reinterpret a four-float tuple as a vector (no conversion, just a view).
inline Simd4fFactory<detail::FourTuple>::operator Simd4f() const
{
	return reinterpret_cast<const Simd4f&>(v);
}

// Compile-time integer constant broadcast as a raw bit pattern (uint lanes).
template <int i>
inline Simd4fFactory<detail::IntType<i> >::operator Simd4f() const
{
	return vdupq_n_u32(i);
}

// Special case: the constant 1 means the *float* 1.0f, not bit pattern 0x1.
template <>
inline Simd4fFactory<detail::IntType<1> >::operator Simd4f() const
{
	return vdupq_n_f32(1.0f);
}

// Load four floats from an (unaligned-capable) pointer.
template <>
inline Simd4fFactory<const float*>::operator Simd4f() const
{
	return vld1q_f32((const float32_t*)v);
}

// Load from a pointer the caller asserts is aligned.
template <>
inline Simd4fFactory<detail::AlignedPointer<float> >::operator Simd4f() const
{
	return vld1q_f32((const float32_t*)v.ptr);
}

// Load from base pointer plus a byte offset.
template <>
inline Simd4fFactory<detail::OffsetPointer<float> >::operator Simd4f() const
{
	return vld1q_f32(reinterpret_cast<const float32_t*>(reinterpret_cast<const char*>(v.ptr) + v.offset));
}

// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
// expression templates
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

// ~v materialized: all-ones AND-NOT v.
template <>
inline ComplementExpr<Simd4f>::operator Simd4f() const
{
	return vbicq_u32(vdupq_n_u32(0xffffffff), v.u4);
}

// (~a) & v and v & (~a) both map to a single NEON bit-clear (vbic).
Simd4f operator&(const ComplementExpr<Simd4f>& complement, const Simd4f& v)
{
	return vbicq_u32(v.u4, complement.v.u4);
}

Simd4f operator&(const Simd4f& v, const ComplementExpr<Simd4f>& complement)
{
	return vbicq_u32(v.u4, complement.v.u4);
}

// Deferred multiply, materialized on demand.
ProductExpr::operator Simd4f() const
{
	return vmulq_f32(v0.f4, v1.f4);
}

// a*b + c and c + a*b fuse into a single multiply-accumulate (vmla).
Simd4f operator+(const ProductExpr& p, const Simd4f& v)
{
	return vmlaq_f32(v.f4, p.v0.f4, p.v1.f4);
}

Simd4f operator+(const Simd4f& v, const ProductExpr& p)
{
	return vmlaq_f32(v.f4, p.v0.f4, p.v1.f4);
}

Simd4f operator+(const ProductExpr& p0, const ProductExpr& p1)
{
	// cast calls operator Simd4f() which evaluates the other ProductExpr
	return vmlaq_f32(static_cast<Simd4f>(p0).f4, p1.v0.f4, p1.v1.f4);
}

// c - a*b fuses into multiply-subtract (vmls).
Simd4f operator-(const Simd4f& v, const ProductExpr& p)
{
	return vmlsq_f32(v.f4, p.v0.f4, p.v1.f4);
}

Simd4f operator-(const ProductExpr& p0, const ProductExpr& p1)
{
	// cast calls operator Simd4f() which evaluates the other ProductExpr
	return vmlsq_f32(static_cast<Simd4f>(p0).f4, p1.v0.f4, p1.v1.f4);
}

// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
// operator implementations
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

// Comparisons return all-ones / all-zeros lane masks (uint results
// stored in the Simd4f union).
Simd4f operator==(const Simd4f& v0, const Simd4f& v1)
{
	return vceqq_f32(v0.f4, v1.f4);
}

Simd4f operator<(const Simd4f& v0, const Simd4f& v1)
{
	return vcltq_f32(v0.f4, v1.f4);
}

Simd4f operator<=(const Simd4f& v0, const Simd4f& v1)
{
	return vcleq_f32(v0.f4, v1.f4);
}

Simd4f operator>(const Simd4f& v0, const Simd4f& v1)
{
	return vcgtq_f32(v0.f4, v1.f4);
}

Simd4f operator>=(const Simd4f& v0, const Simd4f& v1)
{
	return vcgeq_f32(v0.f4, v1.f4);
}

// ~v is deferred so that (~a) & b can use vbic without a temporary.
ComplementExpr<Simd4f> operator~(const Simd4f& v)
{
	return ComplementExpr<Simd4f>(v);
}

Simd4f operator&(const Simd4f& v0, const Simd4f& v1)
{
	return vandq_u32(v0.u4, v1.u4);
}

Simd4f operator|(const Simd4f& v0, const Simd4f& v1)
{
	return vorrq_u32(v0.u4, v1.u4);
}

Simd4f operator^(const Simd4f& v0, const Simd4f& v1)
{
	return veorq_u32(v0.u4, v1.u4);
}

// Shifts: NEON has no variable right-shift, so >> is a left shift by
// the negated amount.
Simd4f operator<<(const Simd4f& v, int shift)
{
	return vshlq_u32(v.u4, vdupq_n_s32(shift));
}

Simd4f operator>>(const Simd4f& v, int shift)
{
	return vshlq_u32(v.u4, vdupq_n_s32(-shift));
}

Simd4f operator<<(const Simd4f& v, const Simd4f& shift)
{
	return vshlq_u32(v.u4, shift.i4);
}

Simd4f operator>>(const Simd4f& v, const Simd4f& shift)
{
	return vshlq_u32(v.u4, vnegq_s32(shift.i4))
;
}

Simd4f operator+(const Simd4f& v)
{
	return v;
}

Simd4f operator+(const Simd4f& v0, const Simd4f& v1)
{
	return vaddq_f32(v0.f4, v1.f4);
}

Simd4f operator-(const Simd4f& v)
{
	return vnegq_f32(v.f4);
}

Simd4f operator-(const Simd4f& v0, const Simd4f& v1)
{
	return vsubq_f32(v0.f4, v1.f4);
}

// Multiplication is deferred (expression template) so neighbouring +/-
// can fuse it into vmla/vmls.
ProductExpr operator*(const Simd4f& v0, const Simd4f& v1)
{
	return ProductExpr(v0, v1);
}

// NOTE: division uses the raw reciprocal *estimate* (low precision,
// ~8-9 bits); use recipT<n> explicitly when refined accuracy is needed.
Simd4f operator/(const Simd4f& v0, const Simd4f& v1)
{
	return v0 * vrecpeq_f32(v1.f4); // reciprocal estimate
}

// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
// function implementations
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

// Bit-level reinterpretation of an integer vector as a float vector.
Simd4f simd4f(const Simd4i& v)
{
	return v.u4;
}

// View the vector as a plain float[4] (mutable and const variants).
float (&array(Simd4f& v))[4]
{
	return (float(&)[4])v;
}

const float (&array(const Simd4f& v))[4]
{
	return (const float(&)[4])v;
}

void store(float* ptr, Simd4f const& v)
{
	return vst1q_f32((float32_t*)ptr, v.f4);
}

// NEON vst1q has no alignment requirement, so the aligned store is the
// same instruction as the unaligned one.
void storeAligned(float* ptr, Simd4f const& v)
{
	return vst1q_f32((float32_t*)ptr, v.f4);
}

void storeAligned(float* ptr, unsigned int offset, Simd4f const& v)
{
	return storeAligned(reinterpret_cast<float*>(reinterpret_cast<char*>(ptr) + offset), v);
}

// Broadcast lane i to all four lanes.
template <size_t i>
Simd4f splat(Simd4f const& v)
{
	return vdupq_n_f32(array(v)[i]);
}

// Per-lane select: mask lanes of all-ones pick v0, zeros pick v1.
Simd4f select(Simd4f const& mask, Simd4f const& v0, Simd4f const& v1)
{
	return vbslq_f32(mask.u4, v0.f4, v1.f4);
}

// lane-wise absolute value
Simd4f abs(const Simd4f& v)
{
    return vabsq_f32(v.f4);
}

// round each lane toward negative infinity: truncate, then subtract one
// from lanes whose sign bit is set
Simd4f floor(const Simd4f& v)
{
    int32x4_t signBits = vreinterpretq_s32_u32(vshrq_n_u32(v.u4, 31));
    return vcvtq_f32_s32(vsubq_s32(vcvtq_s32_f32(v.f4), signBits));
}

// lane-wise maximum
Simd4f max(const Simd4f& v0, const Simd4f& v1)
{
    return vmaxq_f32(v0.f4, v1.f4);
}

// lane-wise minimum
Simd4f min(const Simd4f& v0, const Simd4f& v1)
{
    return vminq_f32(v0.f4, v1.f4);
}

// reciprocal (estimate refined once, see recipT)
Simd4f recip(const Simd4f& v)
{
    return recipT<0>(v);
}

// reciprocal with n+1 Newton-Raphson refinements of the initial estimate
template <int n>
Simd4f recipT(const Simd4f& v)
{
    Simd4f estimate = vrecpeq_f32(v.f4);
    for(int iter = 0; iter <= n; ++iter)
        estimate = vrecpsq_f32(v.f4, estimate.f4) * estimate;
    return estimate;
}

// NOTE(review): sqrt is computed as v * rsqrt(v); vrsqrteq_f32(0) is +inf,
// so a zero input lane produces 0 * inf = NaN rather than 0 — confirm
// callers never pass exact zeros.
Simd4f sqrt(const Simd4f& v)
{
    return v * rsqrt(v);
}

// reciprocal square root (estimate refined once, see rsqrtT)
Simd4f rsqrt(const Simd4f& v)
{
    return rsqrtT<0>(v);
}

// reciprocal square root with n+1 Newton-Raphson refinements
template <int n>
Simd4f rsqrtT(const Simd4f& v)
{
    Simd4f estimate = vrsqrteq_f32(v.f4);
    for(int iter = 0; iter <= n; ++iter)
        estimate = vrsqrtsq_f32(vmulq_f32(v.f4, estimate.f4), estimate.f4) * estimate;
    return estimate;
}

// base-2 exponential, Cephes-style (http://www.netlib.org/cephes/):
// split into integer and fractional parts, rational approximation on the
// fraction, exponent bit manipulation for the integer part
Simd4f exp2(const Simd4f& v)
{
    // clamp to the representable exponent range
    Simd4f limit = simd4f(127.4999f);
    Simd4f x = min(max(-limit, v), limit);

    // separate into integer and fractional part
    Simd4f fx = x + simd4f(0.5f);
    Simd4i ix = vsubq_s32(vcvtq_s32_f32(fx.f4), vreinterpretq_s32_u32(vshrq_n_u32(fx.u4, 31)));
    fx = x - vcvtq_f32_s32(ix.i4);

    // exp2(fx) ~ 1 + 2*P(fx) / (Q(fx) - P(fx))
    Simd4f fx2 = fx * fx;

    Simd4f px = fx * (simd4f(1.51390680115615096133e+3f) +
                      fx2 * (simd4f(2.02020656693165307700e+1f) + fx2 * simd4f(2.30933477057345225087e-2f)));
    Simd4f qx = simd4f(4.36821166879210612817e+3f) + fx2 * (simd4f(2.33184211722314911771e+2f) + fx2);

    Simd4f exp2fx = px * recip(qx - px);
    exp2fx = simd4f(_1) + exp2fx + exp2fx;

    // exp2(ix): build the float exponent field directly
    Simd4f exp2ix = vreinterpretq_f32_s32(vshlq_n_s32(vaddq_s32(ix.i4, vdupq_n_s32(0x7f)), 23));

    return exp2fx * exp2ix;
}

// base-2 logarithm computed per lane as ln(x) / ln(2)
Simd4f log2(const Simd4f& v)
{
    Simd4f scale = simd4f(1.44269504088896341f); // 1/ln(2)
    const float* lanes = array(v);
    return simd4f(::logf(lanes[0]), ::logf(lanes[1]), ::logf(lanes[2]), ::logf(lanes[3])) * scale;
}

// 3-component dot product, result replicated to all four lanes
Simd4f dot3(const Simd4f& v0, const Simd4f& v1)
{
    Simd4f prod = v0 * v1;
    return splat<0>(prod) + splat<1>(prod) + splat<2>(prod);
}

// 3-component cross product (w lane unspecified), built from 64-bit halves
Simd4f cross3(const Simd4f& v0, const Simd4f& v1)
{
    float32x2_t x0_y0 = vget_low_f32(v0.f4);
    float32x2_t z0_w0 = vget_high_f32(v0.f4);
    float32x2_t x1_y1 = vget_low_f32(v1.f4);
    float32x2_t z1_w1 = vget_high_f32(v1.f4);

    float32x2_t y1_z1 = vext_f32(x1_y1, z1_w1, 1);
    float32x2_t y0_z0 = vext_f32(x0_y0, z0_w0, 1);

    float32x2_t z0x1_w0y1 = vmul_f32(z0_w0, x1_y1);
    float32x2_t x0y1_y0z1 = vmul_f32(x0_y0, y1_z1);

    float32x2_t y2_w2 = vmls_f32(z0x1_w0y1, x0_y0, z1_w1);
    float32x2_t z2_x2 = vmls_f32(x0y1_y0z1, y0_z0, x1_y1);
    float32x2_t x2_y2 = vext_f32(z2_x2, y2_w2, 1);

    return vcombine_f32(x2_y2, z2_x2);
}

// 4x4 in-register transpose via two rounds of zips
void transpose(Simd4f& x, Simd4f& y, Simd4f& z, Simd4f& w)
{
#if NVMATH_INLINE_ASSEMBLER
    asm volatile("vzip.f32 %q0, %q2 \n\t"
                 "vzip.f32 %q1, %q3 \n\t"
                 "vzip.f32 %q0, %q1 \n\t"
                 "vzip.f32 %q2, %q3 \n\t"
                 : "+w"(x.f4), "+w"(y.f4), "+w"(z.f4), "+w"(w.f4));
#else
    float32x4x2_t xz = vzipq_f32(x.f4, z.f4);
    float32x4x2_t yw = vzipq_f32(y.f4, w.f4);
    float32x4x2_t lo = vzipq_f32(xz.val[0], yw.val[0]);
    float32x4x2_t hi = vzipq_f32(xz.val[1], yw.val[1]);

    x = lo.val[0];
    y = lo.val[1];
    z = hi.val[0];
    w = hi.val[1];
#endif
}

// comparison reductions; the mask overloads also export the lane mask

int allEqual(const Simd4f& v0, const Simd4f& v1)
{
    return allTrue(v0 == v1);
}

int allEqual(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask)
{
    return allTrue(outMask = v0 == v1);
}

int anyEqual(const Simd4f& v0, const Simd4f& v1)
{
    return anyTrue(v0 == v1);
}

int anyEqual(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask)
{
    return anyTrue(outMask = v0 == v1);
}

int allGreater(const Simd4f& v0, const Simd4f& v1)
{
    return allTrue(v0 > v1);
}

int allGreater(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask)
{
    return allTrue(outMask = v0 > v1);
}

int anyGreater(const Simd4f& v0, const Simd4f& v1)
{
    return anyTrue(v0 > v1);
}

int anyGreater(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask)
{
    return anyTrue(outMask = v0 > v1);
}

int allGreaterEqual(const Simd4f& v0, const Simd4f& v1)
{
    return allTrue(v0 >= v1);
}

int allGreaterEqual(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask)
{
    return allTrue(outMask = v0 >= v1);
}

int anyGreaterEqual(const Simd4f& v0, const Simd4f& v1)
{
    return anyTrue(v0 >= v1);
}

int anyGreaterEqual(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask)
{
    return anyTrue(outMask = v0 >= v1);
}

// true iff every lane of a (0 / ~0 per lane) mask is set
int allTrue(const Simd4f& v)
{
#if NVMATH_INLINE_ASSEMBLER
    int result;
    asm volatile("vmovq q0, %q1 \n\t"
                 "vand.u32 d0, d0, d1 \n\t"
                 "vpmin.u32 d0, d0, d0 \n\t"
                 "vcmp.f32 s0, #0 \n\t"
                 "fmrx %0, fpscr"
                 : "=r"(result)
                 : "w"(v.f4)
                 : "q0");
    return result >> 28 & 0x1;
#else
    // narrow 4x32 -> 8 bytes, then inspect the low 4 bytes as one u32
    uint16x4_t hi = vget_high_u16(vreinterpretq_u16_u32(v.u4));
    uint16x4_t lo = vmovn_u32(v.u4);
    uint16x8_t combined = vcombine_u16(lo, hi);
    uint32x2_t reduced = vreinterpret_u32_u8(vmovn_u16(combined));
    return vget_lane_u32(reduced, 0) == 0xffffffff;
#endif
}

// true iff at least one lane of a (0 / ~0 per lane) mask is set
int anyTrue(const Simd4f& v)
{
#if NVMATH_INLINE_ASSEMBLER
    int result;
    asm volatile("vmovq q0, %q1 \n\t"
                 "vorr.u32 d0, d0, d1 \n\t"
                 "vpmax.u32 d0, d0, d0 \n\t"
                 "vcmp.f32 s0, #0 \n\t"
                 "fmrx %0, fpscr"
                 : "=r"(result)
                 : "w"(v.f4)
                 : "q0");
    return result >> 28 & 0x1;
#else
    uint16x4_t hi = vget_high_u16(vreinterpretq_u16_u32(v.u4));
    uint16x4_t lo = vmovn_u32(v.u4);
    uint16x8_t combined = vcombine_u16(lo, hi);
    uint32x2_t reduced = vreinterpret_u32_u8(vmovn_u16(combined));
    return vget_lane_u32(reduced, 0) != 0x0;
#endif
}
diff --git 
a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/neon/Simd4i.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/neon/Simd4i.h
new file mode 100644
index 00000000..7a566256
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/neon/Simd4i.h
@@ -0,0 +1,276 @@
/*
 * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
 *
 * NVIDIA CORPORATION and its licensors retain all intellectual property
 * and proprietary rights in and to this software, related documentation
 * and any modifications thereto. Any use, reproduction, disclosure or
 * distribution of this software and related documentation without an express
 * license agreement from NVIDIA CORPORATION is strictly prohibited.
 */

// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.

#pragma once

// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
// factory implementation
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

// broadcast a single int to all four lanes
template <>
inline Simd4iFactory<const int&>::operator Simd4i() const
{
    return vdupq_n_s32(v);
}

// reinterpret a packed four-tuple as a vector
inline Simd4iFactory<detail::FourTuple>::operator Simd4i() const
{
    return reinterpret_cast<const Simd4i&>(v);
}

// broadcast a compile-time constant to all four lanes
template <int i>
inline Simd4iFactory<detail::IntType<i> >::operator Simd4i() const
{
    return vdupq_n_u32(i);
}

// load four ints from (possibly unaligned) memory
template <>
inline Simd4iFactory<const int*>::operator Simd4i() const
{
    return vld1q_s32(v);
}

// load four ints from aligned memory
template <>
inline Simd4iFactory<detail::AlignedPointer<int> >::operator Simd4i() const
{
    return vld1q_s32(v.ptr);
}

// load four ints from aligned memory at a byte offset
template <>
inline Simd4iFactory<detail::OffsetPointer<int> >::operator Simd4i() const
{
    return vld1q_s32(reinterpret_cast<const int*>(reinterpret_cast<const char*>(v.ptr) + v.offset));
}

// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
// expression template
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

// materialize ~v as (all-ones) bic v
template <>
inline ComplementExpr<Simd4i>::operator Simd4i() const
{
    return vbicq_u32(vdupq_n_u32(0xffffffff), v.u4);
}

// (~a) & b maps directly onto a single bic instruction
Simd4i operator&(const ComplementExpr<Simd4i>& complement, const Simd4i& rhs)
{
    return vbicq_u32(rhs.u4, complement.v.u4);
}

// a & (~b) likewise uses bic
Simd4i operator&(const Simd4i& lhs, const ComplementExpr<Simd4i>& complement)
{
    return vbicq_u32(lhs.u4, complement.v.u4);
}

// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
// operator implementations
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

// lane-wise equality mask (0 / ~0 per lane)
Simd4i simdi::operator==(const Simd4i& lhs, const Simd4i& rhs)
{
    return vceqq_u32(lhs.u4, rhs.u4);
}

// signed lane-wise less-than mask
Simd4i simdi::operator<(const Simd4i& lhs, const Simd4i& rhs)
{
    return vcltq_s32(lhs.i4, rhs.i4);
}

// signed lane-wise greater-than mask
Simd4i simdi::operator>(const Simd4i& lhs, const Simd4i& rhs)
{
    return vcgtq_s32(lhs.i4, rhs.i4);
}

// bitwise complement as a lazy expression (fused by the & overloads above)
ComplementExpr<Simd4i> operator~(const Simd4i& x)
{
    return ComplementExpr<Simd4i>(x);
}

Simd4i operator&(const Simd4i& lhs, const Simd4i& rhs)
{
    return vandq_u32(lhs.u4, rhs.u4);
}

Simd4i operator|(const Simd4i& lhs, const Simd4i& rhs)
{
    return vorrq_u32(lhs.u4, rhs.u4);
}

Simd4i operator^(const Simd4i& lhs, const Simd4i& rhs)
{
    return veorq_u32(lhs.u4, rhs.u4);
}

// logical left shift by a uniform count
Simd4i operator<<(const Simd4i& x, int count)
{
    return vshlq_u32(x.u4, vdupq_n_s32(count));
}

// logical right shift (negative count shifts right on NEON)
Simd4i operator>>(const Simd4i& x, int count)
{
    return vshlq_u32(x.u4, vdupq_n_s32(-count));
}

// per-lane left shift
Simd4i operator<<(const Simd4i& x, const Simd4i& count)
{
    return vshlq_u32(x.u4, count.i4);
}

// per-lane right shift via negated counts
Simd4i operator>>(const Simd4i& x, const Simd4i& count)
{
    return vshlq_u32(x.u4, vnegq_s32(count.i4));
}

Simd4i simdi::operator+(const Simd4i& lhs, const Simd4i& rhs)
{
    return vaddq_u32(lhs.u4, rhs.u4);
}

Simd4i simdi::operator-(const Simd4i& x)
{
    return vnegq_s32(x.i4);
}

Simd4i simdi::operator-(const Simd4i& lhs, const Simd4i& rhs)
{
    return vsubq_u32(lhs.u4, rhs.u4);
}

// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
// function implementations
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

// reinterpret float lanes as integer lanes (bit pattern preserved)
Simd4i simd4i(const Simd4f& v)
{
    return v.u4;
}

// view the four lanes as an int[4]
int (&simdi::array(Simd4i& v))[4]
{
    return (int(&)[4])v;
}

const int (&simdi::array(const Simd4i& v))[4]
{
    return (const int(&)[4])v;
}

// store four lanes to (possibly unaligned) memory
void store(int* ptr, const Simd4i& v)
{
    vst1q_s32(ptr, v.i4);
}

// store four lanes to 16-byte aligned memory
void storeAligned(int* ptr, const Simd4i& v)
{
    vst1q_s32(ptr, v.i4);
}

// store four lanes to 16-byte aligned memory at a byte offset
void storeAligned(int* ptr, unsigned int offset, const Simd4i& v)
{
    storeAligned(reinterpret_cast<int*>(reinterpret_cast<char*>(ptr) + offset), v);
}

// broadcast lane i into all four lanes
template <size_t i>
Simd4i splat(Simd4i const& v)
{
    return vdupq_n_s32(simdi::array(v)[i]);
}

// bitwise select: bits set in 'mask' take v0, clear bits take v1
Simd4i select(Simd4i const& mask, Simd4i const& v0, Simd4i const& v1)
{
    return vbslq_u32(mask.u4, v0.u4, v1.u4);
}

// comparison reductions; mask overloads also export the lane mask

int simdi::allEqual(const Simd4i& v0, const Simd4i& v1)
{
    return allTrue(simdi::operator==(v0, v1));
}

int simdi::allEqual(const Simd4i& v0, const Simd4i& v1, Simd4i& outMask)
{
    return allTrue(outMask = simdi::operator==(v0, v1));
}

int simdi::anyEqual(const Simd4i& v0, const Simd4i& v1)
{
    return anyTrue(simdi::operator==(v0, v1));
}

int simdi::anyEqual(const Simd4i& v0, const Simd4i& v1, Simd4i& outMask)
{
    return anyTrue(outMask = simdi::operator==(v0, v1));
}

int simdi::allGreater(const Simd4i& v0, const Simd4i& v1)
{
    return allTrue(simdi::operator>(v0, v1));
}

int simdi::allGreater(const Simd4i& v0, const Simd4i& v1, Simd4i& outMask)
{
    return allTrue(outMask = simdi::operator>(v0, v1));
}

int simdi::anyGreater(const Simd4i& v0, const Simd4i& v1)
{
    return anyTrue(simdi::operator>(v0, v1));
}

int simdi::anyGreater(const Simd4i& v0, const Simd4i& v1, Simd4i& outMask)
{
    return anyTrue(outMask = simdi::operator>(v0, v1));
}

// true iff every lane of a (0 / ~0 per lane) mask is set
int allTrue(const Simd4i& v)
{
#if NVMATH_INLINE_ASSEMBLER
    int result;
    asm volatile("vmovq q0, %q1 \n\t"
                 "vand.u32 d0, d0, d1 \n\t"
                 "vpmin.u32 d0, d0, d0 \n\t"
                 "vcmp.f32 s0, #0 \n\t"
                 "fmrx %0, fpscr"
                 : "=r"(result)
                 : "w"(v.u4)
                 : "q0");
    return result >> 28 & 0x1;
#else
    // narrow 4x32 -> 8 bytes, then inspect the low 4 bytes as one u32
    uint16x4_t hi = vget_high_u16(vreinterpretq_u16_u32(v.u4));
    uint16x4_t lo = vmovn_u32(v.u4);
    uint16x8_t combined = vcombine_u16(lo, hi);
    uint32x2_t reduced = vreinterpret_u32_u8(vmovn_u16(combined));
    return vget_lane_u32(reduced, 0) == 0xffffffff;
#endif
}

// true iff at least one lane of a (0 / ~0 per lane) mask is set
int anyTrue(const Simd4i& v)
{
#if NVMATH_INLINE_ASSEMBLER
    int result;
    asm volatile("vmovq q0, %q1 \n\t"
                 "vorr.u32 d0, d0, d1 \n\t"
                 "vpmax.u32 d0, d0, d0 \n\t"
                 "vcmp.f32 s0, #0 \n\t"
                 "fmrx %0, fpscr"
                 : "=r"(result)
                 : "w"(v.u4)
                 : "q0");
    return result >> 28 & 0x1;
#else
    uint16x4_t hi = vget_high_u16(vreinterpretq_u16_u32(v.u4));
    uint16x4_t lo = vmovn_u32(v.u4);
    uint16x8_t combined = vcombine_u16(lo, hi);
    uint32x2_t reduced = vreinterpret_u32_u8(vmovn_u16(combined));
    return vget_lane_u32(reduced, 0) != 0x0;
#endif
}
diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/neon/SimdTypes.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/neon/SimdTypes.h
new file mode 100644
index 00000000..542fac08
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/neon/SimdTypes.h
@@ -0,0 +1,51 @@
/*
 * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
 *
 * NVIDIA CORPORATION and its licensors retain all intellectual property
 * and proprietary rights in and to this software, related documentation
 * and any modifications thereto. Any use, reproduction, disclosure or
 * distribution of this software and related documentation without an express
 * license agreement from NVIDIA CORPORATION is strictly prohibited.
 */

// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.

#pragma once

#include <arm_neon.h>

// 128-bit NEON register viewed as four floats; the union also exposes the
// same bits as unsigned/signed 32-bit lanes so comparison masks and bit
// tricks need no explicit vreinterpret at every use site.
union Simd4f
{
    // intentionally uninitialized default constructor
    Simd4f()
    {
    }
    Simd4f(const float32x4_t& v) : f4(v)
    {
    }
#ifndef _M_ARM // all *32x4_t map to the same type
    Simd4f(const uint32x4_t& v) : u4(v)
    {
    }
#endif
    float32x4_t f4;
    uint32x4_t u4;
    int32x4_t i4;
};

// 128-bit NEON register viewed as four 32-bit integers (unsigned and
// signed views of the same bits)
union Simd4i
{
    // intentionally uninitialized default constructor
    Simd4i()
    {
    }
    Simd4i(const uint32x4_t& v) : u4(v)
    {
    }
#ifndef _M_ARM // all *32x4_t map to the same type
    Simd4i(const int32x4_t& v) : i4(v)
    {
    }
#endif
    uint32x4_t u4;
    int32x4_t i4;
};
diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/neon/SwCollisionHelpers.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/neon/SwCollisionHelpers.h
new file mode 100644
index 00000000..b67f96aa
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/neon/SwCollisionHelpers.h
@@ -0,0 +1,91 @@
/*
 * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
 *
 * NVIDIA CORPORATION and its licensors retain all intellectual property
 * and proprietary rights in and to this software, related documentation
 * and any modifications thereto. Any use, reproduction, disclosure or
 * distribution of this software and related documentation without an express
 * license agreement from NVIDIA CORPORATION is strictly prohibited.
 */

// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.

#pragma once

#ifdef _M_ARM
#include <arm_neon.h>
#endif

namespace nvidia
{
namespace cloth
{

// Returns the index (0..31) of the highest set bit of 'mask'.
// NOTE(review): the result is undefined for mask == 0 (__builtin_clz(0) is
// undefined behavior) — confirm all call sites pass a non-zero mask.
uint32_t findBitSet(uint32_t mask)
{
#ifdef _M_ARM
	__n64 t = { mask };
	return 31 - (vclz_u32(t)).n64_u32[0];
#else
	return 31 - __builtin_clz(mask);
#endif
}

// Fast float->int conversion: truncate toward zero, then subtract one on
// lanes whose sign bit is set.
// NOTE(review): for *exact* negative integers this yields value-1
// (e.g. -2.0f -> -3), which differs from a true floor; this matches the
// NEON Simd4f floor() in this module, so callers appear to rely on the
// same convention — confirm before changing.
Simd4i intFloor(const Simd4f& v)
{
	int32x4_t neg = vreinterpretq_s32_u32(vshrq_n_u32(v.u4, 31));
	return vsubq_s32(vcvtq_s32_f32(v.f4), neg);
}

// OR of all four lanes, replicated into every lane of the result.
Simd4i horizontalOr(Simd4i mask)
{
	using namespace simdi;
	uint32x2_t hi = vget_high_u32(mask.u4);
	uint32x2_t lo = vget_low_u32(mask.u4);
	uint32x2_t tmp = vorr_u32(lo, hi);
	uint32x2_t rev = vrev64_u32(tmp);
	uint32x2_t res = vorr_u32(tmp, rev);
	return vcombine_u32(res, res);
}

// Builds a byte-level permutation mask from four lane indices so that
// operator() below can gather lanes with table-lookup instructions.
// Out-of-range indices (index > sMask) force the permute bytes to 0xff,
// which makes the table lookup return zero for those lanes.
// NOTE(review): the arm64 path uses vtbl1q_u8/vtbl2q_u8, which are not
// standard ACLE intrinsic names (ACLE spells these vqtbl1_u8/vqtbl2_u8) —
// confirm against the target toolchain.
Gather<Simd4i>::Gather(const Simd4i& index)
{
#ifdef __arm64__
	using namespace simdi;
	PX_ALIGN(16, uint8x8x2_t) byteIndex = reinterpret_cast<const uint8x8x2_t&>(sPack);
	uint8x16_t lohiIndex = reinterpret_cast<const uint8x16_t&>(index);
	byteIndex.val[0] = vtbl1q_u8(lohiIndex, byteIndex.val[0]);
	byteIndex.val[1] = vtbl1q_u8(lohiIndex, byteIndex.val[1]);
	mPermute = vshlq_n_u32(reinterpret_cast<const uint32x4_t&>(byteIndex), 2);
	mPermute = mPermute | sOffset | vcgtq_u32(index.u4, sMask.u4);
#else
	using namespace simdi;
	PX_ALIGN(16, uint8x8x2_t) byteIndex = reinterpret_cast<const uint8x8x2_t&>(sPack);
	uint8x8x2_t lohiIndex = reinterpret_cast<const uint8x8x2_t&>(index);
	byteIndex.val[0] = vtbl2_u8(lohiIndex, byteIndex.val[0]);
	byteIndex.val[1] = vtbl2_u8(lohiIndex, byteIndex.val[1]);
	mPermute = vshlq_n_u32(reinterpret_cast<const uint32x4_t&>(byteIndex), 2);
	mPermute = mPermute | sOffset | vcgtq_u32(index.u4, sMask.u4);
#endif
}

// Gathers four 32-bit lanes from 'ptr' according to the permutation built
// in the constructor, using byte table lookups.
Simd4i Gather<Simd4i>::operator()(const Simd4i* ptr) const
{
#ifdef __arm64__
	PX_ALIGN(16, uint8x8x2_t) result = reinterpret_cast<const uint8x8x2_t&>(mPermute);
	const uint8x16x2_t* table = reinterpret_cast<const uint8x16x2_t*>(ptr);
	result.val[0] = vtbl2q_u8(*table, result.val[0]);
	result.val[1] = vtbl2q_u8(*table, result.val[1]);
	return reinterpret_cast<const Simd4i&>(result);
#else
	PX_ALIGN(16, uint8x8x2_t) result = reinterpret_cast<const uint8x8x2_t&>(mPermute);
	const uint8x8x4_t* table = reinterpret_cast<const uint8x8x4_t*>(ptr);
	result.val[0] = vtbl4_u8(*table, result.val[0]);
	result.val[1] = vtbl4_u8(*table, result.val[1]);
	return reinterpret_cast<const Simd4i&>(result);
#endif
}

} // namespace cloth
} // namespace nvidia
diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/scalar/Simd4f.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/scalar/Simd4f.h
new file mode 100644
index 00000000..d02d5066
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/scalar/Simd4f.h
@@ -0,0 +1,410 @@
/*
 * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
 *
 * NVIDIA CORPORATION and its licensors retain all intellectual property
 * and proprietary rights in and to this software, related documentation
 * and any modifications thereto. Any use, reproduction, disclosure or
 * distribution of this software and related documentation without an express
 * license agreement from NVIDIA CORPORATION is strictly prohibited.
 */

// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+ +#pragma once + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// factory implementation +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +template <> +inline Simd4fFactory<const float&>::operator Scalar4f() const +{ + return Scalar4f(v, v, v, v); +} + +inline Simd4fFactory<detail::FourTuple>::operator Scalar4f() const +{ + return reinterpret_cast<const Scalar4f&>(v); +} + +template <int i> +inline Simd4fFactory<detail::IntType<i> >::operator Scalar4f() const +{ + float s = i; + return Scalar4f(s, s, s, s); +} + +template <> +inline Simd4fFactory<detail::IntType<0x80000000u> >::operator Scalar4f() const +{ + int32_t i = 0x80000000u; + return Scalar4f(i, i, i, i); +} + +template <> +inline Simd4fFactory<detail::IntType<0xffffffff> >::operator Scalar4f() const +{ + int32_t i = 0xffffffff; + return Scalar4f(i, i, i, i); +} + +template <> +inline Simd4fFactory<const float*>::operator Scalar4f() const +{ + return Scalar4f(v[0], v[1], v[2], v[3]); +} + +template <> +inline Simd4fFactory<detail::AlignedPointer<float> >::operator Scalar4f() const +{ + return Scalar4f(v.ptr[0], v.ptr[1], v.ptr[2], v.ptr[3]); +} + +template <> +inline Simd4fFactory<detail::OffsetPointer<float> >::operator Scalar4f() const +{ + const float* ptr = reinterpret_cast<const float*>(reinterpret_cast<const char*>(v.ptr) + v.offset); + return Scalar4f(ptr[0], ptr[1], ptr[2], ptr[3]); +} + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// expression template +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +template <> +inline ComplementExpr<Scalar4f>::operator Scalar4f() const +{ + return Scalar4f(~v.u4[0], ~v.u4[1], ~v.u4[2], ~v.u4[3]); +} + +inline Scalar4f operator&(const ComplementExpr<Scalar4f>& complement, const Scalar4f& v) +{ + return Scalar4f(v.u4[0] & ~complement.v.u4[0], v.u4[1] & ~complement.v.u4[1], v.u4[2] & ~complement.v.u4[2], + v.u4[3] & ~complement.v.u4[3]); +} + +inline Scalar4f 
operator&(const Scalar4f& v, const ComplementExpr<Scalar4f>& complement) +{ + return Scalar4f(v.u4[0] & ~complement.v.u4[0], v.u4[1] & ~complement.v.u4[1], v.u4[2] & ~complement.v.u4[2], + v.u4[3] & ~complement.v.u4[3]); +} + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// operator implementations +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +inline Scalar4f operator==(const Scalar4f& v0, const Scalar4f& v1) +{ + return Scalar4f(v0.f4[0] == v1.f4[0], v0.f4[1] == v1.f4[1], v0.f4[2] == v1.f4[2], v0.f4[3] == v1.f4[3]); +} + +inline Scalar4f operator<(const Scalar4f& v0, const Scalar4f& v1) +{ + return Scalar4f(v0.f4[0] < v1.f4[0], v0.f4[1] < v1.f4[1], v0.f4[2] < v1.f4[2], v0.f4[3] < v1.f4[3]); +} + +inline Scalar4f operator<=(const Scalar4f& v0, const Scalar4f& v1) +{ + return Scalar4f(v0.f4[0] <= v1.f4[0], v0.f4[1] <= v1.f4[1], v0.f4[2] <= v1.f4[2], v0.f4[3] <= v1.f4[3]); +} + +inline Scalar4f operator>(const Scalar4f& v0, const Scalar4f& v1) +{ + return Scalar4f(v0.f4[0] > v1.f4[0], v0.f4[1] > v1.f4[1], v0.f4[2] > v1.f4[2], v0.f4[3] > v1.f4[3]); +} + +inline Scalar4f operator>=(const Scalar4f& v0, const Scalar4f& v1) +{ + return Scalar4f(v0.f4[0] >= v1.f4[0], v0.f4[1] >= v1.f4[1], v0.f4[2] >= v1.f4[2], v0.f4[3] >= v1.f4[3]); +} + +inline ComplementExpr<Scalar4f> operator~(const Scalar4f& v) +{ + return ComplementExpr<Scalar4f>(v); +} + +inline Scalar4f operator&(const Scalar4f& v0, const Scalar4f& v1) +{ + return Scalar4f(v0.u4[0] & v1.u4[0], v0.u4[1] & v1.u4[1], v0.u4[2] & v1.u4[2], v0.u4[3] & v1.u4[3]); +} + +inline Scalar4f operator|(const Scalar4f& v0, const Scalar4f& v1) +{ + return Scalar4f(v0.u4[0] | v1.u4[0], v0.u4[1] | v1.u4[1], v0.u4[2] | v1.u4[2], v0.u4[3] | v1.u4[3]); +} + +inline Scalar4f operator^(const Scalar4f& v0, const Scalar4f& v1) +{ + return Scalar4f(v0.u4[0] ^ v1.u4[0], v0.u4[1] ^ v1.u4[1], v0.u4[2] ^ v1.u4[2], v0.u4[3] ^ v1.u4[3]); +} + +inline Scalar4f operator<<(const Scalar4f& v, int 
shift) +{ + return Scalar4f(v.u4[0] << shift, v.u4[1] << shift, v.u4[2] << shift, v.u4[3] << shift); +} + +inline Scalar4f operator>>(const Scalar4f& v, int shift) +{ + return Scalar4f(v.u4[0] >> shift, v.u4[1] >> shift, v.u4[2] >> shift, v.u4[3] >> shift); +} + +inline Scalar4f operator+(const Scalar4f& v) +{ + return v; +} + +inline Scalar4f operator+(const Scalar4f& v0, const Scalar4f& v1) +{ + return Scalar4f(v0.f4[0] + v1.f4[0], v0.f4[1] + v1.f4[1], v0.f4[2] + v1.f4[2], v0.f4[3] + v1.f4[3]); +} + +inline Scalar4f operator-(const Scalar4f& v) +{ + return Scalar4f(-v.f4[0], -v.f4[1], -v.f4[2], -v.f4[3]); +} + +inline Scalar4f operator-(const Scalar4f& v0, const Scalar4f& v1) +{ + return Scalar4f(v0.f4[0] - v1.f4[0], v0.f4[1] - v1.f4[1], v0.f4[2] - v1.f4[2], v0.f4[3] - v1.f4[3]); +} + +inline Scalar4f operator*(const Scalar4f& v0, const Scalar4f& v1) +{ + return Scalar4f(v0.f4[0] * v1.f4[0], v0.f4[1] * v1.f4[1], v0.f4[2] * v1.f4[2], v0.f4[3] * v1.f4[3]); +} + +inline Scalar4f operator/(const Scalar4f& v0, const Scalar4f& v1) +{ + return Scalar4f(v0.f4[0] / v1.f4[0], v0.f4[1] / v1.f4[1], v0.f4[2] / v1.f4[2], v0.f4[3] / v1.f4[3]); +} + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// function implementations +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +inline Scalar4f simd4f(const Scalar4i& v) +{ + return v; +} + +inline float (&array(Scalar4f& v))[4] +{ + return v.f4; +} + +inline const float (&array(const Scalar4f& v))[4] +{ + return v.f4; +} + +inline void store(float* ptr, const Scalar4f& v) +{ + ptr[0] = v.f4[0]; + ptr[1] = v.f4[1]; + ptr[2] = v.f4[2]; + ptr[3] = v.f4[3]; +} + +inline void storeAligned(float* ptr, const Scalar4f& v) +{ + store(ptr, v); +} + +inline void storeAligned(float* ptr, unsigned int offset, const Scalar4f& v) +{ + storeAligned(reinterpret_cast<float*>(reinterpret_cast<char*>(ptr) + offset), v); +} + +template <size_t i> +inline Scalar4f splat(const Scalar4f& v) +{ + return 
Scalar4f(v.f4[i], v.f4[i], v.f4[i], v.f4[i]); +} + +inline Scalar4f select(const Scalar4f& mask, const Scalar4f& v0, const Scalar4f& v1) +{ + return ((v0 ^ v1) & mask) ^ v1; +} + +inline Scalar4f abs(const Scalar4f& v) +{ + return Scalar4f(::fabsf(v.f4[0]), ::fabsf(v.f4[1]), ::fabsf(v.f4[2]), ::fabsf(v.f4[3])); +} + +inline Scalar4f floor(const Scalar4f& v) +{ + return Scalar4f(::floorf(v.f4[0]), ::floorf(v.f4[1]), ::floorf(v.f4[2]), ::floorf(v.f4[3])); +} + +inline Scalar4f max(const Scalar4f& v0, const Scalar4f& v1) +{ + return Scalar4f(std::max(v0.f4[0], v1.f4[0]), std::max(v0.f4[1], v1.f4[1]), std::max(v0.f4[2], v1.f4[2]), + std::max(v0.f4[3], v1.f4[3])); +} + +inline Scalar4f min(const Scalar4f& v0, const Scalar4f& v1) +{ + return Scalar4f(std::min(v0.f4[0], v1.f4[0]), std::min(v0.f4[1], v1.f4[1]), std::min(v0.f4[2], v1.f4[2]), + std::min(v0.f4[3], v1.f4[3])); +} + +inline Scalar4f recip(const Scalar4f& v) +{ + return Scalar4f(1 / v.f4[0], 1 / v.f4[1], 1 / v.f4[2], 1 / v.f4[3]); +} + +template <int n> +inline Scalar4f recipT(const Scalar4f& v) +{ + return recip(v); +} + +inline Scalar4f sqrt(const Scalar4f& v) +{ + return Scalar4f(::sqrtf(v.f4[0]), ::sqrtf(v.f4[1]), ::sqrtf(v.f4[2]), ::sqrtf(v.f4[3])); +} + +inline Scalar4f rsqrt(const Scalar4f& v) +{ + return recip(sqrt(v)); +} + +template <int n> +inline Scalar4f rsqrtT(const Scalar4f& v) +{ + return rsqrt(v); +} + +inline Scalar4f exp2(const Scalar4f& v) +{ + float scale = 0.69314718055994531f; // ::logf(2.0f); + return Scalar4f(::expf(v.f4[0] * scale), ::expf(v.f4[1] * scale), ::expf(v.f4[2] * scale), ::expf(v.f4[3] * scale)); +} + +namespace simdf +{ +// PSP2 is confused resolving about exp2, forwarding works +inline Scalar4f exp2(const Scalar4f& v) +{ + return ::exp2(v); +} +} + +inline Scalar4f log2(const Scalar4f& v) +{ + float scale = 1.44269504088896341f; // 1/ln(2) + return Scalar4f(::logf(v.f4[0]) * scale, ::logf(v.f4[1]) * scale, ::logf(v.f4[2]) * scale, ::logf(v.f4[3]) * scale); +} + +inline 
Scalar4f dot3(const Scalar4f& v0, const Scalar4f& v1) +{ + return simd4f(v0.f4[0] * v1.f4[0] + v0.f4[1] * v1.f4[1] + v0.f4[2] * v1.f4[2]); +} + +inline Scalar4f cross3(const Scalar4f& v0, const Scalar4f& v1) +{ + return simd4f(v0.f4[1] * v1.f4[2] - v0.f4[2] * v1.f4[1], v0.f4[2] * v1.f4[0] - v0.f4[0] * v1.f4[2], + v0.f4[0] * v1.f4[1] - v0.f4[1] * v1.f4[0], 0.0f); +} + +inline void transpose(Scalar4f& x, Scalar4f& y, Scalar4f& z, Scalar4f& w) +{ + float x1 = x.f4[1], x2 = x.f4[2], x3 = x.f4[3]; + float y2 = y.f4[2], y3 = y.f4[3], z3 = z.f4[3]; + + x.f4[1] = y.f4[0]; + x.f4[2] = z.f4[0]; + x.f4[3] = w.f4[0]; + y.f4[0] = x1; + y.f4[2] = z.f4[1]; + y.f4[3] = w.f4[1]; + z.f4[0] = x2; + z.f4[1] = y2; + z.f4[3] = w.f4[2]; + w.f4[0] = x3; + w.f4[1] = y3; + w.f4[2] = z3; +} + +inline int allEqual(const Scalar4f& v0, const Scalar4f& v1) +{ + return v0.f4[0] == v1.f4[0] && v0.f4[1] == v1.f4[1] && v0.f4[2] == v1.f4[2] && v0.f4[3] == v1.f4[3]; +} + +inline int allEqual(const Scalar4f& v0, const Scalar4f& v1, Scalar4f& outMask) +{ + bool b0 = v0.f4[0] == v1.f4[0], b1 = v0.f4[1] == v1.f4[1], b2 = v0.f4[2] == v1.f4[2], b3 = v0.f4[3] == v1.f4[3]; + outMask = Scalar4f(b0, b1, b2, b3); + return b0 && b1 && b2 && b3; +} + +inline int anyEqual(const Scalar4f& v0, const Scalar4f& v1) +{ + return v0.f4[0] == v1.f4[0] || v0.f4[1] == v1.f4[1] || v0.f4[2] == v1.f4[2] || v0.f4[3] == v1.f4[3]; +} + +inline int anyEqual(const Scalar4f& v0, const Scalar4f& v1, Scalar4f& outMask) +{ + bool b0 = v0.f4[0] == v1.f4[0], b1 = v0.f4[1] == v1.f4[1], b2 = v0.f4[2] == v1.f4[2], b3 = v0.f4[3] == v1.f4[3]; + outMask = Scalar4f(b0, b1, b2, b3); + return b0 || b1 || b2 || b3; +} + +inline int allGreater(const Scalar4f& v0, const Scalar4f& v1) +{ + return v0.f4[0] > v1.f4[0] && v0.f4[1] > v1.f4[1] && v0.f4[2] > v1.f4[2] && v0.f4[3] > v1.f4[3]; +} + +inline int allGreater(const Scalar4f& v0, const Scalar4f& v1, Scalar4f& outMask) +{ + bool b0 = v0.f4[0] > v1.f4[0], b1 = v0.f4[1] > v1.f4[1], b2 = v0.f4[2] > 
v1.f4[2], b3 = v0.f4[3] > v1.f4[3]; + outMask = Scalar4f(b0, b1, b2, b3); + return b0 && b1 && b2 && b3; +} + +inline int anyGreater(const Scalar4f& v0, const Scalar4f& v1) +{ + return v0.f4[0] > v1.f4[0] || v0.f4[1] > v1.f4[1] || v0.f4[2] > v1.f4[2] || v0.f4[3] > v1.f4[3]; +} + +inline int anyGreater(const Scalar4f& v0, const Scalar4f& v1, Scalar4f& outMask) +{ + bool b0 = v0.f4[0] > v1.f4[0], b1 = v0.f4[1] > v1.f4[1], b2 = v0.f4[2] > v1.f4[2], b3 = v0.f4[3] > v1.f4[3]; + outMask = Scalar4f(b0, b1, b2, b3); + return b0 || b1 || b2 || b3; +} + +inline int allGreaterEqual(const Scalar4f& v0, const Scalar4f& v1) +{ + return v0.f4[0] >= v1.f4[0] && v0.f4[1] >= v1.f4[1] && v0.f4[2] >= v1.f4[2] && v0.f4[3] >= v1.f4[3]; +} + +inline int allGreaterEqual(const Scalar4f& v0, const Scalar4f& v1, Scalar4f& outMask) +{ + bool b0 = v0.f4[0] >= v1.f4[0], b1 = v0.f4[1] >= v1.f4[1], b2 = v0.f4[2] >= v1.f4[2], b3 = v0.f4[3] >= v1.f4[3]; + outMask = Scalar4f(b0, b1, b2, b3); + return b0 && b1 && b2 && b3; +} + +inline int anyGreaterEqual(const Scalar4f& v0, const Scalar4f& v1) +{ + return v0.f4[0] >= v1.f4[0] || v0.f4[1] >= v1.f4[1] || v0.f4[2] >= v1.f4[2] || v0.f4[3] >= v1.f4[3]; +} + +inline int anyGreaterEqual(const Scalar4f& v0, const Scalar4f& v1, Scalar4f& outMask) +{ + bool b0 = v0.f4[0] >= v1.f4[0], b1 = v0.f4[1] >= v1.f4[1], b2 = v0.f4[2] >= v1.f4[2], b3 = v0.f4[3] >= v1.f4[3]; + outMask = Scalar4f(b0, b1, b2, b3); + return b0 || b1 || b2 || b3; +} + +inline int allTrue(const Scalar4f& v) +{ + return v.u4[0] & v.u4[1] & v.u4[2] & v.u4[3]; +} + +inline int anyTrue(const Scalar4f& v) +{ + return v.u4[0] | v.u4[1] | v.u4[2] | v.u4[3]; +} diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/scalar/Simd4i.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/scalar/Simd4i.h new file mode 100644 index 00000000..80ac2abd --- /dev/null +++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/scalar/Simd4i.h @@ -0,0 +1,188 @@ +/* + * Copyright (c) 2008-2015, NVIDIA 
CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. + */ + +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#pragma once + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// factory implementation +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +template <> +inline Simd4iFactory<const int&>::operator Scalar4i() const +{ + return Scalar4i(v, v, v, v); +} + +inline Simd4iFactory<detail::FourTuple>::operator Scalar4i() const +{ + return reinterpret_cast<const Scalar4i&>(v); +} + +template <int i> +inline Simd4iFactory<detail::IntType<i> >::operator Scalar4i() const +{ + return Scalar4i(i, i, i, i); +} + +template <> +inline Simd4iFactory<const int*>::operator Scalar4i() const +{ + return Scalar4i(v[0], v[1], v[2], v[3]); +} + +template <> +inline Simd4iFactory<detail::AlignedPointer<int> >::operator Scalar4i() const +{ + return Scalar4i(v.ptr[0], v.ptr[1], v.ptr[2], v.ptr[3]); +} + +template <> +inline Simd4iFactory<detail::OffsetPointer<int> >::operator Scalar4i() const +{ + const int* ptr = reinterpret_cast<const int*>(reinterpret_cast<const char*>(v.ptr) + v.offset); + return Scalar4i(ptr[0], ptr[1], ptr[2], ptr[3]); +} + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// operator implementations +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +namespace simdi +{ + +inline Scalar4i operator==(const Scalar4i& v0, const Scalar4i& v1) +{ + return Scalar4i(v0.i4[0] == v1.i4[0], v0.i4[1] == v1.i4[1], v0.i4[2] == v1.i4[2], v0.i4[3] == 
v1.i4[3]); +} + +inline Scalar4i operator<(const Scalar4i& v0, const Scalar4i& v1) +{ + return Scalar4i(v0.i4[0] < v1.i4[0], v0.i4[1] < v1.i4[1], v0.i4[2] < v1.i4[2], v0.i4[3] < v1.i4[3]); +} + +inline Scalar4i operator>(const Scalar4i& v0, const Scalar4i& v1) +{ + return Scalar4i(v0.i4[0] > v1.i4[0], v0.i4[1] > v1.i4[1], v0.i4[2] > v1.i4[2], v0.i4[3] > v1.i4[3]); +} + +inline Scalar4i operator+(const Scalar4i& v0, const Scalar4i& v1) +{ + return Scalar4i(v0.i4[0] + v1.i4[0], v0.i4[1] + v1.i4[1], v0.i4[2] + v1.i4[2], v0.i4[3] + v1.i4[3]); +} + +inline Scalar4i operator-(const Scalar4i& v) +{ + return Scalar4i(-v.i4[0], -v.i4[1], -v.i4[2], -v.i4[3]); +} + +inline Scalar4i operator-(const Scalar4i& v0, const Scalar4i& v1) +{ + return Scalar4i(v0.i4[0] - v1.i4[0], v0.i4[1] - v1.i4[1], v0.i4[2] - v1.i4[2], v0.i4[3] - v1.i4[3]); +} + +} // namespace simd + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// function implementations +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +inline Scalar4i simd4i(const Scalar4f& v) +{ + return v; +} + +namespace simdi +{ + +inline int (&array(Scalar4i& v))[4] +{ + return v.i4; +} + +inline const int (&array(const Scalar4i& v))[4] +{ + return v.i4; +} + +} // namespace simdi + +inline void store(int* ptr, const Scalar4i& v) +{ + ptr[0] = v.i4[0]; + ptr[1] = v.i4[1]; + ptr[2] = v.i4[2]; + ptr[3] = v.i4[3]; +} + +inline void storeAligned(int* ptr, const Scalar4i& v) +{ + store(ptr, v); +} + +inline void storeAligned(int* ptr, unsigned int offset, const Scalar4i& v) +{ + store(reinterpret_cast<int*>(reinterpret_cast<char*>(ptr) + offset), v); +} + +namespace simdi +{ + +inline int allEqual(const Scalar4i& v0, const Scalar4i& v1) +{ + return v0.i4[0] == v1.i4[0] && v0.i4[1] == v1.i4[1] && v0.i4[2] == v1.i4[2] && v0.i4[3] == v1.i4[3]; +} + +inline int allEqual(const Scalar4i& v0, const Scalar4i& v1, Scalar4i& outMask) +{ + bool b0 = v0.i4[0] == v1.i4[0], b1 = v0.i4[1] == v1.i4[1], b2 = 
v0.i4[2] == v1.i4[2], b3 = v0.i4[3] == v1.i4[3]; + outMask = Scalar4f(b0, b1, b2, b3); + return b0 && b1 && b2 && b3; +} + +inline int anyEqual(const Scalar4i& v0, const Scalar4i& v1) +{ + return v0.i4[0] == v1.i4[0] || v0.i4[1] == v1.i4[1] || v0.i4[2] == v1.i4[2] || v0.i4[3] == v1.i4[3]; +} + +inline int anyEqual(const Scalar4i& v0, const Scalar4i& v1, Scalar4i& outMask) +{ + bool b0 = v0.i4[0] == v1.i4[0], b1 = v0.i4[1] == v1.i4[1], b2 = v0.i4[2] == v1.i4[2], b3 = v0.i4[3] == v1.i4[3]; + outMask = Scalar4f(b0, b1, b2, b3); + return b0 || b1 || b2 || b3; +} + +inline int allGreater(const Scalar4i& v0, const Scalar4i& v1) +{ + return v0.i4[0] > v1.i4[0] && v0.i4[1] > v1.i4[1] && v0.i4[2] > v1.i4[2] && v0.i4[3] > v1.i4[3]; +} + +inline int allGreater(const Scalar4i& v0, const Scalar4i& v1, Scalar4i& outMask) +{ + bool b0 = v0.i4[0] > v1.i4[0], b1 = v0.i4[1] > v1.i4[1], b2 = v0.i4[2] > v1.i4[2], b3 = v0.i4[3] > v1.i4[3]; + outMask = Scalar4f(b0, b1, b2, b3); + return b0 && b1 && b2 && b3; +} + +inline int anyGreater(const Scalar4i& v0, const Scalar4i& v1) +{ + return v0.i4[0] > v1.i4[0] || v0.i4[1] > v1.i4[1] || v0.i4[2] > v1.i4[2] || v0.i4[3] > v1.i4[3]; +} + +inline int anyGreater(const Scalar4i& v0, const Scalar4i& v1, Scalar4i& outMask) +{ + bool b0 = v0.i4[0] > v1.i4[0], b1 = v0.i4[1] > v1.i4[1], b2 = v0.i4[2] > v1.i4[2], b3 = v0.i4[3] > v1.i4[3]; + outMask = Scalar4f(b0, b1, b2, b3); + return b0 || b1 || b2 || b3; +} + +} // namespace simd diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/scalar/SimdTypes.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/scalar/SimdTypes.h new file mode 100644 index 00000000..a287766c --- /dev/null +++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/scalar/SimdTypes.h @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved. 
+ * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. + */ + +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#pragma once + +#ifdef PX_WIIU +#pragma ghs nowarning 193 // warning #193-D: zero used for undefined preprocessing identifier +#endif + +#include <algorithm> + +#ifdef PX_WIIU +#pragma ghs endnowarning +#endif + +union Scalar4f +{ + Scalar4f() + { + } + + Scalar4f(float x, float y, float z, float w) + { + f4[0] = x; + f4[1] = y; + f4[2] = z; + f4[3] = w; + } + + Scalar4f(int32_t x, int32_t y, int32_t z, int32_t w) + { + i4[0] = x; + i4[1] = y; + i4[2] = z; + i4[3] = w; + } + + Scalar4f(uint32_t x, uint32_t y, uint32_t z, uint32_t w) + { + u4[0] = x; + u4[1] = y; + u4[2] = z; + u4[3] = w; + } + + Scalar4f(bool x, bool y, bool z, bool w) + { + u4[0] = ~(uint32_t(x) - 1); + u4[1] = ~(uint32_t(y) - 1); + u4[2] = ~(uint32_t(z) - 1); + u4[3] = ~(uint32_t(w) - 1); + } + + Scalar4f(const Scalar4f& other) + { + u4[0] = other.u4[0]; + u4[1] = other.u4[1]; + u4[2] = other.u4[2]; + u4[3] = other.u4[3]; + } + + Scalar4f& operator=(const Scalar4f& other) + { + u4[0] = other.u4[0]; + u4[1] = other.u4[1]; + u4[2] = other.u4[2]; + u4[3] = other.u4[3]; + return *this; + } + + float f4[4]; + int32_t i4[4]; + uint32_t u4[4]; +}; + +typedef Scalar4f Scalar4i; diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/scalar/SwCollisionHelpers.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/scalar/SwCollisionHelpers.h new file mode 100644 index 00000000..33b35f72 --- /dev/null +++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/scalar/SwCollisionHelpers.h @@ 
-0,0 +1,76 @@ +/* + * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. + */ + +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#pragma once + +namespace nvidia +{ +namespace cloth +{ + +#if !NVMATH_SIMD +uint32_t findBitSet(uint32_t mask) +{ + uint32_t result = 0; + while(mask >>= 1) + ++result; + return result; +} +#endif + +inline Scalar4i intFloor(const Scalar4f& v) +{ + return Scalar4i(int(floor(v.f4[0])), int(floor(v.f4[1])), int(floor(v.f4[2])), int(floor(v.f4[3]))); +} + +inline Scalar4i horizontalOr(Scalar4i mask) +{ + return simd4i(mask.i4[0] | mask.i4[1] | mask.i4[2] | mask.i4[3]); +} + +template <> +struct Gather<Scalar4i> +{ + inline Gather(const Scalar4i& index); + inline Scalar4i operator()(const Scalar4i*) const; + + Scalar4i mIndex; + Scalar4i mOutOfRange; +}; + +Gather<Scalar4i>::Gather(const Scalar4i& index) +{ + uint32_t mask = physx::cloth::SwCollision<Scalar4i>::sGridSize - 1; + + mIndex.u4[0] = index.u4[0] & mask; + mIndex.u4[1] = index.u4[1] & mask; + mIndex.u4[2] = index.u4[2] & mask; + mIndex.u4[3] = index.u4[3] & mask; + + mOutOfRange.u4[0] = index.u4[0] & ~mask ? 0 : -1; + mOutOfRange.u4[1] = index.u4[1] & ~mask ? 0 : -1; + mOutOfRange.u4[2] = index.u4[2] & ~mask ? 0 : -1; + mOutOfRange.u4[3] = index.u4[3] & ~mask ? 
0 : -1; +} + +Scalar4i Gather<Scalar4i>::operator()(const Scalar4i* ptr) const +{ + const int32_t* base = ptr->i4; + const int32_t* index = mIndex.i4; + const int32_t* mask = mOutOfRange.i4; + return Scalar4i(base[index[0]] & mask[0], base[index[1]] & mask[1], base[index[2]] & mask[2], + base[index[3]] & mask[3]); +} + +} // namespace cloth +} // namespace physx diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/sse2/Simd4f.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/sse2/Simd4f.h new file mode 100644 index 00000000..3f04750f --- /dev/null +++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/sse2/Simd4f.h @@ -0,0 +1,411 @@ +/* + * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. + */ + +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 
+ +#pragma once + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// factory implementation +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +template <> +inline Simd4fFactory<const float&>::operator Simd4f() const +{ + return _mm_set1_ps(v); +} + +inline Simd4fFactory<detail::FourTuple>::operator Simd4f() const +{ + return reinterpret_cast<const Simd4f&>(v); +} + +template <> +inline Simd4fFactory<detail::IntType<0> >::operator Simd4f() const +{ + return _mm_setzero_ps(); +} + +template <> +inline Simd4fFactory<detail::IntType<1> >::operator Simd4f() const +{ + return _mm_set1_ps(1.0f); +} + +template <> +inline Simd4fFactory<detail::IntType<int(0x80000000)> >::operator Simd4f() const +{ + return _mm_castsi128_ps(_mm_set1_epi32(0x80000000)); +} + +template <> +inline Simd4fFactory<detail::IntType<int(0xffffffff)> >::operator Simd4f() const +{ + return _mm_castsi128_ps(_mm_set1_epi32(-1)); +} + +template <> +inline Simd4fFactory<const float*>::operator Simd4f() const +{ + return _mm_loadu_ps(v); +} + +template <> +inline Simd4fFactory<detail::AlignedPointer<float> >::operator Simd4f() const +{ + return _mm_load_ps(v.ptr); +} + +template <> +inline Simd4fFactory<detail::OffsetPointer<float> >::operator Simd4f() const +{ + return _mm_load_ps(reinterpret_cast<const float*>(reinterpret_cast<const char*>(v.ptr) + v.offset)); +} + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// expression template +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +template <> +inline ComplementExpr<Simd4f>::operator Simd4f() const +{ + return _mm_andnot_ps(v, _mm_castsi128_ps(_mm_set1_epi32(-1))); +} + +Simd4f operator&(const ComplementExpr<Simd4f>& complement, const Simd4f& v) +{ + return _mm_andnot_ps(complement.v, v); +} + +Simd4f operator&(const Simd4f& v, const ComplementExpr<Simd4f>& complement) +{ + return _mm_andnot_ps(complement.v, v); +} + +// - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - +// operator implementations +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +Simd4f operator==(const Simd4f& v0, const Simd4f& v1) +{ + return _mm_cmpeq_ps(v0, v1); +} + +Simd4f operator<(const Simd4f& v0, const Simd4f& v1) +{ + return _mm_cmplt_ps(v0, v1); +} + +Simd4f operator<=(const Simd4f& v0, const Simd4f& v1) +{ + return _mm_cmple_ps(v0, v1); +} + +Simd4f operator>(const Simd4f& v0, const Simd4f& v1) +{ + return _mm_cmpgt_ps(v0, v1); +} + +Simd4f operator>=(const Simd4f& v0, const Simd4f& v1) +{ + return _mm_cmpge_ps(v0, v1); +} + +ComplementExpr<Simd4f> operator~(const Simd4f& v) +{ + return ComplementExpr<Simd4f>(v); +} + +Simd4f operator&(const Simd4f& v0, const Simd4f& v1) +{ + return _mm_and_ps(v0, v1); +} + +Simd4f operator|(const Simd4f& v0, const Simd4f& v1) +{ + return _mm_or_ps(v0, v1); +} + +Simd4f operator^(const Simd4f& v0, const Simd4f& v1) +{ + return _mm_xor_ps(v0, v1); +} + +Simd4f operator<<(const Simd4f& v, int shift) +{ + return _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(v), shift)); +} + +Simd4f operator>>(const Simd4f& v, int shift) +{ + return _mm_castsi128_ps(_mm_srli_epi32(_mm_castps_si128(v), shift)); +} + +Simd4f operator+(const Simd4f& v) +{ + return v; +} + +Simd4f operator+(const Simd4f& v0, const Simd4f& v1) +{ + return _mm_add_ps(v0, v1); +} + +Simd4f operator-(const Simd4f& v) +{ + return _mm_sub_ps(_mm_setzero_ps(), v); +} + +Simd4f operator-(const Simd4f& v0, const Simd4f& v1) +{ + return _mm_sub_ps(v0, v1); +} + +Simd4f operator*(const Simd4f& v0, const Simd4f& v1) +{ + return _mm_mul_ps(v0, v1); +} + +Simd4f operator/(const Simd4f& v0, const Simd4f& v1) +{ + return _mm_div_ps(v0, v1); +} + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// function implementations +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +Simd4f simd4f(const Simd4i& v) +{ + return _mm_castsi128_ps(v); +} + +float (&array(Simd4f& v))[4] +{ + return 
reinterpret_cast<float(&)[4]>(v); +} + +const float (&array(const Simd4f& v))[4] +{ + return reinterpret_cast<const float(&)[4]>(v); +} + +void store(float* ptr, Simd4f const& v) +{ + _mm_storeu_ps(ptr, v); +} + +void storeAligned(float* ptr, Simd4f const& v) +{ + _mm_store_ps(ptr, v); +} + +void storeAligned(float* ptr, unsigned int offset, Simd4f const& v) +{ + _mm_store_ps(reinterpret_cast<float*>(reinterpret_cast<char*>(ptr) + offset), v); +} + +template <size_t i> +Simd4f splat(Simd4f const& v) +{ + return _mm_shuffle_ps(v, v, _MM_SHUFFLE(i, i, i, i)); +} + +Simd4f select(Simd4f const& mask, Simd4f const& v0, Simd4f const& v1) +{ + return _mm_xor_ps(v1, _mm_and_ps(mask, _mm_xor_ps(v1, v0))); +} + +Simd4f abs(const Simd4f& v) +{ + return _mm_andnot_ps(_mm_castsi128_ps(_mm_set1_epi32(0x80000000)), v); +} + +Simd4f floor(const Simd4f& v) +{ + // SSE 4.1: return _mm_floor_ps(v); + Simd4i i = _mm_cvttps_epi32(v); + return _mm_cvtepi32_ps(_mm_sub_epi32(i, _mm_srli_epi32(i, 31))); +} + +Simd4f max(const Simd4f& v0, const Simd4f& v1) +{ + return _mm_max_ps(v0, v1); +} + +Simd4f min(const Simd4f& v0, const Simd4f& v1) +{ + return _mm_min_ps(v0, v1); +} + +Simd4f recip(const Simd4f& v) +{ + return _mm_rcp_ps(v); +} + +template <int n> +Simd4f recipT(const Simd4f& v) +{ + Simd4f two = simd4f(2.0f); + Simd4f recipV = recip(v); + for(int i = 0; i < n; ++i) + recipV = recipV * (two - v * recipV); + return recipV; +} + +Simd4f sqrt(const Simd4f& v) +{ + return _mm_sqrt_ps(v); +} + +Simd4f rsqrt(const Simd4f& v) +{ + return _mm_rsqrt_ps(v); +} + +template <int n> +Simd4f rsqrtT(const Simd4f& v) +{ + Simd4f halfV = v * simd4f(0.5f); + Simd4f threeHalf = simd4f(1.5f); + Simd4f rsqrtV = rsqrt(v); + for(int i = 0; i < n; ++i) + rsqrtV = rsqrtV * (threeHalf - halfV * rsqrtV * rsqrtV); + return rsqrtV; +} + +Simd4f exp2(const Simd4f& v) +{ + // http://www.netlib.org/cephes/ + + Simd4f limit = simd4f(127.4999f); + Simd4f x = min(max(-limit, v), limit); + + // separate into integer 
and fractional part + + Simd4f fx = x + simd4f(0.5f); + Simd4i ix = _mm_sub_epi32(_mm_cvttps_epi32(fx), _mm_srli_epi32(_mm_castps_si128(fx), 31)); + fx = x - Simd4f(_mm_cvtepi32_ps(ix)); + + // exp2(fx) ~ 1 + 2*P(fx) / (Q(fx) - P(fx)) + + Simd4f fx2 = fx * fx; + + Simd4f px = fx * (simd4f(1.51390680115615096133e+3f) + + fx2 * (simd4f(2.02020656693165307700e+1f) + fx2 * simd4f(2.30933477057345225087e-2f))); + Simd4f qx = simd4f(4.36821166879210612817e+3f) + fx2 * (simd4f(2.33184211722314911771e+2f) + fx2); + + Simd4f exp2fx = px * recip(qx - px); + exp2fx = simd4f(_1) + exp2fx + exp2fx; + + // exp2(ix) + + Simd4f exp2ix = _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ix, _mm_set1_epi32(0x7f)), 23)); + + return exp2fx * exp2ix; +} + +Simd4f log2(const Simd4f& v) +{ + // todo: fast approximate implementation like exp2 + Simd4f scale = simd4f(1.44269504088896341f); // 1/ln(2) + const float* ptr = array(v); + return simd4f(::logf(ptr[0]), ::logf(ptr[1]), ::logf(ptr[2]), ::logf(ptr[3])) * scale; +} + +Simd4f dot3(const Simd4f& v0, const Simd4f& v1) +{ + Simd4f tmp = v0 * v1; + return splat<0>(tmp) + splat<1>(tmp) + splat<2>(tmp); +} + +Simd4f cross3(const Simd4f& v0, const Simd4f& v1) +{ + Simd4f t0 = _mm_shuffle_ps(v0, v0, 0xc9); // w z y x -> w x z y + Simd4f t1 = _mm_shuffle_ps(v1, v1, 0xc9); + Simd4f tmp = v0 * t1 - t0 * v1; + return _mm_shuffle_ps(tmp, tmp, 0xc9); +} + +void transpose(Simd4f& x, Simd4f& y, Simd4f& z, Simd4f& w) +{ + _MM_TRANSPOSE4_PS(x, y, z, w); +} + +int allEqual(const Simd4f& v0, const Simd4f& v1) +{ + return allTrue(v0 == v1); +} + +int allEqual(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask) +{ + return allTrue(outMask = v0 == v1); +} + +int anyEqual(const Simd4f& v0, const Simd4f& v1) +{ + return anyTrue(v0 == v1); +} + +int anyEqual(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask) +{ + return anyTrue(outMask = v0 == v1); +} + +int allGreater(const Simd4f& v0, const Simd4f& v1) +{ + return allTrue(v0 > v1); +} + +int allGreater(const 
Simd4f& v0, const Simd4f& v1, Simd4f& outMask) +{ + return allTrue(outMask = v0 > v1); +} + +int anyGreater(const Simd4f& v0, const Simd4f& v1) +{ + return anyTrue(v0 > v1); +} + +int anyGreater(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask) +{ + return anyTrue(outMask = v0 > v1); +} + +int allGreaterEqual(const Simd4f& v0, const Simd4f& v1) +{ + return allTrue(v0 >= v1); +} + +int allGreaterEqual(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask) +{ + return allTrue(outMask = v0 >= v1); +} + +int anyGreaterEqual(const Simd4f& v0, const Simd4f& v1) +{ + return anyTrue(v0 >= v1); +} + +int anyGreaterEqual(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask) +{ + return anyTrue(outMask = v0 >= v1); +} + +int allTrue(const Simd4f& v) +{ + return _mm_movemask_ps(v) == 0xf; +} + +int anyTrue(const Simd4f& v) +{ + return _mm_movemask_ps(v); +} diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/sse2/Simd4i.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/sse2/Simd4i.h new file mode 100644 index 00000000..d4a70a02 --- /dev/null +++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/sse2/Simd4i.h @@ -0,0 +1,238 @@ +/* + * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. + */ + +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 
+ +#pragma once + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// factory implementation +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +template <> +inline Simd4iFactory<const int&>::operator Simd4i() const +{ + return _mm_set1_epi32(v); +} + +inline Simd4iFactory<detail::FourTuple>::operator Simd4i() const +{ + return reinterpret_cast<const Simd4i&>(v); +} + +template <int i> +inline Simd4iFactory<detail::IntType<i> >::operator Simd4i() const +{ + return _mm_set1_epi32(i); +} + +template <> +inline Simd4iFactory<detail::IntType<0> >::operator Simd4i() const +{ + return _mm_setzero_si128(); +} + +template <> +inline Simd4iFactory<const int*>::operator Simd4i() const +{ + return _mm_loadu_si128(reinterpret_cast<const __m128i*>(v)); +} + +template <> +inline Simd4iFactory<detail::AlignedPointer<int> >::operator Simd4i() const +{ + return _mm_load_si128(reinterpret_cast<const __m128i*>(v.ptr)); +} + +template <> +inline Simd4iFactory<detail::OffsetPointer<int> >::operator Simd4i() const +{ + return _mm_load_si128(reinterpret_cast<const __m128i*>(reinterpret_cast<const char*>(v.ptr) + v.offset)); +} + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// expression template +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +template <> +inline ComplementExpr<Simd4i>::operator Simd4i() const +{ + return _mm_andnot_si128(v, _mm_set1_epi32(0xffffffff)); +} + +Simd4i operator&(const ComplementExpr<Simd4i>& complement, const Simd4i& v) +{ + return _mm_andnot_si128(complement.v, v); +} + +Simd4i operator&(const Simd4i& v, const ComplementExpr<Simd4i>& complement) +{ + return _mm_andnot_si128(complement.v, v); +} + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// operator implementations +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +Simd4i simdi::operator==(const Simd4i& v0, const Simd4i& v1) +{ + return _mm_cmpeq_epi32(v0, v1); +} 
+ +Simd4i simdi::operator<(const Simd4i& v0, const Simd4i& v1) +{ + return _mm_cmplt_epi32(v0, v1); +} + +Simd4i simdi::operator>(const Simd4i& v0, const Simd4i& v1) +{ + return _mm_cmpgt_epi32(v0, v1); +} + +ComplementExpr<Simd4i> operator~(const Simd4i& v) +{ + return ComplementExpr<Simd4i>(v); +} + +Simd4i operator&(const Simd4i& v0, const Simd4i& v1) +{ + return _mm_and_si128(v0, v1); +} + +Simd4i operator|(const Simd4i& v0, const Simd4i& v1) +{ + return _mm_or_si128(v0, v1); +} + +Simd4i operator^(const Simd4i& v0, const Simd4i& v1) +{ + return _mm_xor_si128(v0, v1); +} + +Simd4i operator<<(const Simd4i& v, int shift) +{ + return _mm_slli_epi32(v, shift); +} + +Simd4i operator>>(const Simd4i& v, int shift) +{ + return _mm_srli_epi32(v, shift); +} + +Simd4i simdi::operator+(const Simd4i& v0, const Simd4i& v1) +{ + return _mm_add_epi32(v0, v1); +} + +Simd4i simdi::operator-(const Simd4i& v) +{ + return _mm_sub_epi32(_mm_setzero_si128(), v); +} + +Simd4i simdi::operator-(const Simd4i& v0, const Simd4i& v1) +{ + return _mm_sub_epi32(v0, v1); +} + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// function implementations +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +Simd4i simd4i(const Simd4f& v) +{ + return _mm_castps_si128(v); +} + +int (&simdi::array(Simd4i& v))[4] +{ + return reinterpret_cast<int(&)[4]>(v); +} + +const int (&simdi::array(const Simd4i& v))[4] +{ + return reinterpret_cast<const int(&)[4]>(v); +} + +void store(int* ptr, const Simd4i& v) +{ + _mm_storeu_si128(reinterpret_cast<__m128i*>(ptr), v); +} + +void storeAligned(int* ptr, const Simd4i& v) +{ + _mm_store_si128(reinterpret_cast<__m128i*>(ptr), v); +} + +void storeAligned(int* ptr, unsigned int offset, const Simd4i& v) +{ + _mm_store_si128(reinterpret_cast<__m128i*>(reinterpret_cast<char*>(ptr) + offset), v); +} + +template <size_t i> +Simd4i splat(const Simd4i& v) +{ + return _mm_shuffle_epi32(v, _MM_SHUFFLE(i, i, i, i)); +} + +Simd4i 
select(const Simd4i& mask, const Simd4i& v0, const Simd4i& v1) +{ + return _mm_xor_si128(v1, _mm_and_si128(mask, _mm_xor_si128(v1, v0))); +} + +int simdi::allEqual(const Simd4i& v0, const Simd4i& v1) +{ + return allTrue(simdi::operator==(v0, v1)); +} + +int simdi::allEqual(const Simd4i& v0, const Simd4i& v1, Simd4i& outMask) +{ + return allTrue(outMask = simdi::operator==(v0, v1)); +} + +int simdi::anyEqual(const Simd4i& v0, const Simd4i& v1) +{ + return anyTrue(simdi::operator==(v0, v1)); +} + +int simdi::anyEqual(const Simd4i& v0, const Simd4i& v1, Simd4i& outMask) +{ + return anyTrue(outMask = simdi::operator==(v0, v1)); +} + +int simdi::allGreater(const Simd4i& v0, const Simd4i& v1) +{ + return allTrue(simdi::operator>(v0, v1)); +} + +int simdi::allGreater(const Simd4i& v0, const Simd4i& v1, Simd4i& outMask) +{ + return allTrue(outMask = simdi::operator>(v0, v1)); +} + +int simdi::anyGreater(const Simd4i& v0, const Simd4i& v1) +{ + return anyTrue(simdi::operator>(v0, v1)); +} + +int simdi::anyGreater(const Simd4i& v0, const Simd4i& v1, Simd4i& outMask) +{ + return anyTrue(outMask = simdi::operator>(v0, v1)); +} + +int allTrue(const Simd4i& v) +{ + return _mm_movemask_ps(_mm_castsi128_ps(v)) == 0xf; +} + +int anyTrue(const Simd4i& v) +{ + return _mm_movemask_ps(_mm_castsi128_ps(v)); +} diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/sse2/SimdTypes.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/sse2/SimdTypes.h new file mode 100644 index 00000000..e54edde7 --- /dev/null +++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/sse2/SimdTypes.h @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. 
Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. + */ + +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#pragma once + +// SSE + SSE2 (don't include intrin.h!) +#include <emmintrin.h> + +#if defined(_MSC_VER) + +typedef __m128 Simd4f; +typedef __m128i Simd4i; + +#else + +struct Simd4f +{ + Simd4f() + { + } + Simd4f(__m128 x) : m128(x) + { + } + + operator __m128&() + { + return m128; + } + operator const __m128&() const + { + return m128; + } + + private: + __m128 m128; +}; + +struct Simd4i +{ + Simd4i() + { + } + Simd4i(__m128i x) : m128i(x) + { + } + + operator __m128i&() + { + return m128i; + } + operator const __m128i&() const + { + return m128i; + } + + private: + __m128i m128i; +}; + +#endif diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/sse2/SwCollisionHelpers.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/sse2/SwCollisionHelpers.h new file mode 100644 index 00000000..0750fcf5 --- /dev/null +++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/sse2/SwCollisionHelpers.h @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. + */ + +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 
+ +#pragma once + +#ifdef PX_GCC_FAMILY +#include <xmmintrin.h> // _BitScanForward +#else +#pragma warning(push) +#pragma warning(disable : 4668) //'symbol' is not defined as a preprocessor macro, replacing with '0' for 'directives' +#pragma warning(disable : 4987) // nonstandard extension used: 'throw (...)' +#include <intrin.h> // _BitScanForward +#pragma warning(pop) +#endif + +namespace nvidia +{ +namespace cloth +{ + +uint32_t findBitSet(uint32_t mask) +{ +#if defined(_MSC_VER) + unsigned long result; + _BitScanForward(&result, unsigned long(mask)); + return result; +#else + return __builtin_ffs(mask) - 1; +#endif +} + +Simd4i intFloor(const Simd4f& v) +{ + Simd4i i = _mm_cvttps_epi32(v); + return simdi::operator-(i, _mm_srli_epi32(simd4i(v), 31)); +} + +Simd4i horizontalOr(Simd4i mask) +{ + Simd4i tmp = mask | _mm_shuffle_epi32(mask, 0xb1); // w z y x -> z w x y + return tmp | _mm_shuffle_epi32(tmp, 0x4e); // w z y x -> y x w z +} + +Gather<Simd4i>::Gather(const Simd4i& index) +{ + mSelectQ = _mm_srai_epi32(index << 29, 31); + mSelectD = _mm_srai_epi32(index << 30, 31); + mSelectW = _mm_srai_epi32(index << 31, 31); + mOutOfRange = simdi::operator>(index ^ sIntSignBit, sSignedMask); +} + +Simd4i Gather<Simd4i>::operator()(const Simd4i* ptr) const +{ + // more efficient with _mm_shuffle_epi8 (SSSE3) + Simd4i lo = ptr[0], hi = ptr[1]; + Simd4i m01 = select(mSelectW, splat<1>(lo), splat<0>(lo)); + Simd4i m23 = select(mSelectW, splat<3>(lo), splat<2>(lo)); + Simd4i m45 = select(mSelectW, splat<1>(hi), splat<0>(hi)); + Simd4i m67 = select(mSelectW, splat<3>(hi), splat<2>(hi)); + Simd4i m0123 = select(mSelectD, m23, m01); + Simd4i m4567 = select(mSelectD, m67, m45); + return select(mSelectQ, m4567, m0123) & ~mOutOfRange; +} + +} // namespace cloth +} // namespace nvidia diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/sse2/SwSolveConstraints.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/sse2/SwSolveConstraints.h new file mode 100644 index 
00000000..382812bb --- /dev/null +++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/sse2/SwSolveConstraints.h @@ -0,0 +1,379 @@ +/* + * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. + */ + +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#pragma warning(push) +#pragma warning(disable:4127) // Disable the nag warning 'conditional expression is constant' + +template <bool useMultiplier> +void solveConstraints(float* __restrict posIt, const float* __restrict rIt, const float* __restrict rEnd, + const uint16_t* __restrict iIt, __m128 stiffness) +{ + __m128 sOne = _mm_set1_ps(1.0f); + + __m128 stretchLimit, compressionLimit, multiplier; + if(useMultiplier) + { + stretchLimit = _mm_shuffle_ps(stiffness, stiffness, 0xff); + compressionLimit = _mm_shuffle_ps(stiffness, stiffness, 0xaa); + multiplier = _mm_shuffle_ps(stiffness, stiffness, 0x55); + } + stiffness = _mm_shuffle_ps(stiffness, stiffness, 0x00); + + for(; rIt != rEnd; rIt += 4, iIt += 8) + { + float* p0i = posIt + iIt[0] * 4; + float* p0j = posIt + iIt[1] * 4; + float* p1i = posIt + iIt[2] * 4; + float* p1j = posIt + iIt[3] * 4; + float* p2i = posIt + iIt[4] * 4; + float* p2j = posIt + iIt[5] * 4; + float* p3i = posIt + iIt[6] * 4; + float* p3j = posIt + iIt[7] * 4; + + __m128 v0i = _mm_load_ps(p0i); + __m128 v0j = _mm_load_ps(p0j); + __m128 v1i = _mm_load_ps(p1i); + __m128 v1j = _mm_load_ps(p1j); + __m128 v2i = _mm_load_ps(p2i); + __m128 v2j = _mm_load_ps(p2j); + __m128 v3i = _mm_load_ps(p3i); + __m128 v3j = _mm_load_ps(p3j); + + 
__m128 h0ij = _mm_add_ps(v0j, _mm_mul_ps(v0i, sMinusOneXYZOneW)); + __m128 h1ij = _mm_add_ps(v1j, _mm_mul_ps(v1i, sMinusOneXYZOneW)); + __m128 h2ij = _mm_add_ps(v2j, _mm_mul_ps(v2i, sMinusOneXYZOneW)); + __m128 h3ij = _mm_add_ps(v3j, _mm_mul_ps(v3i, sMinusOneXYZOneW)); + + __m128 a = _mm_unpacklo_ps(h0ij, h2ij); + __m128 b = _mm_unpackhi_ps(h0ij, h2ij); + __m128 c = _mm_unpacklo_ps(h1ij, h3ij); + __m128 d = _mm_unpackhi_ps(h1ij, h3ij); + + __m128 hxij = _mm_unpacklo_ps(a, c); + __m128 hyij = _mm_unpackhi_ps(a, c); + __m128 hzij = _mm_unpacklo_ps(b, d); + __m128 vwij = _mm_unpackhi_ps(b, d); + + __m128 rij = _mm_load_ps(rIt); + __m128 e2ij = _mm_add_ps( + sEpsilon, _mm_add_ps(_mm_mul_ps(hxij, hxij), _mm_add_ps(_mm_mul_ps(hyij, hyij), _mm_mul_ps(hzij, hzij)))); + __m128 mask = _mm_cmpnle_ps(rij, sEpsilon); + __m128 erij = _mm_and_ps(_mm_sub_ps(sOne, _mm_mul_ps(rij, _mm_rsqrt_ps(e2ij))), mask); + + if(useMultiplier) + { + erij = _mm_sub_ps(erij, _mm_mul_ps(multiplier, _mm_max_ps(compressionLimit, _mm_min_ps(erij, stretchLimit)))); + } + __m128 exij = _mm_mul_ps(erij, _mm_mul_ps(stiffness, _mm_rcp_ps(_mm_add_ps(sEpsilon, vwij)))); + + __m128 exlo = _mm_and_ps(sMaskXY, exij); + __m128 exhi = _mm_andnot_ps(sMaskXY, exij); + + __m128 f0ij = _mm_mul_ps(h0ij, _mm_shuffle_ps(exlo, exlo, 0xc0)); + __m128 f1ij = _mm_mul_ps(h1ij, _mm_shuffle_ps(exlo, exlo, 0xd5)); + __m128 f2ij = _mm_mul_ps(h2ij, _mm_shuffle_ps(exhi, exhi, 0x2a)); + __m128 f3ij = _mm_mul_ps(h3ij, _mm_shuffle_ps(exhi, exhi, 0x3f)); + + __m128 u0i = _mm_add_ps(v0i, _mm_mul_ps(f0ij, _mm_shuffle_ps(v0i, v0i, 0xff))); + __m128 u0j = _mm_sub_ps(v0j, _mm_mul_ps(f0ij, _mm_shuffle_ps(v0j, v0j, 0xff))); + __m128 u1i = _mm_add_ps(v1i, _mm_mul_ps(f1ij, _mm_shuffle_ps(v1i, v1i, 0xff))); + __m128 u1j = _mm_sub_ps(v1j, _mm_mul_ps(f1ij, _mm_shuffle_ps(v1j, v1j, 0xff))); + __m128 u2i = _mm_add_ps(v2i, _mm_mul_ps(f2ij, _mm_shuffle_ps(v2i, v2i, 0xff))); + __m128 u2j = _mm_sub_ps(v2j, _mm_mul_ps(f2ij, _mm_shuffle_ps(v2j, v2j, 
0xff)));
	/* tail of the SSE-intrinsics solveConstraints body (begins above this chunk):
	   apply correction f3ij scaled by each particle's w lane (broadcast via
	   _mm_shuffle_ps(v, v, 0xff)), then store all eight updated particles. */
	__m128 u3i = _mm_add_ps(v3i, _mm_mul_ps(f3ij, _mm_shuffle_ps(v3i, v3i, 0xff)));
	__m128 u3j = _mm_sub_ps(v3j, _mm_mul_ps(f3ij, _mm_shuffle_ps(v3j, v3j, 0xff)));

	_mm_store_ps(p0i, u0i);
	_mm_store_ps(p0j, u0j);
	_mm_store_ps(p1i, u1i);
	_mm_store_ps(p1j, u1j);
	_mm_store_ps(p2i, u2i);
	_mm_store_ps(p2j, u2j);
	_mm_store_ps(p3i, u3i);
	_mm_store_ps(p3j, u3j);
	}
}

#if PX_X86

// clang-format:disable

// asm blocks in static condition blocks don't get removed, specialize

// Explicit specialization of solveConstraints<useMultiplier=false>, hand-written
// MSVC 32-bit inline assembly. Processes 4 edge constraints per loop iteration:
//  - iIt:   pairs of uint16_t particle indices (8 indices = 4 (i,j) pairs per
//           iteration; eax advances by 16 bytes)
//  - rIt:   4 rest values per iteration ([edx], advances 16 bytes)
//  - posIt: particle array; indices are shifted left by 4, so each particle is a
//           16-byte xyzw record. The w lane feeds the correction weighting —
//           NOTE(review): presumably inverse mass; confirm against the caller.
// sMinusOneXYZOneW, sEpsilon and sMaskXY are file-scope SSE constants defined
// outside this chunk (by their use here: (-1,-1,-1,+1), a small positive bias,
// and an xy-lane mask respectively — verify at their definitions).
// MSVC inline-asm syntax note: several instructions share a physical line, so
// each one after the first needs its own __asm prefix.
template <>
void solveConstraints<false>(float* __restrict posIt, const float* __restrict rIt, const float* __restrict rEnd,
                             const uint16_t* __restrict iIt, __m128 stiffness)
{
	__m128 sOne = _mm_set1_ps(1.0f);
	stiffness = _mm_shuffle_ps(stiffness, stiffness, 0x00); // broadcast lane 0

	__m128 htmp[4];  /* spill slots for the four edge vectors hkij */
	float* ptmp[8];  /* byte offsets of the eight particles, for write-back */

	__asm
	{
		mov edx, rIt
		mov esi, rEnd

		cmp edx, esi
		jae forEnd

		mov eax, iIt
		mov ecx, posIt

forBegin:
		/* gather the first two particle pairs; index * 16 = byte offset */
		movzx edi, WORD PTR [eax   ] __asm shl edi, 4 __asm mov [ptmp   ], edi __asm movaps xmm0, XMMWORD PTR [edi + ecx] /* v0i */
		movzx edi, WORD PTR [eax+ 2] __asm shl edi, 4 __asm mov [ptmp+ 4], edi __asm movaps xmm2, XMMWORD PTR [edi + ecx] /* v0j */
		movzx edi, WORD PTR [eax+ 4] __asm shl edi, 4 __asm mov [ptmp+ 8], edi __asm movaps xmm1, XMMWORD PTR [edi + ecx] /* v1i */
		movzx edi, WORD PTR [eax+ 6] __asm shl edi, 4 __asm mov [ptmp+12], edi __asm movaps xmm3, XMMWORD PTR [edi + ecx] /* v1j */

		/* hkij = vi - vj in xyz, vi.w + vj.w in w (one madd via (-1,-1,-1,+1)) */
		movaps xmm7, sMinusOneXYZOneW
		mulps xmm2, xmm7 __asm addps xmm0, xmm2 __asm movaps XMMWORD PTR [htmp   ], xmm0 /* h0ij */
		mulps xmm3, xmm7 __asm addps xmm1, xmm3 __asm movaps XMMWORD PTR [htmp+16], xmm1 /* h1ij */

		movzx edi, WORD PTR [eax+ 8] __asm shl edi, 4 __asm mov [ptmp+16], edi __asm movaps xmm4, XMMWORD PTR [edi + ecx] /* v2i */
		movzx edi, WORD PTR [eax+10] __asm shl edi, 4 __asm mov [ptmp+20], edi __asm movaps xmm2, XMMWORD PTR [edi + ecx] /* v2j */
		movzx edi, WORD PTR [eax+12] __asm shl edi, 4 __asm mov [ptmp+24], edi __asm movaps xmm5, XMMWORD PTR [edi + ecx] /* v3i */
		movzx edi, WORD PTR [eax+14] __asm shl edi, 4 __asm mov [ptmp+28], edi __asm movaps xmm3, XMMWORD PTR [edi + ecx] /* v3j */

		mulps xmm2, xmm7 __asm addps xmm2, xmm4 __asm movaps XMMWORD PTR [htmp+32], xmm2 /* h2ij */
		mulps xmm3, xmm7 __asm addps xmm3, xmm5 __asm movaps XMMWORD PTR [htmp+48], xmm3 /* h3ij */

		/* 4x4 transpose: from per-edge xyzw to per-component lanes */
		movaps xmm4, xmm0
		movaps xmm5, xmm1

		unpcklps xmm0, xmm2 /* a */
		unpckhps xmm4, xmm2 /* b */
		unpcklps xmm1, xmm3 /* c */
		unpckhps xmm5, xmm3 /* d */

		movaps xmm2, xmm0
		movaps xmm6, xmm4

		unpcklps xmm0, xmm1 /* hxij */
		unpckhps xmm2, xmm1 /* hyij */
		unpcklps xmm4, xmm5 /* hzij */
		unpckhps xmm6, xmm5 /* vwij */

		movaps xmm7, sEpsilon
		movaps xmm5, sOne
		movaps xmm3, stiffness
		movaps xmm1, XMMWORD PTR [edx] /* rij */

		/* e2ij = hx^2 + hy^2 + hz^2 + epsilon (epsilon guards rsqrt(0)) */
		mulps xmm0, xmm0 __asm addps xmm0, xmm7 /* e2ij */
		mulps xmm2, xmm2 __asm addps xmm0, xmm2
		mulps xmm4, xmm4 __asm addps xmm0, xmm4

		/* erij = rij / |h|; mask disables edges with rij <= epsilon */
		rsqrtps xmm0, xmm0 __asm mulps xmm0, xmm1 /* erij */
		cmpnleps xmm1, xmm7 /* mask */
		subps xmm5, xmm0 __asm andps xmm5, xmm1  /* (1 - erij) & mask */
		addps xmm6, xmm7 __asm rcpps xmm6, xmm6  /* 1 / (wi + wj + eps) */

		mulps xmm6, xmm3 __asm mulps xmm6, xmm5 /* exij */

		/* split the four per-edge scales: xy lanes -> exlo, zw lanes -> exhi */
		movaps xmm7, sMaskXY
		andps xmm7, xmm6 /* exlo */
		xorps xmm6, xmm7 /* exhi */

		movaps xmm0, XMMWORD PTR [htmp   ] /* h0ij */
		movaps xmm1, XMMWORD PTR [htmp+16] /* h1ij */
		movaps xmm2, XMMWORD PTR [htmp+32] /* h2ij */
		movaps xmm3, XMMWORD PTR [htmp+48] /* h3ij */

		/* broadcast edge k's scale to xyz (w lane comes from a zeroed lane) */
		pshufd xmm5, xmm7, 0xc0 __asm mulps xmm0, xmm5 /* f0ij */
		pshufd xmm7, xmm7, 0xd5 __asm mulps xmm1, xmm7 /* f1ij */
		pshufd xmm4, xmm6, 0x2a __asm mulps xmm2, xmm4 /* f2ij */
		pshufd xmm6, xmm6, 0x3f __asm mulps xmm3, xmm6 /* f3ij */

		/* write-back: reload each particle, move it by +/- fkij * particle.w
		   (pshufd ..., 0xff broadcasts the particle's own w lane) */
		mov edi, [ptmp   ] __asm movaps xmm4, XMMWORD PTR [edi + ecx] /* v0i */
		pshufd xmm5, xmm4, 0xff __asm mulps xmm5, xmm0 __asm subps xmm4, xmm5 /* u0i */
		movaps XMMWORD PTR [edi + ecx], xmm4

		mov edi, [ptmp+ 4] __asm movaps xmm6, XMMWORD PTR [edi + ecx] /* v0j */
		pshufd xmm7, xmm6, 0xff __asm mulps xmm7, xmm0 __asm addps xmm6, xmm7 /* u0j */
		movaps XMMWORD PTR [edi + ecx], xmm6

		mov edi, [ptmp+ 8] __asm movaps xmm4, XMMWORD PTR [edi + ecx] /* v1i */
		pshufd xmm5, xmm4, 0xff __asm mulps xmm5, xmm1 __asm subps xmm4, xmm5 /* u1i */
		movaps XMMWORD PTR [edi + ecx], xmm4

		mov edi, [ptmp+12] __asm movaps xmm6, XMMWORD PTR [edi + ecx] /* v1j */
		pshufd xmm7, xmm6, 0xff __asm mulps xmm7, xmm1 __asm addps xmm6, xmm7 /* u1j */
		movaps XMMWORD PTR [edi + ecx], xmm6

		mov edi, [ptmp+16] __asm movaps xmm4, XMMWORD PTR [edi + ecx] /* v2i */
		pshufd xmm5, xmm4, 0xff __asm mulps xmm5, xmm2 __asm subps xmm4, xmm5 /* u2i */
		movaps XMMWORD PTR [edi + ecx], xmm4

		mov edi, [ptmp+20] __asm movaps xmm6, XMMWORD PTR [edi + ecx] /* v2j */
		pshufd xmm7, xmm6, 0xff __asm mulps xmm7, xmm2 __asm addps xmm6, xmm7 /* u2j */
		movaps XMMWORD PTR [edi + ecx], xmm6

		mov edi, [ptmp+24] __asm movaps xmm4, XMMWORD PTR [edi + ecx] /* v3i */
		pshufd xmm5, xmm4, 0xff __asm mulps xmm5, xmm3 __asm subps xmm4, xmm5 /* u3i */
		movaps XMMWORD PTR [edi + ecx], xmm4

		mov edi, [ptmp+28] __asm movaps xmm6, XMMWORD PTR [edi + ecx] /* v3j */
		pshufd xmm7, xmm6, 0xff __asm mulps xmm7, xmm3 __asm addps xmm6, xmm7 /* u3j */
		movaps XMMWORD PTR [edi + ecx], xmm6

		/* advance: 8 indices (16 bytes) and 4 rest values (16 bytes) */
		add eax, 16
		add edx, 16

		cmp edx, esi
		jb forBegin
forEnd:
	}
}

// Specialization with the multiplier/limit path enabled. Identical to the
// <false> version except that the per-edge scale is additionally adjusted by
// the "multiplier block" below; the limits and multiplier arrive packed into
// the lanes of the stiffness argument (w = stretchLimit, z = compressionLimit,
// y = multiplier, x = stiffness).
template <>
void solveConstraints<true>(float* __restrict posIt, const float* __restrict rIt, const float* __restrict rEnd,
                            const uint16_t* __restrict iIt, __m128 stiffness)
{
	__m128 sOne = _mm_set1_ps(1.0f);
	__m128 stretchLimit = _mm_shuffle_ps(stiffness, stiffness, 0xff);     // lane 3
	__m128 compressionLimit = _mm_shuffle_ps(stiffness, stiffness, 0xaa); // lane 2
	__m128 multiplier = _mm_shuffle_ps(stiffness, stiffness, 0x55);       // lane 1
	stiffness = _mm_shuffle_ps(stiffness, stiffness, 0x00);               // lane 0

	__m128 htmp[4];  /* spill slots for the four edge vectors hkij */
	float* ptmp[8];  /* byte offsets of the eight particles, for write-back */

	__asm
	{
		mov edx, rIt
		mov esi, rEnd

		cmp edx, esi
		jae forEnd

		mov eax, iIt
		mov ecx, posIt

forBegin:
		/* gather the four particle pairs; index * 16 = byte offset */
		movzx edi, WORD PTR [eax   ] __asm shl edi, 4 __asm mov [ptmp   ], edi __asm movaps xmm0, XMMWORD PTR [edi + ecx] /* v0i */
		movzx edi, WORD PTR [eax+ 2] __asm shl edi, 4 __asm mov [ptmp+ 4], edi __asm movaps xmm2, XMMWORD PTR [edi + ecx] /* v0j */
		movzx edi, WORD PTR [eax+ 4] __asm shl edi, 4 __asm mov [ptmp+ 8], edi __asm movaps xmm1, XMMWORD PTR [edi + ecx] /* v1i */
		movzx edi, WORD PTR [eax+ 6] __asm shl edi, 4 __asm mov [ptmp+12], edi __asm movaps xmm3, XMMWORD PTR [edi + ecx] /* v1j */

		/* hkij = vi - vj in xyz, vi.w + vj.w in w */
		movaps xmm7, sMinusOneXYZOneW
		mulps xmm2, xmm7 __asm addps xmm0, xmm2 __asm movaps XMMWORD PTR [htmp   ], xmm0 /* h0ij */
		mulps xmm3, xmm7 __asm addps xmm1, xmm3 __asm movaps XMMWORD PTR [htmp+16], xmm1 /* h1ij */

		movzx edi, WORD PTR [eax+ 8] __asm shl edi, 4 __asm mov [ptmp+16], edi __asm movaps xmm4, XMMWORD PTR [edi + ecx] /* v2i */
		movzx edi, WORD PTR [eax+10] __asm shl edi, 4 __asm mov [ptmp+20], edi __asm movaps xmm2, XMMWORD PTR [edi + ecx] /* v2j */
		movzx edi, WORD PTR [eax+12] __asm shl edi, 4 __asm mov [ptmp+24], edi __asm movaps xmm5, XMMWORD PTR [edi + ecx] /* v3i */
		movzx edi, WORD PTR [eax+14] __asm shl edi, 4 __asm mov [ptmp+28], edi __asm movaps xmm3, XMMWORD PTR [edi + ecx] /* v3j */

		mulps xmm2, xmm7 __asm addps xmm2, xmm4 __asm movaps XMMWORD PTR [htmp+32], xmm2 /* h2ij */
		mulps xmm3, xmm7 __asm addps xmm3, xmm5 __asm movaps XMMWORD PTR [htmp+48], xmm3 /* h3ij */

		/* 4x4 transpose: from per-edge xyzw to per-component lanes */
		movaps xmm4, xmm0
		movaps xmm5, xmm1

		unpcklps xmm0, xmm2 /* a */
		unpckhps xmm4, xmm2 /* b */
		unpcklps xmm1, xmm3 /* c */
		unpckhps xmm5, xmm3 /* d */

		movaps xmm2, xmm0
		movaps xmm6, xmm4

		unpcklps xmm0, xmm1 /* hxij */
		unpckhps xmm2, xmm1 /* hyij */
		unpcklps xmm4, xmm5 /* hzij */
		unpckhps xmm6, xmm5 /* vwij */

		movaps xmm7, sEpsilon
		movaps xmm5, sOne
		movaps xmm3, stiffness
		movaps xmm1, XMMWORD PTR [edx] /* rij */

		/* e2ij = hx^2 + hy^2 + hz^2 + epsilon */
		mulps xmm0, xmm0 __asm addps xmm0, xmm7 /* e2ij */
		mulps xmm2, xmm2 __asm addps xmm0, xmm2
		mulps xmm4, xmm4 __asm addps xmm0, xmm4
+ pshufd xmm7, xmm6, 0xff __asm mulps xmm7, xmm2 __asm addps xmm6, xmm7 /* u2j */ + movaps XMMWORD PTR [edi + ecx], xmm6 + + mov edi, [ptmp+24] __asm movaps xmm4, XMMWORD PTR [edi + ecx] /* v3i */ + pshufd xmm5, xmm4, 0xff __asm mulps xmm5, xmm3 __asm subps xmm4, xmm5 /* u3i */ + movaps XMMWORD PTR [edi + ecx], xmm4 + + mov edi, [ptmp+28] __asm movaps xmm6, XMMWORD PTR [edi + ecx] /* v3j */ + pshufd xmm7, xmm6, 0xff __asm mulps xmm7, xmm3 __asm addps xmm6, xmm7 /* u3j */ + movaps XMMWORD PTR [edi + ecx], xmm6 + + add eax, 16 + add edx, 16 + + cmp edx, esi + jb forBegin +forEnd: + } +} + +// clang-format:enable + +#endif + +#pragma warning(pop) diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/windows/CuFactory.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/windows/CuFactory.h new file mode 100644 index 00000000..59cec2d9 --- /dev/null +++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/windows/CuFactory.h @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. + */ + +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 
#pragma once

#include "Factory.h"
#include "Allocator.h"

namespace physx
{
	class PxCudaContextManager;
}

namespace nvidia
{
namespace cloth
{

class CuFabric;
class CuCloth;
template <typename>
class ClothImpl;

// CUDA-backed implementation of the abstract cloth Factory interface.
// Constructed around a PxCudaContextManager; keeps a registry of every
// CuFabric it creates (mFabrics) so fabric data can be extracted later.
class CuFactory : public UserAllocated, public Factory
{
  protected:
	// Non-assignable (declared, never defined).
	CuFactory& operator=(const CuFactory&);

  public:
	// Concrete types this factory produces, used by templated client code.
	typedef CuFabric FabricType;
	typedef ClothImpl<CuCloth> ImplType;

	CuFactory(PxCudaContextManager*);
	virtual ~CuFactory();

	// --- Factory interface: creation -------------------------------------
	// Builds a CuFabric from constraint topology (phases/sets/indices),
	// rest values and tether data. See Factory.h for the parameter contract.
	virtual Fabric* createFabric(uint32_t numParticles, Range<const uint32_t> phases, Range<const uint32_t> sets,
	                             Range<const float> restvalues, Range<const uint32_t> indices,
	                             Range<const uint32_t> anchors, Range<const float> tetherLengths);

	virtual Cloth* createCloth(Range<const PxVec4> particles, Fabric& fabric);

	virtual Solver* createSolver(profile::PxProfileZone* profiler, PxTaskManager* taskMgr);

	virtual Cloth* clone(const Cloth& cloth);

	// --- Factory interface: data extraction ------------------------------
	// These copy internal (device-side) state back into caller-provided
	// ranges; see Factory.h for expected range sizes.
	virtual void extractFabricData(const Fabric& fabric, Range<uint32_t> phases, Range<uint32_t> sets,
	                               Range<float> restvalues, Range<uint32_t> indices, Range<uint32_t> anchors,
	                               Range<float> tetherLengths) const;

	virtual void extractCollisionData(const Cloth& cloth, Range<PxVec4> spheres, Range<uint32_t> capsules,
	                                  Range<PxVec4> planes, Range<uint32_t> convexes, Range<PxVec3> triangles) const;

	virtual void extractMotionConstraints(const Cloth& cloth, Range<PxVec4> destConstraints) const;

	virtual void extractSeparationConstraints(const Cloth& cloth, Range<PxVec4> destConstraints) const;

	virtual void extractParticleAccelerations(const Cloth& cloth, Range<PxVec4> destAccelerations) const;

	virtual void extractVirtualParticles(const Cloth& cloth, Range<uint32_t[4]> destIndices,
	                                     Range<PxVec3> destWeights) const;

	virtual void extractSelfCollisionIndices(const Cloth& cloth, Range<uint32_t> destIndices) const;

	virtual void extractRestPositions(const Cloth& cloth, Range<PxVec4> destRestPositions) const;

  public:
	// Copies the byte range [srcIt, srcEnd) into dstIt.
	// NOTE(review): by its name and the CUDA context member, presumably a
	// device-to-host copy — confirm against the implementation.
	void copyToHost(const void* srcIt, const void* srcEnd, void* dstIt) const;

  public:
	// All fabrics created by (and registered with) this factory.
	Vector<CuFabric*>::Type mFabrics;

	// CUDA context used for all device work; supplied at construction.
	PxCudaContextManager* mContextManager;

	// Kernel launch configuration — NOTE(review): presumably the current and
	// device-maximum CUDA threads per block; confirm in CuFactory.cpp.
	uint32_t mNumThreadsPerBlock;

	const uint32_t mMaxThreadsPerBlock;
};
}
}