diff options
Diffstat (limited to 'NvCloth/src')
26 files changed, 404 insertions, 236 deletions
diff --git a/NvCloth/src/BoundingBox.h b/NvCloth/src/BoundingBox.h index 74bc0ff..ea84d52 100644 --- a/NvCloth/src/BoundingBox.h +++ b/NvCloth/src/BoundingBox.h @@ -37,26 +37,26 @@ namespace nv namespace cloth { -template <typename Simd4f> +template <typename T4f> struct BoundingBox { - Simd4f mLower; - Simd4f mUpper; + T4f mLower; + T4f mUpper; }; -template <typename Simd4f> -inline BoundingBox<Simd4f> loadBounds(const float* ptr) +template <typename T4f> +inline BoundingBox<T4f> loadBounds(const float* ptr) { - BoundingBox<Simd4f> result; + BoundingBox<T4f> result; result.mLower = load(ptr); result.mUpper = load(ptr + 3); return result; } -template <typename Simd4f> -inline BoundingBox<Simd4f> emptyBounds() +template <typename T4f> +inline BoundingBox<T4f> emptyBounds() { - BoundingBox<Simd4f> result; + BoundingBox<T4f> result; result.mLower = gSimd4fFloatMax; result.mUpper = -result.mLower; @@ -64,10 +64,10 @@ inline BoundingBox<Simd4f> emptyBounds() return result; } -template <typename Simd4f> -inline BoundingBox<Simd4f> expandBounds(const BoundingBox<Simd4f>& bounds, const Simd4f* pIt, const Simd4f* pEnd) +template <typename T4f> +inline BoundingBox<T4f> expandBounds(const BoundingBox<T4f>& bounds, const T4f* pIt, const T4f* pEnd) { - BoundingBox<Simd4f> result = bounds; + BoundingBox<T4f> result = bounds; for (; pIt != pEnd; ++pIt) { result.mLower = min(result.mLower, *pIt); @@ -76,26 +76,26 @@ inline BoundingBox<Simd4f> expandBounds(const BoundingBox<Simd4f>& bounds, const return result; } -template <typename Simd4f> -inline BoundingBox<Simd4f> expandBounds(const BoundingBox<Simd4f>& a, const BoundingBox<Simd4f>& b) +template <typename T4f> +inline BoundingBox<T4f> expandBounds(const BoundingBox<T4f>& a, const BoundingBox<T4f>& b) { - BoundingBox<Simd4f> result; + BoundingBox<T4f> result; result.mLower = min(a.mLower, b.mLower); result.mUpper = max(a.mUpper, b.mUpper); return result; } -template <typename Simd4f> -inline BoundingBox<Simd4f> 
intersectBounds(const BoundingBox<Simd4f>& a, const BoundingBox<Simd4f>& b) +template <typename T4f> +inline BoundingBox<T4f> intersectBounds(const BoundingBox<T4f>& a, const BoundingBox<T4f>& b) { - BoundingBox<Simd4f> result; + BoundingBox<T4f> result; result.mLower = max(a.mLower, b.mLower); result.mUpper = min(a.mUpper, b.mUpper); return result; } -template <typename Simd4f> -inline bool isEmptyBounds(const BoundingBox<Simd4f>& a) +template <typename T4f> +inline bool isEmptyBounds(const BoundingBox<T4f>& a) { return anyGreater(a.mLower, a.mUpper) != 0; } diff --git a/NvCloth/src/ClothClone.h b/NvCloth/src/ClothClone.h index 386fee6..7145da5 100644 --- a/NvCloth/src/ClothClone.h +++ b/NvCloth/src/ClothClone.h @@ -29,12 +29,12 @@ #pragma once -#include "../SwFactory.h" -#include "../SwFabric.h" -#include "../SwCloth.h" +#include "SwFactory.h" +#include "SwFabric.h" +#include "SwCloth.h" -#include "../ClothImpl.h" -#include "../ClothBase.h" +#include "ClothImpl.h" +#include "ClothBase.h" #include "NvCloth/Allocator.h" namespace nv diff --git a/NvCloth/src/ClothImpl.h b/NvCloth/src/ClothImpl.h index 4d7b28d..24f7732 100644 --- a/NvCloth/src/ClothImpl.h +++ b/NvCloth/src/ClothImpl.h @@ -1220,7 +1220,7 @@ inline float ClothImpl<T>::getLiftCoefficient() const template <typename T> inline void ClothImpl<T>::setFluidDensity(float fluidDensity) { - NV_CLOTH_ASSERT(fluidDensity < 0.f); + NV_CLOTH_ASSERT(fluidDensity > 0.f); if (fluidDensity == mFluidDensity) return; diff --git a/NvCloth/src/IterationState.h b/NvCloth/src/IterationState.h index 224e87e..e18b636 100644 --- a/NvCloth/src/IterationState.h +++ b/NvCloth/src/IterationState.h @@ -72,21 +72,21 @@ inline physx::PxQuat exp(const physx::PxVec3& v) return physx::PxQuat(v.x * scale, v.y * scale, v.z * scale, physx::PxCos(theta)); } -template <typename Simd4f, uint32_t N> -inline void assign(Simd4f (&columns)[N], const physx::PxMat44& matrix) +template <typename T4f, uint32_t N> +inline void assign(T4f (&columns)[N], 
const physx::PxMat44& matrix) { for (uint32_t i = 0; i < N; ++i) columns[i] = load(nv::cloth::array(matrix[i])); } -template <typename Simd4f> -inline Simd4f transform(const Simd4f (&columns)[3], const Simd4f& vec) +template <typename T4f> +inline T4f transform(const T4f (&columns)[3], const T4f& vec) { return splat<0>(vec) * columns[0] + splat<1>(vec) * columns[1] + splat<2>(vec) * columns[2]; } -template <typename Simd4f> -inline Simd4f transform(const Simd4f (&columns)[3], const Simd4f& translate, const Simd4f& vec) +template <typename T4f> +inline T4f transform(const T4f (&columns)[3], const T4f& translate, const T4f& vec) { return translate + splat<0>(vec) * columns[0] + splat<1>(vec) * columns[1] + splat<2>(vec) * columns[2]; } @@ -99,17 +99,17 @@ struct IterationStateFactory template <typename MyCloth> IterationStateFactory(MyCloth& cloth, float frameDt); - template <typename Simd4f, typename MyCloth> - IterationState<Simd4f> create(MyCloth const& cloth) const; + template <typename T4f, typename MyCloth> + IterationState<T4f> create(MyCloth const& cloth) const; - template <typename Simd4f> - static Simd4f lengthSqr(Simd4f const& v) + template <typename T4f> + static T4f lengthSqr(T4f const& v) { return dot3(v, v); } - template <typename Simd4f> - static physx::PxVec3 castToPxVec3(const Simd4f& v) + template <typename T4f> + static physx::PxVec3 castToPxVec3(const T4f& v) { return *reinterpret_cast<const physx::PxVec3*>(reinterpret_cast<const char*>(&v)); } @@ -123,7 +123,7 @@ struct IterationStateFactory }; /* solver iterations helper functor */ -template <typename Simd4f> +template <typename T4f> struct IterationState { // call after each iteration @@ -133,15 +133,15 @@ struct IterationState inline float getPreviousAlpha() const; public: - Simd4f mRotationMatrix[3]; // should rename to 'mRotation' + T4f mRotationMatrix[3]; // should rename to 'mRotation' - Simd4f mCurBias; // in local space - Simd4f mPrevBias; // in local space - Simd4f mWind; // delta 
position per iteration (wind velocity * mIterDt) + T4f mCurBias; // in local space + T4f mPrevBias; // in local space + T4f mWind; // delta position per iteration (wind velocity * mIterDt) - Simd4f mPrevMatrix[3]; - Simd4f mCurMatrix[3]; - Simd4f mDampScaleUpdate; + T4f mPrevMatrix[3]; + T4f mCurMatrix[3]; + T4f mDampScaleUpdate; // iteration counter uint32_t mRemainingIterations; @@ -157,14 +157,14 @@ struct IterationState } // namespace cloth -template <typename Simd4f> -inline float cloth::IterationState<Simd4f>::getCurrentAlpha() const +template <typename T4f> +inline float cloth::IterationState<T4f>::getCurrentAlpha() const { return getPreviousAlpha() + mInvNumIterations; } -template <typename Simd4f> -inline float cloth::IterationState<Simd4f>::getPreviousAlpha() const +template <typename T4f> +inline float cloth::IterationState<T4f>::getPreviousAlpha() const { return 1.0f - mRemainingIterations * mInvNumIterations; } @@ -232,36 +232,36 @@ If you change anything in this function, make sure that ClothCustomFloating and ClothInertia haven't regressed for any choice of solver frequency. 
*/ -template <typename Simd4f, typename MyCloth> -cloth::IterationState<Simd4f> cloth::IterationStateFactory::create(MyCloth const& cloth) const +template <typename T4f, typename MyCloth> +cloth::IterationState<T4f> cloth::IterationStateFactory::create(MyCloth const& cloth) const { - IterationState<Simd4f> result; + IterationState<T4f> result; result.mRemainingIterations = static_cast<uint32_t>(mNumIterations); result.mInvNumIterations = mInvNumIterations; result.mIterDt = mIterDt; - Simd4f curLinearVelocity = load(array(cloth.mLinearVelocity)); - Simd4f prevLinearVelocity = load(array(mPrevLinearVelocity)); + T4f curLinearVelocity = load(array(cloth.mLinearVelocity)); + T4f prevLinearVelocity = load(array(mPrevLinearVelocity)); - Simd4f iterDt = simd4f(mIterDt); - Simd4f dampExponent = simd4f(cloth.mStiffnessFrequency) * iterDt; + T4f iterDt = simd4f(mIterDt); + T4f dampExponent = simd4f(cloth.mStiffnessFrequency) * iterDt; - Simd4f translation = iterDt * curLinearVelocity; + T4f translation = iterDt * curLinearVelocity; // gravity delta per iteration - Simd4f gravity = load(array(cloth.mGravity)) * static_cast<Simd4f>(simd4f(sqr(mIterDtAverage))); + T4f gravity = load(array(cloth.mGravity)) * static_cast<T4f>(simd4f(sqr(mIterDtAverage))); // scale of local particle velocity per iteration - Simd4f dampScale = exp2(load(array(cloth.mLogDamping)) * dampExponent); + T4f dampScale = exp2(load(array(cloth.mLogDamping)) * dampExponent); // adjust for the change in time step during the first iteration - Simd4f firstDampScale = dampScale * simd4f(mIterDtRatio); + T4f firstDampScale = dampScale * simd4f(mIterDtRatio); // portion of negative frame velocity to transfer to particle - Simd4f linearDrag = (gSimd4fOne - exp2(load(array(cloth.mLinearLogDrag)) * dampExponent)) * translation; + T4f linearDrag = (gSimd4fOne - exp2(load(array(cloth.mLinearLogDrag)) * dampExponent)) * translation; // portion of frame acceleration to transfer to particle - Simd4f linearInertia = 
load(array(cloth.mLinearInertia)) * iterDt * (prevLinearVelocity - curLinearVelocity); + T4f linearInertia = load(array(cloth.mLinearInertia)) * iterDt * (prevLinearVelocity - curLinearVelocity); // for inertia, we want to violate newton physics to // match velocity and position as given by the user, which means: @@ -271,13 +271,13 @@ cloth::IterationState<Simd4f> cloth::IterationStateFactory::create(MyCloth const // specifically, the portion is alpha=(n+1)/2n and 1-alpha. float linearAlpha = (mNumIterations + 1) * 0.5f * mInvNumIterations; - Simd4f curLinearInertia = linearInertia * simd4f(linearAlpha); + T4f curLinearInertia = linearInertia * simd4f(linearAlpha); // rotate to local space (use mRotationMatrix temporarily to hold matrix) physx::PxMat44 invRotation = physx::PxMat44(mCurrentRotation.getConjugate()); assign(result.mRotationMatrix, invRotation); - Simd4f maskXYZ = simd4f(simd4i(~0, ~0, ~0, 0)); + T4f maskXYZ = simd4f(simd4i(~0, ~0, ~0, 0)); // Previously, we split the bias between previous and current position to // get correct disretized position and velocity. However, this made a @@ -286,23 +286,23 @@ cloth::IterationState<Simd4f> cloth::IterationStateFactory::create(MyCloth const // timesteps. Instead, we now apply the entire bias to current position // and accept a less noticeable error for a free falling cloth. 
- Simd4f bias = gravity - linearDrag; + T4f bias = gravity - linearDrag; result.mCurBias = transform(result.mRotationMatrix, curLinearInertia + bias) & maskXYZ; result.mPrevBias = transform(result.mRotationMatrix, linearInertia - curLinearInertia) & maskXYZ; - Simd4f wind = load(array(cloth.mWind)) * iterDt; // multiply with delta time here already so we don't have to do it inside the solver + T4f wind = load(array(cloth.mWind)) * iterDt; // multiply with delta time here already so we don't have to do it inside the solver result.mWind = transform(result.mRotationMatrix, translation - wind) & maskXYZ; result.mIsTurning = mPrevAngularVelocity.magnitudeSquared() + cloth.mAngularVelocity.magnitudeSquared() > 0.0f; if (result.mIsTurning) { - Simd4f curAngularVelocity = load(array(invRotation.rotate(cloth.mAngularVelocity))); - Simd4f prevAngularVelocity = load(array(invRotation.rotate(mPrevAngularVelocity))); + T4f curAngularVelocity = load(array(invRotation.rotate(cloth.mAngularVelocity))); + T4f prevAngularVelocity = load(array(invRotation.rotate(mPrevAngularVelocity))); // rotation for one iteration in local space - Simd4f curInvAngle = -iterDt * curAngularVelocity; - Simd4f prevInvAngle = -iterDt * prevAngularVelocity; + T4f curInvAngle = -iterDt * curAngularVelocity; + T4f prevInvAngle = -iterDt * prevAngularVelocity; physx::PxQuat curInvRotation = exp(castToPxVec3(curInvAngle)); physx::PxQuat prevInvRotation = exp(castToPxVec3(prevInvAngle)); @@ -312,17 +312,17 @@ cloth::IterationState<Simd4f> cloth::IterationStateFactory::create(MyCloth const assign(result.mRotationMatrix, curMatrix); - Simd4f angularDrag = gSimd4fOne - exp2(load(array(cloth.mAngularLogDrag)) * dampExponent); - Simd4f centrifugalInertia = load(array(cloth.mCentrifugalInertia)); - Simd4f angularInertia = load(array(cloth.mAngularInertia)); - Simd4f angularAcceleration = curAngularVelocity - prevAngularVelocity; + T4f angularDrag = gSimd4fOne - exp2(load(array(cloth.mAngularLogDrag)) * 
dampExponent); + T4f centrifugalInertia = load(array(cloth.mCentrifugalInertia)); + T4f angularInertia = load(array(cloth.mAngularInertia)); + T4f angularAcceleration = curAngularVelocity - prevAngularVelocity; - Simd4f epsilon = simd4f(sqrtf(FLT_MIN)); // requirement: sqr(epsilon) > 0 - Simd4f velocityLengthSqr = lengthSqr(curAngularVelocity) + epsilon; - Simd4f dragLengthSqr = lengthSqr(Simd4f(curAngularVelocity * angularDrag)) + epsilon; - Simd4f centrifugalLengthSqr = lengthSqr(Simd4f(curAngularVelocity * centrifugalInertia)) + epsilon; - Simd4f accelerationLengthSqr = lengthSqr(angularAcceleration) + epsilon; - Simd4f inertiaLengthSqr = lengthSqr(Simd4f(angularAcceleration * angularInertia)) + epsilon; + T4f epsilon = simd4f(sqrtf(FLT_MIN)); // requirement: sqr(epsilon) > 0 + T4f velocityLengthSqr = lengthSqr(curAngularVelocity) + epsilon; + T4f dragLengthSqr = lengthSqr(T4f(curAngularVelocity * angularDrag)) + epsilon; + T4f centrifugalLengthSqr = lengthSqr(T4f(curAngularVelocity * centrifugalInertia)) + epsilon; + T4f accelerationLengthSqr = lengthSqr(angularAcceleration) + epsilon; + T4f inertiaLengthSqr = lengthSqr(T4f(angularAcceleration * angularInertia)) + epsilon; float dragScale = array(rsqrt(velocityLengthSqr * dragLengthSqr) * dragLengthSqr)[0]; float inertiaScale = @@ -337,11 +337,11 @@ cloth::IterationState<Simd4f> cloth::IterationStateFactory::create(MyCloth const inertiaScale; // slightly better in ClothCustomFloating than curInvAngle alone - Simd4f centrifugalVelocity = (prevInvAngle + curInvAngle) * simd4f(0.5f); - const Simd4f data = lengthSqr(centrifugalVelocity); + T4f centrifugalVelocity = (prevInvAngle + curInvAngle) * simd4f(0.5f); + const T4f data = lengthSqr(centrifugalVelocity); float centrifugalSqrLength = array(data)[0] * centrifugalScale; - Simd4f coriolisVelocity = centrifugalVelocity * simd4f(centrifugalScale); + T4f coriolisVelocity = centrifugalVelocity * simd4f(centrifugalScale); physx::PxMat33 coriolisMatrix = 
physx::shdfnd::star(castToPxVec3(coriolisVelocity)); const float* dampScalePtr = array(firstDampScale); @@ -369,7 +369,7 @@ cloth::IterationState<Simd4f> cloth::IterationStateFactory::create(MyCloth const } else { - Simd4f minusOne = -static_cast<Simd4f>(gSimd4fOne); + T4f minusOne = -static_cast<T4f>(gSimd4fOne); result.mRotationMatrix[0] = minusOne; result.mPrevMatrix[0] = select(maskXYZ, firstDampScale, minusOne); } @@ -380,8 +380,8 @@ cloth::IterationState<Simd4f> cloth::IterationStateFactory::create(MyCloth const return result; } -template <typename Simd4f> -void cloth::IterationState<Simd4f>::update() +template <typename T4f> +void cloth::IterationState<T4f>::update() { if (mIsTurning) { diff --git a/NvCloth/src/NvSimd/NvSimdTypes.h b/NvCloth/src/NvSimd/NvSimdTypes.h index 0625332..dd94b40 100644 --- a/NvCloth/src/NvSimd/NvSimdTypes.h +++ b/NvCloth/src/NvSimd/NvSimdTypes.h @@ -104,7 +104,7 @@ void foo(const float* ptr) #define NV_SIMD_INLINE_ASSEMBLER 1 #endif -/*! \def NV_SIMD_USE_NAMESPACE +/*! \def NV_CLOTH_NO_SIMD_NAMESPACE * \brief Set to 1 to define the SIMD library types and functions inside the nvidia::simd namespace. * By default, the types and functions defined in this header live in the global namespace. * This is because MSVC (prior to version 12, Visual Studio 2013) does an inferior job at optimizing @@ -116,11 +116,11 @@ void foo(const float* ptr) * __m128i are wrapped into structs. Arguments need to be passed by reference in this mode. 
* \see NV_SIMD_VECTORCALL, Simd4fArg */ -#if defined NV_SIMD_USE_NAMESPACE&& NV_SIMD_USE_NAMESPACE +#ifndef NV_CLOTH_NO_SIMD_NAMESPACE #define NV_SIMD_NAMESPACE_BEGIN \ namespace nv \ { \ - namespace simd \ + namespace cloth \ { #define NV_SIMD_NAMESPACE_END \ } \ diff --git a/NvCloth/src/PointInterpolator.h b/NvCloth/src/PointInterpolator.h index b9db131..75e1dcf 100644 --- a/NvCloth/src/PointInterpolator.h +++ b/NvCloth/src/PointInterpolator.h @@ -37,7 +37,7 @@ namespace cloth { // acts as a poor mans random access iterator -template <typename Simd4f, typename BaseIterator> +template <typename T4f, typename BaseIterator> class LerpIterator { @@ -50,12 +50,12 @@ class LerpIterator } // return the interpolated point at a given index - inline Simd4f operator[](size_t index) const + inline T4f operator[](size_t index) const { return mStart[index] + (mTarget[index] - mStart[index]) * mAlpha; } - inline Simd4f operator*() const + inline T4f operator*() const { return (*this)[0]; } @@ -70,13 +70,13 @@ class LerpIterator private: // interpolation parameter - const Simd4f mAlpha; + const T4f mAlpha; BaseIterator mStart; BaseIterator mTarget; }; -template <typename Simd4f, size_t Stride> +template <typename T4f, size_t Stride> class UnalignedIterator { @@ -87,12 +87,12 @@ class UnalignedIterator { } - inline Simd4f operator[](size_t index) const + inline T4f operator[](size_t index) const { return load(mPointer + index * Stride); } - inline Simd4f operator*() const + inline T4f operator*() const { return (*this)[0]; } @@ -109,15 +109,15 @@ class UnalignedIterator }; // acts as an iterator but returns a constant -template <typename Simd4f> +template <typename T4f> class ConstantIterator { public: - ConstantIterator(const Simd4f& value) : mValue(value) + ConstantIterator(const T4f& value) : mValue(value) { } - inline Simd4f operator*() const + inline T4f operator*() const { return mValue; } @@ -129,20 +129,20 @@ class ConstantIterator private: ConstantIterator& operator = 
(const ConstantIterator&); - const Simd4f mValue; + const T4f mValue; }; // wraps an iterator with constant scale and bias -template <typename Simd4f, typename BaseIterator> +template <typename T4f, typename BaseIterator> class ScaleBiasIterator { public: - ScaleBiasIterator(BaseIterator base, const Simd4f& scale, const Simd4f& bias) + ScaleBiasIterator(BaseIterator base, const T4f& scale, const T4f& bias) : mScale(scale), mBias(bias), mBaseIterator(base) { } - inline Simd4f operator*() const + inline T4f operator*() const { return (*mBaseIterator) * mScale + mBias; } @@ -156,8 +156,8 @@ class ScaleBiasIterator private: ScaleBiasIterator& operator = (const ScaleBiasIterator&); - const Simd4f mScale; - const Simd4f mBias; + const T4f mScale; + const T4f mBias; BaseIterator mBaseIterator; }; diff --git a/NvCloth/src/SwCollision.cpp b/NvCloth/src/SwCollision.cpp index 89df8a5..0aa196d 100644 --- a/NvCloth/src/SwCollision.cpp +++ b/NvCloth/src/SwCollision.cpp @@ -40,6 +40,7 @@ using namespace nv; using namespace physx; +using namespace cloth; // the particle trajectory needs to penetrate more than 0.2 * radius to trigger continuous collision template <typename T4f> @@ -160,31 +161,41 @@ void generateCones(cloth::ConeData* dst, const cloth::SphereData* sourceSpheres, cloth::ConeData* cIt = dst; for (const cloth::IndexPair* iIt = capsuleIndices, *iEnd = iIt + numCones; iIt != iEnd; ++iIt, ++cIt) { + // w element contains sphere radii PxVec4 first = reinterpret_cast<const PxVec4&>(sourceSpheres[iIt->first]); PxVec4 second = reinterpret_cast<const PxVec4&>(sourceSpheres[iIt->second]); PxVec4 center = (second + first) * 0.5f; - PxVec4 axis = (second - first) * 0.5f; + PxVec4 axis = (second - first) * 0.5f; //half axis + //axis.w = half of radii difference - float sqrAxisLength = axis.x * axis.x + axis.y * axis.y + axis.z * axis.z; - float sqrConeLength = sqrAxisLength - cloth::sqr(axis.w); + // |Axis|^2 + float sqrAxisHalfLength = axis.x * axis.x + axis.y * axis.y + axis.z 
* axis.z; - float invAxisLength = 1 / sqrtf(sqrAxisLength); - float invConeLength = 1 / sqrtf(sqrConeLength); + // http://jwilson.coe.uga.edu/emt669/Student.Folders/Kertscher.Jeff/Essay.3/Tangents.html + // |Axis|^2 = |Cone|^2 + (sphere2Radius-sphere1Radius)^2 + float sqrConeHalfLength = sqrAxisHalfLength - cloth::sqr(axis.w); - if (sqrConeLength <= 0.0f) - invAxisLength = invConeLength = 0.0f; + float invAxisHalfLength = 1 / sqrtf(sqrAxisHalfLength); + float invConeHalfLength = 1 / sqrtf(sqrConeHalfLength); - float axisLength = sqrAxisLength * invAxisLength; - float slope = axis.w * invConeLength; + if (sqrConeHalfLength <= 0.0f) + invAxisHalfLength = invConeHalfLength = 0.0f; + + float axisHalfLength = sqrAxisHalfLength * invAxisHalfLength; + float slope = axis.w * invConeHalfLength; cIt->center = PxVec3(center.x, center.y, center.z ); - cIt->radius = (axis.w + first.w) * invConeLength * axisLength; - cIt->axis = PxVec3(axis.x, axis.y, axis.z) * invAxisLength; + cIt->radius = (axis.w + first.w) * invConeHalfLength * axisHalfLength; //cone radius in the center + cIt->axis = PxVec3(axis.x, axis.y, axis.z) * invAxisHalfLength; cIt->slope = slope; - cIt->sqrCosine = 1.0f - cloth::sqr(axis.w * invAxisLength); - cIt->halfLength = axisLength; + // cos()^2 = 1.0 - (radius difference / axis length)^2 + // cos()^2 = 1.0 - (opposite/hypotenuse)^2 + // cos()^2 = 1.0 - sin(angle between c2c1 and c2t1)^2 + // cos()^2 = 1.0 - sin(angle between axis and c2t1)^2 + cIt->sqrCosine = 1.0f - cloth::sqr(axis.w * invAxisHalfLength); + cIt->halfLength = axisHalfLength; uint32_t firstMask = 0x1u << iIt->first; cIt->firstMask = firstMask; @@ -407,12 +418,14 @@ void cloth::SwCollision<T4f>::buildSphereAcceleration(const SphereData* sIt) { static const int maxIndex = sGridSize - 1; + uint32_t mask = 0x1; //single bit mask for current sphere const SphereData* sEnd = sIt + mClothData.mNumSpheres; - for (uint32_t mask = 0x1; sIt != sEnd; ++sIt, mask <<= 1) + for (; sIt != sEnd; ++sIt, mask <<= 
1) { T4f sphere = loadAligned(array(sIt->center)); T4f radius = splat<3>(sphere); + //calculate the first and last cell index, for each axis, that contains the sphere T4i first = intFloor(max((sphere - radius) * mGridScale + mGridBias, gSimd4fZero)); T4i last = intFloor(min((sphere + radius) * mGridScale + mGridBias, sGridLength)); @@ -422,11 +435,14 @@ void cloth::SwCollision<T4f>::buildSphereAcceleration(const SphereData* sIt) uint32_t* firstIt = reinterpret_cast<uint32_t*>(mSphereGrid); uint32_t* lastIt = firstIt + 3 * sGridSize; + //loop through the 3 axes for (uint32_t i = 0; i < 3; ++i, firstIt += sGridSize, lastIt += sGridSize) { + //mark the sphere and everything to the right for (int j = firstIdx[i]; j <= maxIndex; ++j) firstIt[j] |= mask; + //mark the sphere and everything to the left for (int j = lastIdx[i]; j >= 0; --j) lastIt[j] |= mask; } @@ -469,17 +485,23 @@ void cloth::SwCollision<T4f>::mergeAcceleration(uint32_t* firstIt) template <typename T4f> bool cloth::SwCollision<T4f>::buildAcceleration() { - // determine sphere bbox + // determine single bounding box around all spheres BoundingBox<T4f> sphereBounds = expandBounds(emptyBounds<T4f>(), mCurData.mSpheres, mCurData.mSpheres + mClothData.mNumSpheres); + + // determine single bounding box around all particles BoundingBox<T4f> particleBounds = loadBounds<T4f>(mClothData.mCurBounds); + if (mClothData.mEnableContinuousCollision) { + // extend bounds to include movement from previous frame sphereBounds = expandBounds(sphereBounds, mPrevData.mSpheres, mPrevData.mSpheres + mClothData.mNumSpheres); particleBounds = expandBounds(particleBounds, loadBounds<T4f>(mClothData.mPrevBounds)); } BoundingBox<T4f> bounds = intersectBounds(sphereBounds, particleBounds); + + // no collision checks needed if the intersection between particle bounds and sphere bounds is empty T4f edgeLength = (bounds.mUpper - bounds.mLower) & ~static_cast<T4f>(sMaskW); if (!allGreaterEqual(edgeLength, gSimd4fZero)) return false; @@ 
-490,6 +512,7 @@ bool cloth::SwCollision<T4f>::buildAcceleration() const T4f expandedEdgeLength = max(expandedUpper - expandedLower, gSimd4fEpsilon); // make grid minimal thickness and strict upper bound of spheres + // grid maps bounds to 0-7 space (sGridLength =~= 8) mGridScale = sGridLength * recip<1>(expandedEdgeLength); mGridBias = -expandedLower * mGridScale; array(mGridBias)[3] = 1.0f; // needed for collideVirtualParticles() @@ -655,8 +678,8 @@ struct cloth::SwCollision<T4f>::ImpulseAccumulator mNumCollisions = mNumCollisions + (gSimd4fOne & mask); } - T4f mDeltaX, mDeltaY, mDeltaZ; - T4f mVelX, mVelY, mVelZ; + T4f mDeltaX, mDeltaY, mDeltaZ; //depenetration delta + T4f mVelX, mVelY, mVelZ; //frame offset of the collision shape (velocity * dt) T4f mNumCollisions; }; @@ -684,12 +707,15 @@ FORCE_INLINE void cloth::SwCollision<T4f>::collideSpheres(const T4i& sphereMask, T4f sqrDistance = gSimd4fEpsilon + deltaX * deltaX + deltaY * deltaY + deltaZ * deltaZ; T4f negativeScale = gSimd4fOne - rsqrt(sqrDistance) * splat<3>(sphere); + // negativeScale = 1 - radius/|position-sphere| T4f contactMask; if (!anyGreater(gSimd4fZero, negativeScale, contactMask)) continue; accum.subtract(deltaX, deltaY, deltaZ, negativeScale, contactMask); + // -= delta * negativeScale + // = delta - delta * radius/|position-sphere| if (frictionEnabled) { @@ -730,10 +756,13 @@ cloth::SwCollision<T4f>::collideCones(const T4f* __restrict positions, ImpulseAc T4f center = loadAligned(centerPtr, offset); + // offset from center of cone to particle + // delta = pos - center T4f deltaX = positions[0] - splat<0>(center); T4f deltaY = positions[1] - splat<1>(center); T4f deltaZ = positions[2] - splat<2>(center); + //axis of the cone T4f axis = loadAligned(axisPtr, offset); T4f axisX = splat<0>(axis); @@ -741,12 +770,16 @@ cloth::SwCollision<T4f>::collideCones(const T4f* __restrict positions, ImpulseAc T4f axisZ = splat<2>(axis); T4f slope = splat<3>(axis); + // project delta onto axis T4f dot = 
deltaX * axisX + deltaY * axisY + deltaZ * axisZ; + // interpolate radius T4f radius = dot * slope + splat<3>(center); // set radius to zero if cone is culled radius = max(radius, gSimd4fZero) & ~culled; + // distance to axis + // sqrDistance = |delta|^2 - |dot|^2 T4f sqrDistance = deltaX * deltaX + deltaY * deltaY + deltaZ * deltaZ - dot * dot; T4i auxiliary = loadAligned(auxiliaryPtr, offset); @@ -765,6 +798,8 @@ cloth::SwCollision<T4f>::collideCones(const T4f* __restrict positions, ImpulseAc sqrDistance = max(sqrDistance, gSimd4fEpsilon); T4f invDistance = rsqrt(sqrDistance); + + //offset base to take slope in to account T4f base = dot + slope * sqrDistance * invDistance; // force left/rightMask to false if not inside cone @@ -780,6 +815,7 @@ cloth::SwCollision<T4f>::collideCones(const T4f* __restrict positions, ImpulseAc shapeMask.mSpheres = shapeMask.mSpheres & ~(firstMask & ~leftMask); shapeMask.mSpheres = shapeMask.mSpheres & ~(secondMask & ~rightMask); + //contact normal direction deltaX = deltaX - base * axisX; deltaY = deltaY - base * axisY; deltaZ = deltaZ - base * axisZ; @@ -1173,7 +1209,9 @@ PX_INLINE void calculateFrictionImpulse(const T4f& deltaX, const T4f& deltaY, co T4f ny = deltaY * rcpDelta; T4f nz = deltaZ * rcpDelta; - // calculate relative velocity scaled by number of collisions + // calculate relative velocity + // velXYZ is scaled by one over the number of collisions since all collisions accumulate into + // that variable during collision detection T4f rvx = curPos[0] - prevPos[0] - velX * scale; T4f rvy = curPos[1] - prevPos[1] - velY * scale; T4f rvz = curPos[2] - prevPos[2] - velZ * scale; @@ -1186,7 +1224,7 @@ PX_INLINE void calculateFrictionImpulse(const T4f& deltaX, const T4f& deltaY, co T4f rvty = rvy - rvn * ny; T4f rvtz = rvz - rvn * nz; - // calculate magnitude of vt + // calculate magnitude of relative tangential velocity T4f rcpVt = rsqrt(rvtx * rvtx + rvty * rvty + rvtz * rvtz + gSimd4fEpsilon); // magnitude of friction impulse 
(cannot be greater than -vt) @@ -1206,7 +1244,7 @@ void cloth::SwCollision<T4f>::collideParticles() const T4f massScale = simd4f(mClothData.mCollisionMassScale); const bool frictionEnabled = mClothData.mFrictionScale > 0.0f; - const T4f frictionScale = simd4f(mClothData.mFrictionScale); + const T4f frictionScale = simd4f(mClothData.mFrictionScale); //parameter set by user T4f curPos[4]; T4f prevPos[4]; @@ -1214,16 +1252,20 @@ void cloth::SwCollision<T4f>::collideParticles() float* __restrict prevIt = mClothData.mPrevParticles; float* __restrict pIt = mClothData.mCurParticles; float* __restrict pEnd = pIt + mClothData.mNumParticles * 4; + //loop over particles 4 at a time for (; pIt < pEnd; pIt += 16, prevIt += 16) { curPos[0] = loadAligned(pIt, 0); curPos[1] = loadAligned(pIt, 16); curPos[2] = loadAligned(pIt, 32); curPos[3] = loadAligned(pIt, 48); - transpose(curPos[0], curPos[1], curPos[2], curPos[3]); + transpose(curPos[0], curPos[1], curPos[2], curPos[3]); //group values by axis in simd structure ImpulseAccumulator accum; + + //first collide cones T4i sphereMask = collideCones(curPos, accum); + //pass on hit mask to ignore sphere parts that are inside the cones collideSpheres(sphereMask, curPos, accum); T4f mask; @@ -1267,6 +1309,7 @@ void cloth::SwCollision<T4f>::collideParticles() curPos[3] = select(mask, curPos[3] * scale, curPos[3]); } + //apply average de-penetration delta curPos[0] = curPos[0] + accum.mDeltaX * invNumCollisions; curPos[1] = curPos[1] + accum.mDeltaY * invNumCollisions; curPos[2] = curPos[2] + accum.mDeltaZ * invNumCollisions; diff --git a/NvCloth/src/SwInterCollision.cpp b/NvCloth/src/SwInterCollision.cpp index b9b494f..50be414 100644 --- a/NvCloth/src/SwInterCollision.cpp +++ b/NvCloth/src/SwInterCollision.cpp @@ -39,6 +39,7 @@ using namespace nv; using namespace physx; +using namespace cloth; namespace { diff --git a/NvCloth/src/SwSelfCollision.cpp b/NvCloth/src/SwSelfCollision.cpp index 095943d..ec5a166 100644 --- 
a/NvCloth/src/SwSelfCollision.cpp +++ b/NvCloth/src/SwSelfCollision.cpp @@ -37,23 +37,24 @@ #endif using namespace nv; +using namespace cloth; namespace { -const Simd4fTupleFactory sMaskXYZ = simd4f(simd4i(~0, ~0, ~0, 0)); - // returns sorted indices, output needs to be at least 2*(last - first) + 1024 void radixSort(const uint32_t* first, const uint32_t* last, uint16_t* out) { + // this sort uses a radix (bin) size of 256, requiring 4 bins to sort the 32 bit keys uint16_t n = uint16_t(last - first); uint16_t* buffer = out + 2 * n; uint16_t* __restrict histograms[] = { buffer, buffer + 256, buffer + 512, buffer + 768 }; + //zero the buffer memory used for the 4 buckets memset(buffer, 0, 1024 * sizeof(uint16_t)); - // build 3 histograms in one pass + // build 4 histograms in one pass for (const uint32_t* __restrict it = first; it != last; ++it) { uint32_t key = *it; @@ -64,7 +65,7 @@ void radixSort(const uint32_t* first, const uint32_t* last, uint16_t* out) } // convert histograms to offset tables in-place - uint16_t sums[4] = {}; + uint16_t sums[4] = {0, 0, 0, 0}; for (uint32_t i = 0; i < 256; ++i) { uint16_t temp0 = uint16_t(histograms[0][i] + sums[0]); @@ -133,6 +134,7 @@ bool isSelfCollisionEnabled(const cloth::SwCloth& cloth) return std::min(cloth.mSelfCollisionDistance, -cloth.mSelfCollisionLogStiffness) > 0.0f; } +// align x to a 2 byte boundary inline uint32_t align2(uint32_t x) { return (x + 1) & ~1; @@ -146,7 +148,7 @@ cloth::SwSelfCollision<T4f>::SwSelfCollision(cloth::SwClothData& clothData, clot { mCollisionDistance = simd4f(mClothData.mSelfCollisionDistance); mCollisionSquareDistance = mCollisionDistance * mCollisionDistance; - mStiffness = sMaskXYZ & static_cast<T4f>(simd4f(mClothData.mSelfCollisionStiffness)); + mStiffness = gSimd4fMaskXYZ & static_cast<T4f>(simd4f(mClothData.mSelfCollisionStiffness)); } template <typename T4f> @@ -170,11 +172,12 @@ void cloth::SwSelfCollision<T4f>::operator()() uint32_t hashAxis0 = (sweepAxis + 1) % 3; uint32_t 
hashAxis1 = (sweepAxis + 2) % 3; - // reserve 0, 127, and 65535 for sentinel + // reserve 0, 255, and 65535 for sentinel T4f cellSize = max(mCollisionDistance, simd4f(1.0f / 253) * edgeLength); array(cellSize)[sweepAxis] = array(edgeLength)[sweepAxis] / 65533; T4f one = gSimd4fOne; + // +1 for sentinel 0 offset T4f gridSize = simd4f(254.0f); array(gridSize)[sweepAxis] = 65534.0f; @@ -194,6 +197,7 @@ void cloth::SwSelfCollision<T4f>::operator()() // create keys for (uint32_t i = 0; i < numIndices; ++i) { + // use all particles when no self collision indices are set uint32_t index = indices ? indices[i] : i; // grid coordinate @@ -207,28 +211,32 @@ void cloth::SwSelfCollision<T4f>::operator()() keys[i] = uint32_t(ptr[sweepAxis] | (ptr[hashAxis0] << 16) | (ptr[hashAxis1] << 24)); } - // compute sorted keys indices + // compute sorted key indices radixSort(keys, keys + numIndices, sortedIndices); // snoop histogram: offset of first index with 8 msb > 1 (0 is sentinel) - uint16_t firstColumnSize = sortedIndices[2 * numIndices + 769]; + // sortedIndices[2 * numIndices + 768 + 1] is actually histograms[3]+1 from radixSort + uint16_t firstColumnSize = sortedIndices[2 * numIndices + 768 + 1]; - // sort keys + // sort keys using the sortedIndices for (uint32_t i = 0; i < numIndices; ++i) sortedKeys[i] = keys[sortedIndices[i]]; sortedKeys[numIndices] = uint32_t(-1); // sentinel + // do user provided index array indirection here if we have one + // so we don't need to keep branching for this condition later if (indices) { // sort indices (into no-longer-needed keys array) - const uint16_t* __restrict permutation = sortedIndices; + // the keys array is no longer used so we can reuse it to store indices[sortedIndices[i]] + const uint16_t* __restrict oldSortedIndices = sortedIndices; sortedIndices = reinterpret_cast<uint16_t*>(keys); for (uint32_t i = 0; i < numIndices; ++i) - sortedIndices[i] = uint16_t(indices[permutation[i]]); + sortedIndices[i] = 
uint16_t(indices[oldSortedIndices[i]]); } // calculate the number of buckets we need to search forward - const Simd4i data = intFloor(gridScale * mCollisionDistance); + const Simd4i data = intFloor(gridScale * mCollisionDistance); //equal to or larger than floor(mCollisionDistance) uint32_t collisionDistance = 2 + static_cast<uint32_t>(array(data)[sweepAxis]); // collide particles @@ -310,7 +318,7 @@ void cloth::SwSelfCollision<T4f>::collideParticles(T4f& pos0, T4f& pos1, const T T4f ratio = mCollisionDistance * rsqrt(distSqr); T4f scale = mStiffness * recip(gSimd4fEpsilon + w0 + w1); - T4f delta = (scale * (diff - diff * ratio)) & sMaskXYZ; + T4f delta = (scale * (diff - diff * ratio)) & gSimd4fMaskXYZ; pos0 = pos0 + delta * w0; pos1 = pos1 - delta * w1; @@ -325,42 +333,71 @@ template <bool useRestParticles> void cloth::SwSelfCollision<T4f>::collideParticles(const uint32_t* keys, uint16_t firstColumnSize, const uint16_t* indices, uint32_t collisionDistance) { + //keys is an array of bucket keys for the particles + //indices is an array of particle indices + //collisionDistance is the number of buckets along the sweep axis we need to search after the current one + T4f* __restrict particles = reinterpret_cast<T4f*>(mClothData.mCurParticles); T4f* __restrict restParticles = useRestParticles ? 
reinterpret_cast<T4f*>(mClothData.mRestPositions) : particles; - const uint32_t bucketMask = uint16_t(-1); + //16 lsb's are for the bucket + const uint32_t bucketMask = 0x0000ffff; + // offsets for cells (not along the sweep axis) + // [1] [3]-[1] [3] [1]+[3] const uint32_t keyOffsets[] = { 0, 0x00010000, 0x00ff0000, 0x01000000, 0x01010000 }; const uint32_t* __restrict kFirst[5]; const uint32_t* __restrict kLast[5]; + /* + We use 5 first/last pairs to search the following cells + ===================== + | | | | | | + ===================== + | | | 0 | 1 | | + ===================== + | | 2 | 3 | 4 | | + ===================== + | | | | | | + ===================== + With 0 as the origin. + This way collisions won't be double reported. + */ + { // optimization: scan forward iterator starting points once instead of 9 times const uint32_t* __restrict kIt = keys; uint32_t key = *kIt; + //clamp first/lastKey to bucket uint32_t firstKey = key - std::min(collisionDistance, key & bucketMask); uint32_t lastKey = std::min(key + collisionDistance, key | bucketMask); + //sweep 0 kFirst[0] = kIt; + //find next key in keys that is past lastKey while (*kIt < lastKey) ++kIt; kLast[0] = kIt; + //sweep 1...4 for (uint32_t k = 1; k < 5; ++k) { + // scan forward start point for (uint32_t n = firstKey + keyOffsets[k]; *kIt < n;) ++kIt; kFirst[k] = kIt; + // scan forward end point for (uint32_t n = lastKey + keyOffsets[k]; *kIt < n;) ++kIt; kLast[k] = kIt; - // jump forward once to second column - kIt = keys + firstColumnSize; + // jump forward once to second column to go from cell offset 1 to 2 quickly + if(firstColumnSize) + kIt = keys + firstColumnSize; firstColumnSize = 0; } } @@ -371,7 +408,8 @@ void cloth::SwSelfCollision<T4f>::collideParticles(const uint32_t* keys, uint16_ const uint16_t* __restrict jIt; const uint16_t* __restrict jEnd; - for (; iIt != iEnd; ++iIt, ++kFirst[0]) + //loop through all indices + for (; iIt < iEnd; ++iIt, ++kFirst[0]) { NV_CLOTH_ASSERT(*iIt < 
mClothData.mNumParticles); @@ -390,8 +428,8 @@ void cloth::SwSelfCollision<T4f>::collideParticles(const uint32_t* keys, uint16_ ++kLast[0]; // process potential colliders of same cell - jEnd = indices + (kLast[0] - keys); - for (jIt = iIt + 1; jIt != jEnd; ++jIt) + jEnd = indices + (kLast[0] - keys); //calculate index from key pointer + for (jIt = iIt + 1; jIt < jEnd; ++jIt) collideParticles<useRestParticles>(particle, particles[*jIt], restParticle, restParticles[*jIt]); // process neighbor cells @@ -407,7 +445,7 @@ void cloth::SwSelfCollision<T4f>::collideParticles(const uint32_t* keys, uint16_ // process potential colliders jEnd = indices + (kLast[k] - keys); - for (jIt = indices + (kFirst[k] - keys); jIt != jEnd; ++jIt) + for (jIt = indices + (kFirst[k] - keys); jIt < jEnd; ++jIt) collideParticles<useRestParticles>(particle, particles[*jIt], restParticle, restParticles[*jIt]); } diff --git a/NvCloth/src/SwSolver.cpp b/NvCloth/src/SwSolver.cpp index c7437e1..f0f9152 100644 --- a/NvCloth/src/SwSolver.cpp +++ b/NvCloth/src/SwSolver.cpp @@ -50,7 +50,7 @@ bool neonSolverKernel(SwCloth const&, SwClothData&, SwKernelAllocator&, Iteratio } using namespace nv; - +using namespace cloth; #if NV_SIMD_SIMD typedef Simd4f Simd4fType; #else @@ -93,12 +93,17 @@ void sortTasks(shdfnd::Array<T, cloth::NonTrackingAllocator>& tasks) void cloth::SwSolver::addCloth(Cloth* cloth) { - SwCloth& swCloth = *static_cast<SwCloth*>(cloth); - - mSimulatedCloths.pushBack(SimulatedCloth(swCloth, this)); + addClothAppend(cloth); sortTasks(mSimulatedCloths); +} - mCloths.pushBack(&swCloth); +void cloth::SwSolver::addCloths(Range<Cloth*> cloths) +{ + for (uint32_t i = 0; i < cloths.size(); ++i) + { + addClothAppend(*(cloths.begin() + i)); + } + sortTasks(mSimulatedCloths); } void cloth::SwSolver::removeCloth(Cloth* cloth) @@ -221,6 +226,16 @@ void cloth::SwSolver::interCollision() collider(); } +void cloth::SwSolver::addClothAppend(Cloth* cloth) +{ + SwCloth& swCloth = 
*static_cast<SwCloth*>(cloth); + NV_CLOTH_ASSERT(mCloths.find(&swCloth) == mCloths.end()); + + mSimulatedCloths.pushBack(SimulatedCloth(swCloth, this)); + + mCloths.pushBack(&swCloth); +} + void cloth::SwSolver::beginFrame() const { mSimulateProfileEventData = NV_CLOTH_PROFILE_START_CROSSTHREAD("cloth::SwSolver::simulate", 0); @@ -287,9 +302,14 @@ void cloth::SwSolver::SimulatedCloth::Simulate() // construct kernel functor and execute #if NV_ANDROID - // if (!neonSolverKernel(cloth, data, allocator, factory)) -#endif + if (!neonSolverKernel(*mCloth, data, allocator, factory)) + { + //NV_CLOTH_LOG_WARNING("No NEON CPU support detected. Falling back to scalar types."); + SwSolverKernel<Scalar4f>(*mCloth, data, allocator, factory)(); + } +#else SwSolverKernel<Simd4fType>(*mCloth, data, allocator, factory)(); +#endif data.reconcile(*mCloth); // update cloth } diff --git a/NvCloth/src/SwSolver.h b/NvCloth/src/SwSolver.h index c7b177b..ad58a7c 100644 --- a/NvCloth/src/SwSolver.h +++ b/NvCloth/src/SwSolver.h @@ -64,6 +64,7 @@ class SwSolver : public Solver virtual ~SwSolver(); virtual void addCloth(Cloth*) override; + virtual void addCloths(Range<Cloth*> cloths) override; virtual void removeCloth(Cloth*) override; virtual int getNumCloths() const override; virtual Cloth * const * getClothList() const override; @@ -112,6 +113,10 @@ class SwSolver : public Solver } private: + // add cloth helper functions + void addClothAppend(Cloth* cloth); + + // simulate helper functions void beginFrame() const; void endFrame() const; diff --git a/NvCloth/src/SwSolverKernel.cpp b/NvCloth/src/SwSolverKernel.cpp index eec7956..2181b1e 100644 --- a/NvCloth/src/SwSolverKernel.cpp +++ b/NvCloth/src/SwSolverKernel.cpp @@ -103,6 +103,7 @@ const uint32_t sAvxSupport = getAvxSupport(); // 0: no AVX, 1: AVX, 2: AVX+FMA #endif using namespace nv; +using namespace cloth; namespace { @@ -209,11 +210,14 @@ void constrainMotion(T4f* __restrict curIt, const T4f* __restrict curEnd, const T4f isPositive; 
if (anyGreater(slack, gSimd4fZero, isPositive)) { - // set invMass to zero if radius is zero + // set invMass to zero if radius is zero (xyz will be unchanged) + // curPos.w = radius > 0 ? curPos.w : 0 + // the first three components are compared against -FLT_MAX which is always true curPos0 = curPos0 & (splat<0>(radius) > sMinusFloatMaxXYZ); curPos1 = curPos1 & (splat<1>(radius) > sMinusFloatMaxXYZ); curPos2 = curPos2 & (splat<2>(radius) > sMinusFloatMaxXYZ); curPos3 = curPos3 & ((radius) > sMinusFloatMaxXYZ); + // we don't have to splat the last one as the 4th element is already in the right place slack = slack * stiffness & isPositive; @@ -367,7 +371,7 @@ void solveConstraints(float* __restrict posIt, const float* __restrict rIt, cons } } -#if PX_WINDOWS_FAMILY +#if PX_WINDOWS_FAMILY && NV_SIMD_SSE2 #include "sse2/SwSolveConstraints.h" #endif diff --git a/NvCloth/src/cuda/CuCloth.cpp b/NvCloth/src/cuda/CuCloth.cpp index 3e6175b..4131b04 100644 --- a/NvCloth/src/cuda/CuCloth.cpp +++ b/NvCloth/src/cuda/CuCloth.cpp @@ -423,7 +423,7 @@ void CuCloth::clearParticleAccelerations() { CuContextLock contextLock(mFactory); CuDeviceVector<PxVec4>(mFactory.mContext).swap(mParticleAccelerations); - CuHostVector<PxVec4, CU_MEMHOSTALLOC_DEVICEMAP>::Type().swap(mParticleAccelerationsHostCopy); + CuHostVector<PxVec4, CU_MEMHOSTALLOC_DEVICEMAP>::Type(mFactory.mContext).swap(mParticleAccelerationsHostCopy); wakeUp(); } diff --git a/NvCloth/src/cuda/CuClothData.h b/NvCloth/src/cuda/CuClothData.h index dd836fd..5f2d7c6 100644 --- a/NvCloth/src/cuda/CuClothData.h +++ b/NvCloth/src/cuda/CuClothData.h @@ -32,7 +32,7 @@ #include <foundation/Px.h> #ifndef __CUDACC__ -#include "simd.h" +#include "Simd.h" #endif namespace nv diff --git a/NvCloth/src/cuda/CuFabric.cpp b/NvCloth/src/cuda/CuFabric.cpp index 9bc20db..957f912 100644 --- a/NvCloth/src/cuda/CuFabric.cpp +++ b/NvCloth/src/cuda/CuFabric.cpp @@ -31,6 +31,7 @@ #include "CuContextLock.h" #include "CuFactory.h" #include <PsUtilities.h> 
+#include <limits.h> using namespace physx; diff --git a/NvCloth/src/cuda/CuPinnedAllocator.h b/NvCloth/src/cuda/CuPinnedAllocator.h index 8b1787b..9939324 100644 --- a/NvCloth/src/cuda/CuPinnedAllocator.h +++ b/NvCloth/src/cuda/CuPinnedAllocator.h @@ -29,6 +29,8 @@ #pragma once +#include <utility> + #include "CuCheckSuccess.h" #include "NvCloth/Allocator.h" @@ -102,7 +104,7 @@ public: void destroy(T* ptr) { - core::unused(ptr); + PX_UNUSED(ptr); ptr->~T(); } @@ -122,13 +124,13 @@ bool operator!=(const CuHostAllocator<T1, Flag1>&, const CuHostAllocator<T2, Fla return false; } -//Use CuHostVectorImpl instead of physx::shdfnd::Array<T, typename CuHostAllocator<T, Flags>> +//Use CuHostVectorImpl instead of physx::shdfnd::Array<T, CuHostAllocator<T, Flags>> //This entire class is just to make sure that the mDevicePtr from the CuHostAllocator is properly swapped together with mData template <typename T, unsigned Flags = 0> -class CuHostVectorImpl : public physx::shdfnd::Array<T, typename CuHostAllocator<T, Flags>> +class CuHostVectorImpl : public physx::shdfnd::Array<T, CuHostAllocator<T, Flags>> { - typedef physx::shdfnd::Array<T, typename CuHostAllocator<T, Flags>> Super; - typedef typename CuHostAllocator<T, Flags> Alloc; + typedef physx::shdfnd::Array<T, CuHostAllocator<T, Flags>> Super; + typedef CuHostAllocator<T, Flags> Alloc; public: explicit CuHostVectorImpl(const physx::PxEMPTY v):Super(v){} PX_INLINE explicit CuHostVectorImpl(const Alloc& alloc = Alloc()):Super(alloc){} @@ -142,10 +144,10 @@ public: PX_INLINE explicit CuHostVectorImpl(const T* first, const T* last, const Alloc& alloc = Alloc()):Super(first,last,alloc){} - void swap(physx::shdfnd::Array<T, typename CuHostAllocator<T, Flags>>& other) + void swap(CuHostVectorImpl<T, Flags>& other) { - PX_ASSERT(mContext == other.mContext); - physx::shdfnd::swap(mDevicePtr, other.mDevicePtr); + NV_CLOTH_ASSERT(this->mContext == other.mContext); + physx::shdfnd::swap(this->mDevicePtr, other.mDevicePtr); 
Super::swap(other); } }; diff --git a/NvCloth/src/cuda/CuSolver.cpp b/NvCloth/src/cuda/CuSolver.cpp index f0e328f..7ef1d32 100644 --- a/NvCloth/src/cuda/CuSolver.cpp +++ b/NvCloth/src/cuda/CuSolver.cpp @@ -302,6 +302,28 @@ cloth::CuSolver::~CuSolver() mFactory.mSolverCount--; } +void cloth::CuSolver::addClothAppend(Cloth* cloth) +{ + CuCloth& cuCloth = *static_cast<CuCloth*>(cloth); + + NV_CLOTH_ASSERT(mCloths.find(&cuCloth) == mCloths.end()); + + mCloths.pushBack(&cuCloth); + // trigger update of mClothData array + cuCloth.notifyChanged(); +} + +void cloth::CuSolver::addClothUpdateData() +{ + CuContextLock contextLock(mFactory); + + // resize containers and update kernel data + mClothDataHostCopy.resize(mCloths.size()); + mClothData.resize(mCloths.size()); + mFrameData.resize(mCloths.size()); + updateKernelData(); +} + void cloth::CuSolver::updateKernelData() { mKernelDataHost.mClothIndex = mClothIndex.get(); @@ -326,24 +348,17 @@ struct ClothSimCostGreater void cloth::CuSolver::addCloth(Cloth* cloth) { - CuCloth& cuCloth = *static_cast<CuCloth*>(cloth); - - NV_CLOTH_ASSERT(mCloths.find(&cuCloth) == mCloths.end()); - - mCloths.pushBack(&cuCloth); - // trigger update of mClothData array - cuCloth.notifyChanged(); - - // sort cloth instances by size - shdfnd::sort(mCloths.begin(), mCloths.size(), ClothSimCostGreater(), NonTrackingAllocator()); - - CuContextLock contextLock(mFactory); + addClothAppend(cloth); + addClothUpdateData(); +} - // resize containers and update kernel data - mClothDataHostCopy.resize(mCloths.size()); - mClothData.resize(mCloths.size()); - mFrameData.resize(mCloths.size()); - updateKernelData(); +void cloth::CuSolver::addCloths(Range<Cloth*> cloths) +{ + for (uint32_t i = 0; i < cloths.size(); ++i) + { + addClothAppend(*(cloths.begin() + i)); + } + addClothUpdateData(); } void cloth::CuSolver::removeCloth(Cloth* cloth) @@ -401,7 +416,8 @@ void cloth::CuSolver::endSimulation() int cloth::CuSolver::getSimulationChunkCount() const { - return 1; + 
// 0 chunks when no cloth present in the solver, 1 otherwise + return getNumCloths() != 0; } void cloth::CuSolver::beginFrame() diff --git a/NvCloth/src/cuda/CuSolver.h b/NvCloth/src/cuda/CuSolver.h index 0406e00..b4c6d6b 100644 --- a/NvCloth/src/cuda/CuSolver.h +++ b/NvCloth/src/cuda/CuSolver.h @@ -58,6 +58,7 @@ public: ~CuSolver(); virtual void addCloth(Cloth*) override; + virtual void addCloths(Range<Cloth*> cloths) override; virtual void removeCloth(Cloth*) override; virtual int getNumCloths() const override; virtual Cloth * const * getClothList() const override; @@ -103,6 +104,10 @@ public: } private: + // add cloth helper functions + void addClothAppend(Cloth* cloth); + void addClothUpdateData(); + void updateKernelData(); // context needs to be acquired // simulate helper functions diff --git a/NvCloth/src/dx/DxBatchedVector.h b/NvCloth/src/dx/DxBatchedVector.h index 2c5e313..76b9b22 100644 --- a/NvCloth/src/dx/DxBatchedVector.h +++ b/NvCloth/src/dx/DxBatchedVector.h @@ -297,7 +297,7 @@ class DxBatchedVector void swap(DxBatchedVector<T>& other) { - PX_ASSERT(&mStorage == &other.mStorage); + NV_CLOTH_ASSERT(&mStorage == &other.mStorage); physx::shdfnd::swap(mOffset, other.mOffset); physx::shdfnd::swap(mSize, other.mSize); physx::shdfnd::swap(mCapacity, other.mCapacity); diff --git a/NvCloth/src/dx/DxClothData.h b/NvCloth/src/dx/DxClothData.h index f91d37d..4da9be2 100644 --- a/NvCloth/src/dx/DxClothData.h +++ b/NvCloth/src/dx/DxClothData.h @@ -31,7 +31,7 @@ #pragma once #include <foundation/Px.h> -#include "simd.h" +#include "Simd.h" namespace nv { diff --git a/NvCloth/src/dx/DxFactory.cpp b/NvCloth/src/dx/DxFactory.cpp index fbf0c51..91f5125 100644 --- a/NvCloth/src/dx/DxFactory.cpp +++ b/NvCloth/src/dx/DxFactory.cpp @@ -251,15 +251,15 @@ void cloth::DxFactory::extractFabricData(const Fabric& fabric, Range<uint32_t> p void cloth::DxFactory::extractCollisionData(const Cloth& cloth, Range<PxVec4> spheres, Range<uint32_t> capsules, Range<PxVec4> planes, 
Range<uint32_t> convexes, Range<PxVec3> triangles) const { - PX_ASSERT(&cloth.getFactory() == this); + NV_CLOTH_ASSERT(&cloth.getFactory() == this); const DxCloth& dxCloth = static_cast<const DxCloth&>(cloth); - PX_ASSERT(spheres.empty() || spheres.size() == dxCloth.mStartCollisionSpheres.size()); - PX_ASSERT(capsules.empty() || capsules.size() == dxCloth.mCapsuleIndices.size() * 2); - PX_ASSERT(planes.empty() || planes.size() == dxCloth.mStartCollisionPlanes.size()); - PX_ASSERT(convexes.empty() || convexes.size() == dxCloth.mConvexMasks.size()); - PX_ASSERT(triangles.empty() || triangles.size() == dxCloth.mStartCollisionTriangles.size()); + NV_CLOTH_ASSERT(spheres.empty() || spheres.size() == dxCloth.mStartCollisionSpheres.size()); + NV_CLOTH_ASSERT(capsules.empty() || capsules.size() == dxCloth.mCapsuleIndices.size() * 2); + NV_CLOTH_ASSERT(planes.empty() || planes.size() == dxCloth.mStartCollisionPlanes.size()); + NV_CLOTH_ASSERT(convexes.empty() || convexes.size() == dxCloth.mConvexMasks.size()); + NV_CLOTH_ASSERT(triangles.empty() || triangles.size() == dxCloth.mStartCollisionTriangles.size()); // collision spheres are in pinned memory, so memcpy directly if (!dxCloth.mStartCollisionSpheres.empty() && !spheres.empty()) @@ -296,13 +296,13 @@ void cloth::DxFactory::extractFabricData(const Fabric& fabric, Range<uint32_t> p void cloth::DxFactory::extractMotionConstraints(const Cloth& cloth, Range<PxVec4> destConstraints) const { - PX_ASSERT(&cloth.getFactory() == this); + NV_CLOTH_ASSERT(&cloth.getFactory() == this); const DxCloth& dxCloth = static_cast<const DxCloth&>(cloth); if (dxCloth.mMotionConstraints.mHostCopy.size()) { - PX_ASSERT(destConstraints.size() == dxCloth.mMotionConstraints.mHostCopy.size()); + NV_CLOTH_ASSERT(destConstraints.size() == dxCloth.mMotionConstraints.mHostCopy.size()); memcpy(destConstraints.begin(), dxCloth.mMotionConstraints.mHostCopy.begin(), sizeof(PxVec4) * dxCloth.mMotionConstraints.mHostCopy.size()); @@ -315,20 +315,20 @@ void 
cloth::DxFactory::extractFabricData(const Fabric& fabric, Range<uint32_t> p ? dxCloth.mMotionConstraints.mTarget : dxCloth.mMotionConstraints.mStart; - PX_ASSERT(destConstraints.size() == srcConstraints.size()); + NV_CLOTH_ASSERT(destConstraints.size() == srcConstraints.size()); copyToHost(destConstraints.begin(), srcConstraints.buffer(), 0, destConstraints.size() * sizeof(PxVec4)); } } void cloth::DxFactory::extractSeparationConstraints(const Cloth& cloth, Range<PxVec4> destConstraints) const { - PX_ASSERT(&cloth.getFactory() == this); + NV_CLOTH_ASSERT(&cloth.getFactory() == this); const DxCloth& dxCloth = static_cast<const DxCloth&>(cloth); if (dxCloth.mSeparationConstraints.mHostCopy.size()) { - PX_ASSERT(destConstraints.size() == dxCloth.mSeparationConstraints.mHostCopy.size()); + NV_CLOTH_ASSERT(destConstraints.size() == dxCloth.mSeparationConstraints.mHostCopy.size()); memcpy(destConstraints.begin(), dxCloth.mSeparationConstraints.mHostCopy.begin(), sizeof(PxVec4) * dxCloth.mSeparationConstraints.mHostCopy.size()); @@ -341,7 +341,7 @@ void cloth::DxFactory::extractFabricData(const Fabric& fabric, Range<uint32_t> p ? 
dxCloth.mSeparationConstraints.mTarget : dxCloth.mSeparationConstraints.mStart; - PX_ASSERT(destConstraints.size() == srcConstraints.size()); + NV_CLOTH_ASSERT(destConstraints.size() == srcConstraints.size()); copyToHost(destConstraints.begin(), srcConstraints.buffer(), 0, destConstraints.size() * sizeof(PxVec4)); } @@ -350,12 +350,12 @@ void cloth::DxFactory::extractFabricData(const Fabric& fabric, Range<uint32_t> p void cloth::DxFactory::extractParticleAccelerations(const Cloth& cloth, Range<PxVec4> destAccelerations) const { /* - PX_ASSERT(&cloth.getFactory() == this); + NV_CLOTH_ASSERT(&cloth.getFactory() == this); const DxCloth& dxCloth = static_cast<const DxClothImpl&>(cloth).mCloth; if (dxCloth.mParticleAccelerationsHostCopy.size()) { - PX_ASSERT(dxCloth.mParticleAccelerationsHostCopy.size()); + NV_CLOTH_ASSERT(dxCloth.mParticleAccelerationsHostCopy.size()); memcpy(destAccelerations.begin(), dxCloth.mParticleAccelerationsHostCopy.begin(), sizeof(PxVec4) * dxCloth.mParticleAccelerationsHostCopy.size()); @@ -366,20 +366,20 @@ void cloth::DxFactory::extractFabricData(const Fabric& fabric, Range<uint32_t> p DxBatchedVector<PxVec4> const& srcAccelerations = dxCloth.mParticleAccelerations; - PX_ASSERT(destAccelerations.size() == srcAccelerations.size()); + NV_CLOTH_ASSERT(destAccelerations.size() == srcAccelerations.size()); copyToHost(destAccelerations.begin(), srcAccelerations.buffer(), 0, destAccelerations.size() * sizeof(PxVec4)); } */ PX_UNUSED(&cloth); PX_UNUSED(&destAccelerations); - PX_ASSERT(0); + NV_CLOTH_ASSERT(0); } void cloth::DxFactory::extractVirtualParticles(const Cloth& cloth, Range<uint32_t[4]> destIndices, Range<PxVec3> destWeights) const { - PX_ASSERT(&cloth.getFactory() == this); + NV_CLOTH_ASSERT(&cloth.getFactory() == this); DxContextLock contextLock(*this); @@ -400,7 +400,7 @@ void cloth::DxFactory::extractFabricData(const Fabric& fabric, Range<uint32_t> p for (; srcIt != srcEnd; ++srcIt, ++destIt) *destIt = reinterpret_cast<const 
PxVec3&>(*srcIt); - PX_ASSERT(destIt <= destWeights.end()); + NV_CLOTH_ASSERT(destIt <= destWeights.end()); } if (destIndices.size() > 0) @@ -418,14 +418,14 @@ void cloth::DxFactory::extractFabricData(const Fabric& fabric, Range<uint32_t> p for (; srcIt != srcEnd; ++srcIt, ++destIt) *destIt = Vec4u(*srcIt); - PX_ASSERT(&array(*destIt) <= destIndices.end()); + NV_CLOTH_ASSERT(&array(*destIt) <= destIndices.end()); } } void cloth::DxFactory::extractSelfCollisionIndices(const Cloth& cloth, Range<uint32_t> destIndices) const { const DxCloth& dxCloth = static_cast<const DxCloth&>(cloth); - PX_ASSERT(destIndices.size() == dxCloth.mSelfCollisionIndices.size()); + NV_CLOTH_ASSERT(destIndices.size() == dxCloth.mSelfCollisionIndices.size()); intrinsics::memCopy(destIndices.begin(), dxCloth.mSelfCollisionIndicesHost.begin(), destIndices.size() * sizeof(uint32_t)); } @@ -433,7 +433,7 @@ void cloth::DxFactory::extractFabricData(const Fabric& fabric, Range<uint32_t> p void cloth::DxFactory::extractRestPositions(const Cloth& cloth, Range<PxVec4> destRestPositions) const { const DxCloth& dxCloth = static_cast<const DxCloth&>(cloth); - PX_ASSERT(destRestPositions.size() == dxCloth.mRestPositions.size()); + NV_CLOTH_ASSERT(destRestPositions.size() == dxCloth.mRestPositions.size()); intrinsics::memCopy(destRestPositions.begin(), DxCloth::MappedVec4fVectorType(const_cast<DxCloth&>(dxCloth).mRestPositions).begin(), destRestPositions.size() * sizeof(PxVec4)); } diff --git a/NvCloth/src/dx/DxSolver.cpp b/NvCloth/src/dx/DxSolver.cpp index ab030d5..66a8d8f 100644 --- a/NvCloth/src/dx/DxSolver.cpp +++ b/NvCloth/src/dx/DxSolver.cpp @@ -113,26 +113,9 @@ struct ClothSimCostGreater void cloth::DxSolver::addCloth(Cloth* cloth) { - DxCloth& dxCloth = static_cast<DxCloth&>(*cloth); - - NV_CLOTH_ASSERT(mCloths.find(&dxCloth) == mCloths.end()); - - mCloths.pushBack(&dxCloth); - // trigger update of mClothData array - dxCloth.notifyChanged(); - - // sort cloth instances by size - 
shdfnd::sort(mCloths.begin(), mCloths.size(), ClothSimCostGreater(), NonTrackingAllocator()); - - DxContextLock contextLock(mFactory); - - // resize containers and update kernel data - mClothDataHostCopy.resize(mCloths.size()); - mClothData.resize(mCloths.size()); - mFrameDataHostCopy.resize(mCloths.size()); - - // lazy compilation of compute shader - mComputeError |= mFactory.mSolverKernelComputeShader == nullptr; + addClothAppend(cloth); + addClothUpdateData(); + #if 0 if (!mSortComputeShader && !mComputeError) { @@ -197,7 +180,7 @@ void cloth::DxSolver::addCloth(Cloth* cloth) { uint32_t key = sortElems[i] & ~0xffff; uint32_t keyRef = _SortElemsRef[i] & ~0xffff; - PX_ASSERT(key == keyRef); + NV_CLOTH_ASSERT(key == keyRef); } _SortElemsHostCopy.unmap(); } @@ -208,6 +191,15 @@ void cloth::DxSolver::addCloth(Cloth* cloth) #endif } +void cloth::DxSolver::addCloths(Range<Cloth*> cloths) +{ + for (uint32_t i = 0; i < cloths.size(); ++i) + { + addClothAppend(*(cloths.begin() + i)); + } + addClothUpdateData(); +} + void cloth::DxSolver::removeCloth(Cloth* cloth) { DxCloth& dxCloth = static_cast<DxCloth&>(*cloth); @@ -232,10 +224,9 @@ int cloth::DxSolver::getNumCloths() const } cloth::Cloth * const * cloth::DxSolver::getClothList() const { - if(getNumCloths()) + if (getNumCloths() != 0) return reinterpret_cast<Cloth* const*>(&mCloths[0]); - else - return nullptr; + return nullptr; } bool cloth::DxSolver::beginSimulation(float dt) @@ -260,7 +251,34 @@ void cloth::DxSolver::endSimulation() } int cloth::DxSolver::getSimulationChunkCount() const { - return 1; + // 0 chunks when no cloth present in the solver, 1 otherwise + return getNumCloths() != 0; +} + +void cloth::DxSolver::addClothAppend(Cloth* cloth) +{ + DxCloth& dxCloth = static_cast<DxCloth&>(*cloth); + NV_CLOTH_ASSERT(mCloths.find(&dxCloth) == mCloths.end()); + + mCloths.pushBack(&dxCloth); + // trigger update of mClothData array + dxCloth.notifyChanged(); +} + +void cloth::DxSolver::addClothUpdateData() +{ + // 
sort cloth instances by size + shdfnd::sort(mCloths.begin(), mCloths.size(), ClothSimCostGreater(), NonTrackingAllocator()); + + DxContextLock contextLock(mFactory); + + // resize containers and update kernel data + mClothDataHostCopy.resize(mCloths.size()); + mClothData.resize(mCloths.size()); + mFrameDataHostCopy.resize(mCloths.size()); + + // lazy compilation of compute shader + mComputeError |= mFactory.mSolverKernelComputeShader == nullptr; } void cloth::DxSolver::beginFrame() diff --git a/NvCloth/src/dx/DxSolver.h b/NvCloth/src/dx/DxSolver.h index 09f523a..07d77dc 100644 --- a/NvCloth/src/dx/DxSolver.h +++ b/NvCloth/src/dx/DxSolver.h @@ -56,6 +56,7 @@ class DxSolver : private DxContextLock, public Solver ~DxSolver(); virtual void addCloth(Cloth*) override; + virtual void addCloths(Range<Cloth*> cloths) override; virtual void removeCloth(Cloth*) override; virtual int getNumCloths() const override; virtual Cloth * const * getClothList() const override; @@ -101,6 +102,10 @@ class DxSolver : private DxContextLock, public Solver } private: + // add cloth helper functions + void addClothAppend(Cloth* cloth); + void addClothUpdateData(); + // simulate helper functions void beginFrame(); void executeKernel(); diff --git a/NvCloth/src/neon/NeonSolverKernel.cpp b/NvCloth/src/neon/NeonSolverKernel.cpp index 4d6de68..3e16b6f 100644 --- a/NvCloth/src/neon/NeonSolverKernel.cpp +++ b/NvCloth/src/neon/NeonSolverKernel.cpp @@ -35,15 +35,19 @@ #include <cpu-features.h> -namespace physx +namespace +{ + const bool sNeonSupport = ANDROID_CPU_ARM_FEATURE_NEON & android_getCpuFeatures(); +} + +namespace nv { namespace cloth { bool neonSolverKernel(SwCloth const& cloth, SwClothData& data, SwKernelAllocator& allocator, - IterationStateFactory& factory, PxProfileZone* profileZone) + IterationStateFactory& factory) { - return ANDROID_CPU_ARM_FEATURE_NEON & android_getCpuFeatures() && - (SwSolverKernel<Simd4f>(cloth, data, allocator, factory, profileZone)(), true); + return sNeonSupport 
&& (SwSolverKernel<Simd4f>(cloth, data, allocator, factory)(), true); } } } diff --git a/NvCloth/src/scalar/SwCollisionHelpers.h b/NvCloth/src/scalar/SwCollisionHelpers.h index af21812..3ab756f 100644 --- a/NvCloth/src/scalar/SwCollisionHelpers.h +++ b/NvCloth/src/scalar/SwCollisionHelpers.h @@ -29,6 +29,8 @@ #pragma once +#include "PsMathUtils.h" + namespace nv { namespace cloth @@ -46,6 +48,7 @@ uint32_t findBitSet(uint32_t mask) inline Scalar4i intFloor(const Scalar4f& v) { + using physx::shdfnd::floor; return Scalar4i(int(floor(v.f4[0])), int(floor(v.f4[1])), int(floor(v.f4[2])), int(floor(v.f4[3]))); } diff --git a/NvCloth/src/sse2/SwCollisionHelpers.h b/NvCloth/src/sse2/SwCollisionHelpers.h index c80ba1d..b759868 100644 --- a/NvCloth/src/sse2/SwCollisionHelpers.h +++ b/NvCloth/src/sse2/SwCollisionHelpers.h @@ -63,12 +63,15 @@ Simd4i intFloor(const Simd4f& v) { Simd4i i = _mm_cvttps_epi32(v); return _mm_sub_epi32(i, _mm_srli_epi32(simd4i(v), 31)); + //Simd4i i = truncate(v); + //return i - (simd4i(v) >> 31); } Simd4i horizontalOr(const Simd4i& mask) { Simd4i tmp = mask | _mm_shuffle_epi32(mask, 0xb1); // w z y x -> z w x y return tmp | _mm_shuffle_epi32(tmp, 0x4e); // w z y x -> y x w z +// return splat<0>(mask) | splat<1>(mask) | splat<2>(mask) | splat<3>(mask); } Gather<Simd4i>::Gather(const Simd4i& index) |