// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions // are met: // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // * Neither the name of NVIDIA CORPORATION nor the names of its // contributors may be used to endorse or promote products derived // from this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // // Copyright (c) 2018 NVIDIA Corporation. All rights reserved. // Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. // Copyright (c) 2001-2004 NovodeX AG. All rights reserved. #include "SwCollision.h" #include "SwCloth.h" #include "SwClothData.h" #include "IterationState.h" #include "BoundingBox.h" #include "PointInterpolator.h" #include "SwCollisionHelpers.h" #include "PxAssert.h" #include // for memset using namespace nvidia; // the particle trajectory needs to penetrate more than 0.2 * radius to trigger continuous collision template const Simd4f cloth::SwCollision::sSkeletonWidth = simd4f(sqr(1 - 0.2f) - 1); #if NVMATH_SSE2 const Simd4i cloth::Gather::sIntSignBit = simd4i(_sign); const Simd4i cloth::Gather::sSignedMask = sIntSignBit | simd4i(0x7); #elif NVMATH_NEON const Simd4i cloth::Gather::sPack = simd4i(0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c); const Simd4i cloth::Gather::sOffset = simd4i(0x03020100); const Simd4i cloth::Gather::sShift = simd4i(detail::IntType<2>()); const Simd4i cloth::Gather::sMask = simd4i(detail::IntType<7>()); #endif namespace { typedef Simd4fFactory Simd4fConstant; const Simd4fConstant sEpsilon = simd4f(FLT_EPSILON); const Simd4fConstant sMax = simd4f(FLT_MAX); const Simd4fConstant sMaskX = simd4f(simd4i(~0, 0, 0, 0)); const Simd4fConstant sMaskZ = simd4f(simd4i(0, 0, ~0, 0)); const Simd4fConstant sMaskW = simd4f(simd4i(0, 0, 0, ~0)); const Simd4fConstant sZero = simd4f(0.0f); const Simd4fConstant sOne = simd4f(1.0f); const Simd4fConstant sNegOne = simd4f(-1.0f); const Simd4fConstant sHalf = simd4f(0.5f); const Simd4fConstant sOneXYZ = simd4f(1.0f, 1.0f, 1.0f, 0.0f); const Simd4fConstant sGridLength = simd4f(8 - 1e-3f); // sGridSize const Simd4fConstant sGridExpand = simd4f(1e-4f); const Simd4fConstant sMinusFloatMaxXYZ = simd4f(-FLT_MAX, -FLT_MAX, -FLT_MAX, 0.0f); #if PX_PROFILE || PX_DEBUG template uint32_t horizontalSum(const Simd4f& x) { const float* p = array(x); return uint32_t(0.5f + p[0] + p[1] + p[2] + p[3]); } #endif // 7 elements are written to ptr! template void storeBounds(float* ptr, const cloth::BoundingBox& bounds) { store(ptr, bounds.mLower); store(ptr + 3, bounds.mUpper); } } struct cloth::SphereData { PxVec3 center; float radius; }; struct cloth::ConeData { PxVec3 center; float radius; // cone radius at center PxVec3 axis; float slope; // tan(alpha) float sqrCosine; // cos^2(alpha) float halfLength; uint32_t firstMask; uint32_t bothMask; }; struct cloth::TriangleData { PxVec3 base; float edge0DotEdge1; PxVec3 edge0; float edge0SqrLength; PxVec3 edge1; float edge1SqrLength; PxVec3 normal; float padding; float det; float denom; float edge0InvSqrLength; float edge1InvSqrLength; }; namespace nvidia { namespace cloth { template BoundingBox expandBounds(const BoundingBox& bbox, const SphereData* sIt, const SphereData* sEnd) { BoundingBox result = bbox; for(; sIt != sEnd; ++sIt) { Simd4f p = loadAligned(array(sIt->center)); Simd4f r = splat<3>(p); result.mLower = min(result.mLower, p - r); result.mUpper = max(result.mUpper, p + r); } return result; } } } namespace { template void generateSpheres(Simd4f* dIt, const SrcIterator& src, uint32_t count) { // have to copy out iterator to ensure alignment is maintained for(SrcIterator sIt = src; 0 < count--; ++sIt, ++dIt) *dIt = max(sMinusFloatMaxXYZ, *sIt); // clamp radius to 0 } void generateCones(cloth::ConeData* dst, const cloth::SphereData* sourceSpheres, const cloth::IndexPair* capsuleIndices, uint32_t numCones) { cloth::ConeData* cIt = dst; for(const cloth::IndexPair* iIt = capsuleIndices, *iEnd = iIt + numCones; iIt != iEnd; ++iIt, ++cIt) { PxVec4 first = reinterpret_cast(sourceSpheres[iIt->first]); PxVec4 second = reinterpret_cast(sourceSpheres[iIt->second]); PxVec4 center = (second + first) * 0.5f; PxVec4 axis = (second - first) * 0.5f; float sqrAxisLength = axis.x * axis.x + axis.y * axis.y + axis.z * axis.z; float sqrConeLength = sqrAxisLength - sqr(axis.w); float invAxisLength = 1 / sqrtf(sqrAxisLength); float invConeLength = 1 / sqrtf(sqrConeLength); if(sqrConeLength <= 0.0f) invAxisLength = invConeLength = 0.0f; float axisLength = sqrAxisLength * invAxisLength; float slope = axis.w * invConeLength; cIt->center = PxVec3(center.x, center.y, center.z); cIt->radius = (axis.w + first.w) * invConeLength * axisLength; cIt->axis = PxVec3(axis.x, axis.y, axis.z) * invAxisLength; cIt->slope = slope; cIt->sqrCosine = 1.0f - sqr(axis.w * invAxisLength); cIt->halfLength = axisLength; uint32_t firstMask = 0x1u << iIt->first; cIt->firstMask = firstMask; cIt->bothMask = firstMask | 0x1u << iIt->second; } } template void generatePlanes(Simd4f* dIt, const SrcIterator& src, uint32_t count) { // have to copy out iterator to ensure alignment is maintained for(SrcIterator sIt = src; 0 < count--; ++sIt, ++dIt) *dIt = *sIt; } template void generateTriangles(cloth::TriangleData* dIt, const SrcIterator& src, uint32_t count) { // have to copy out iterator to ensure alignment is maintained for(SrcIterator sIt = src; 0 < count--; ++dIt) { Simd4f p0 = *sIt; ++sIt; Simd4f p1 = *sIt; ++sIt; Simd4f p2 = *sIt; ++sIt; Simd4f edge0 = p1 - p0; Simd4f edge1 = p2 - p0; Simd4f normal = cross3(edge0, edge1); Simd4f edge0SqrLength = dot3(edge0, edge0); Simd4f edge1SqrLength = dot3(edge1, edge1); Simd4f edge0DotEdge1 = dot3(edge0, edge1); Simd4f normalInvLength = rsqrt(dot3(normal, normal)); Simd4f det = edge0SqrLength * edge1SqrLength - edge0DotEdge1 * edge0DotEdge1; Simd4f denom = edge0SqrLength + edge1SqrLength - edge0DotEdge1 - edge0DotEdge1; // there are definitely faster ways... Simd4f aux = select(sMaskX, det, denom); aux = select(sMaskZ, edge0SqrLength, aux); aux = select(sMaskW, edge1SqrLength, aux); storeAligned(&dIt->base.x, select(sMaskW, edge0DotEdge1, p0)); storeAligned(&dIt->edge0.x, select(sMaskW, edge0SqrLength, edge0)); storeAligned(&dIt->edge1.x, select(sMaskW, edge1SqrLength, edge1)); storeAligned(&dIt->normal.x, normal * normalInvLength); storeAligned(&dIt->det, recipT<1>(aux)); } } } // namespace template cloth::SwCollision::CollisionData::CollisionData() : mSpheres(0), mCones(0) { } template cloth::SwCollision::SwCollision(SwClothData& clothData, SwKernelAllocator& alloc, profile::PxProfileZone* profiler) : mClothData(clothData), mAllocator(alloc), mProfiler(profiler) { allocate(mCurData); if(mClothData.mEnableContinuousCollision || mClothData.mFrictionScale > 0.0f) { allocate(mPrevData); generateSpheres(reinterpret_cast(mPrevData.mSpheres), reinterpret_cast(clothData.mStartCollisionSpheres), clothData.mNumSpheres); generateCones(mPrevData.mCones, mPrevData.mSpheres, clothData.mCapsuleIndices, clothData.mNumCapsules); } } template cloth::SwCollision::~SwCollision() { deallocate(mCurData); deallocate(mPrevData); } template void cloth::SwCollision::operator()(const IterationState& state) { mNumCollisions = 0; collideConvexes(state); // discrete convex collision, no friction collideTriangles(state); // discrete triangle collision, no friction computeBounds(); if(!mClothData.mNumSpheres) return; bool lastIteration = state.mRemainingIterations == 1; const Simd4f* targetSpheres = reinterpret_cast(mClothData.mTargetCollisionSpheres); // generate sphere and cone collision data if(!lastIteration) { // interpolate spheres LerpIterator pIter(reinterpret_cast(mClothData.mStartCollisionSpheres), targetSpheres, state.getCurrentAlpha()); generateSpheres(reinterpret_cast(mCurData.mSpheres), pIter, mClothData.mNumSpheres); } else { // otherwise use the target spheres directly generateSpheres(reinterpret_cast(mCurData.mSpheres), targetSpheres, mClothData.mNumSpheres); } // generate cones even if test below fails because // continuous collision might need it in next iteration generateCones(mCurData.mCones, mCurData.mSpheres, mClothData.mCapsuleIndices, mClothData.mNumCapsules); if(buildAcceleration()) { if(mClothData.mEnableContinuousCollision) collideContinuousParticles(); mergeAcceleration((uint32_t*)mSphereGrid); mergeAcceleration((uint32_t*)mConeGrid); if(!mClothData.mEnableContinuousCollision) collideParticles(); collideVirtualParticles(); } if(mPrevData.mSpheres) nvidia::swap(mCurData, mPrevData); } template size_t cloth::SwCollision::estimateTemporaryMemory(const SwCloth& cloth) { size_t numTriangles = cloth.mStartCollisionTriangles.size(); size_t numPlanes = cloth.mStartCollisionPlanes.size(); const size_t kTriangleDataSize = sizeof(TriangleData) * numTriangles; const size_t kPlaneDataSize = sizeof(PxVec4) * numPlanes * 2; return PxMax(kTriangleDataSize, kPlaneDataSize); } template size_t cloth::SwCollision::estimatePersistentMemory(const SwCloth& cloth) { size_t numCapsules = cloth.mCapsuleIndices.size(); size_t numSpheres = cloth.mStartCollisionSpheres.size(); size_t sphereDataSize = sizeof(SphereData) * numSpheres * 2; size_t coneDataSize = sizeof(ConeData) * numCapsules * 2; return sphereDataSize + coneDataSize; } template void cloth::SwCollision::allocate(CollisionData& data) { data.mSpheres = static_cast(mAllocator.allocate(sizeof(SphereData) * mClothData.mNumSpheres)); data.mCones = static_cast(mAllocator.allocate(sizeof(ConeData) * mClothData.mNumCapsules)); } template void cloth::SwCollision::deallocate(const CollisionData& data) { mAllocator.deallocate(data.mSpheres); mAllocator.deallocate(data.mCones); } template void cloth::SwCollision::computeBounds() { #if PX_PROFILE ProfileZone zone("cloth::SwSolverKernel::computeBounds", mProfiler); #endif Simd4f* prevIt = reinterpret_cast(mClothData.mPrevParticles); Simd4f* curIt = reinterpret_cast(mClothData.mCurParticles); Simd4f* curEnd = curIt + mClothData.mNumParticles; Simd4f floatMaxXYZ = -(Simd4f)sMinusFloatMaxXYZ; Simd4f lower = simd4f(FLT_MAX), upper = -lower; for(; curIt < curEnd; ++curIt, ++prevIt) { Simd4f current = *curIt; lower = min(lower, current); upper = max(upper, current); // if(current.w > 0) current.w = previous.w *curIt = select(current > floatMaxXYZ, *prevIt, current); } BoundingBox curBounds; curBounds.mLower = lower; curBounds.mUpper = upper; // don't change this order, storeBounds writes 7 floats BoundingBox prevBounds = loadBounds(mClothData.mCurBounds); storeBounds(mClothData.mCurBounds, curBounds); storeBounds(mClothData.mPrevBounds, prevBounds); } namespace { template Simd4i andNotIsZero(const Simd4i& left, const Simd4i& right) { return simdi::operator==(left & ~right, simd4i(_0)); } } // build per-axis mask arrays of spheres on the right/left of grid cell template void cloth::SwCollision::buildSphereAcceleration(const SphereData* sIt) { static const int maxIndex = sGridSize - 1; const SphereData* sEnd = sIt + mClothData.mNumSpheres; for(uint32_t mask = 0x1; sIt != sEnd; ++sIt, mask <<= 1) { Simd4f sphere = loadAligned(array(sIt->center)); Simd4f radius = splat<3>(sphere); Simd4i first = intFloor(max((sphere - radius) * mGridScale + mGridBias, sZero)); Simd4i last = intFloor(min((sphere + radius) * mGridScale + mGridBias, sGridLength)); const int* firstIdx = simdi::array(first); const int* lastIdx = simdi::array(last); uint32_t* firstIt = (uint32_t*)mSphereGrid; uint32_t* lastIt = firstIt + 3 * sGridSize; for(uint32_t i = 0; i < 3; ++i, firstIt += sGridSize, lastIt += sGridSize) { for(int j = firstIdx[i]; j <= maxIndex; ++j) firstIt[j] |= mask; for(int j = lastIdx[i]; j >= 0; --j) lastIt[j] |= mask; } } } // generate cone masks from sphere masks template void cloth::SwCollision::buildConeAcceleration() { const ConeData* coneIt = mCurData.mCones; const ConeData* coneEnd = coneIt + mClothData.mNumCapsules; for(uint32_t coneMask = 0x1; coneIt != coneEnd; ++coneIt, coneMask <<= 1) { if(coneIt->radius == 0.0f) continue; uint32_t spheresMask = coneIt->bothMask; uint32_t* sphereIt = (uint32_t*)mSphereGrid; uint32_t* sphereEnd = sphereIt + 6 * sGridSize; uint32_t* gridIt = (uint32_t*)mConeGrid; for(; sphereIt != sphereEnd; ++sphereIt, ++gridIt) if(*sphereIt & spheresMask) *gridIt |= coneMask; } } // convert right/left mask arrays into single overlap array template void cloth::SwCollision::mergeAcceleration(uint32_t* firstIt) { uint32_t* firstEnd = firstIt + 3 * sGridSize; uint32_t* lastIt = firstEnd; for(; firstIt != firstEnd; ++firstIt, ++lastIt) *firstIt &= *lastIt; } // build mask of spheres/cones touching a regular grid along each axis template bool cloth::SwCollision::buildAcceleration() { // determine sphere bbox BoundingBox sphereBounds = expandBounds(emptyBounds(), mCurData.mSpheres, mCurData.mSpheres + mClothData.mNumSpheres); BoundingBox particleBounds = loadBounds(mClothData.mCurBounds); if(mClothData.mEnableContinuousCollision) { sphereBounds = expandBounds(sphereBounds, mPrevData.mSpheres, mPrevData.mSpheres + mClothData.mNumSpheres); particleBounds = expandBounds(particleBounds, loadBounds(mClothData.mPrevBounds)); } BoundingBox bounds = intersectBounds(sphereBounds, particleBounds); Simd4f edgeLength = (bounds.mUpper - bounds.mLower) & ~(Simd4f)sMaskW; if(!allGreaterEqual(edgeLength, simd4f(_0))) return false; // calculate an expanded bounds to account for numerical inaccuracy const Simd4f expandedLower = bounds.mLower - abs(bounds.mLower) * sGridExpand; const Simd4f expandedUpper = bounds.mUpper + abs(bounds.mUpper) * sGridExpand; const Simd4f expandedEdgeLength = max(expandedUpper - expandedLower, sEpsilon); // make grid minimal thickness and strict upper bound of spheres mGridScale = sGridLength * recipT<1>(expandedEdgeLength); mGridBias = -expandedLower * mGridScale; array(mGridBias)[3] = 1.0f; // needed for collideVirtualParticles() PX_ASSERT(allTrue(((bounds.mLower * mGridScale + mGridBias) >= simd4f(0.0f)) | sMaskW)); PX_ASSERT(allTrue(((bounds.mUpper * mGridScale + mGridBias) < simd4f(8.0f)) | sMaskW)); memset(mSphereGrid, 0, sizeof(uint32_t) * 6 * (sGridSize)); if(mClothData.mEnableContinuousCollision) buildSphereAcceleration(mPrevData.mSpheres); buildSphereAcceleration(mCurData.mSpheres); memset(mConeGrid, 0, sizeof(uint32_t) * 6 * (sGridSize)); buildConeAcceleration(); return true; } #ifdef _MSC_VER #define FORCE_INLINE __forceinline #else #define FORCE_INLINE inline __attribute__((always_inline)) #endif template FORCE_INLINE typename cloth::SwCollision::ShapeMask& cloth::SwCollision::ShapeMask:: operator=(const ShapeMask& right) { mCones = right.mCones; mSpheres = right.mSpheres; return *this; } template FORCE_INLINE typename cloth::SwCollision::ShapeMask& cloth::SwCollision::ShapeMask:: operator&=(const ShapeMask& right) { mCones = mCones & right.mCones; mSpheres = mSpheres & right.mSpheres; return *this; } template FORCE_INLINE typename cloth::SwCollision::ShapeMask cloth::SwCollision::getShapeMask(const Simd4f& position, const Simd4i* __restrict sphereGrid, const Simd4i* __restrict coneGrid) { Gather gather(intFloor(position)); ShapeMask result; result.mCones = gather(coneGrid); result.mSpheres = gather(sphereGrid); return result; } // lookup acceleration structure and return mask of potential intersectors template FORCE_INLINE typename cloth::SwCollision::ShapeMask cloth::SwCollision::getShapeMask(const Simd4f* __restrict positions) const { Simd4f posX = positions[0] * splat<0>(mGridScale) + splat<0>(mGridBias); Simd4f posY = positions[1] * splat<1>(mGridScale) + splat<1>(mGridBias); Simd4f posZ = positions[2] * splat<2>(mGridScale) + splat<2>(mGridBias); ShapeMask result = getShapeMask(posX, mSphereGrid, mConeGrid); result &= getShapeMask(posY, mSphereGrid + 2, mConeGrid + 2); result &= getShapeMask(posZ, mSphereGrid + 4, mConeGrid + 4); return result; } // lookup acceleration structure and return mask of potential intersectors template FORCE_INLINE typename cloth::SwCollision::ShapeMask cloth::SwCollision::getShapeMask(const Simd4f* __restrict prevPos, const Simd4f* __restrict curPos) const { Simd4f scaleX = splat<0>(mGridScale); Simd4f scaleY = splat<1>(mGridScale); Simd4f scaleZ = splat<2>(mGridScale); Simd4f biasX = splat<0>(mGridBias); Simd4f biasY = splat<1>(mGridBias); Simd4f biasZ = splat<2>(mGridBias); Simd4f prevX = prevPos[0] * scaleX + biasX; Simd4f prevY = prevPos[1] * scaleY + biasY; Simd4f prevZ = prevPos[2] * scaleZ + biasZ; Simd4f curX = curPos[0] * scaleX + biasX; Simd4f curY = curPos[1] * scaleY + biasY; Simd4f curZ = curPos[2] * scaleZ + biasZ; Simd4f maxX = min(max(prevX, curX), sGridLength); Simd4f maxY = min(max(prevY, curY), sGridLength); Simd4f maxZ = min(max(prevZ, curZ), sGridLength); ShapeMask result = getShapeMask(maxX, mSphereGrid, mConeGrid); result &= getShapeMask(maxY, mSphereGrid + 2, mConeGrid + 2); result &= getShapeMask(maxZ, mSphereGrid + 4, mConeGrid + 4); Simd4f zero = simd4f(_0); Simd4f minX = max(min(prevX, curX), zero); Simd4f minY = max(min(prevY, curY), zero); Simd4f minZ = max(min(prevZ, curZ), zero); result &= getShapeMask(minX, mSphereGrid + 6, mConeGrid + 6); result &= getShapeMask(minY, mSphereGrid + 8, mConeGrid + 8); result &= getShapeMask(minZ, mSphereGrid + 10, mConeGrid + 10); return result; } template struct cloth::SwCollision::ImpulseAccumulator { ImpulseAccumulator() : mDeltaX(simd4f(_0)) , mDeltaY(mDeltaX) , mDeltaZ(mDeltaX) , mVelX(mDeltaX) , mVelY(mDeltaX) , mVelZ(mDeltaX) , mNumCollisions(sEpsilon) { } void add(const Simd4f& x, const Simd4f& y, const Simd4f& z, const Simd4f& scale, const Simd4f& mask) { PX_ASSERT(allTrue((mask & x) == (mask & x))); PX_ASSERT(allTrue((mask & y) == (mask & y))); PX_ASSERT(allTrue((mask & z) == (mask & z))); PX_ASSERT(allTrue((mask & scale) == (mask & scale))); Simd4f maskedScale = scale & mask; mDeltaX = mDeltaX + x * maskedScale; mDeltaY = mDeltaY + y * maskedScale; mDeltaZ = mDeltaZ + z * maskedScale; mNumCollisions = mNumCollisions + (simd4f(_1) & mask); } void addVelocity(const Simd4f& vx, const Simd4f& vy, const Simd4f& vz, const Simd4f& mask) { PX_ASSERT(allTrue((mask & vx) == (mask & vx))); PX_ASSERT(allTrue((mask & vy) == (mask & vy))); PX_ASSERT(allTrue((mask & vz) == (mask & vz))); mVelX = mVelX + (vx & mask); mVelY = mVelY + (vy & mask); mVelZ = mVelZ + (vz & mask); } void subtract(const Simd4f& x, const Simd4f& y, const Simd4f& z, const Simd4f& scale, const Simd4f& mask) { PX_ASSERT(allTrue((mask & x) == (mask & x))); PX_ASSERT(allTrue((mask & y) == (mask & y))); PX_ASSERT(allTrue((mask & z) == (mask & z))); PX_ASSERT(allTrue((mask & scale) == (mask & scale))); Simd4f maskedScale = scale & mask; mDeltaX = mDeltaX - x * maskedScale; mDeltaY = mDeltaY - y * maskedScale; mDeltaZ = mDeltaZ - z * maskedScale; mNumCollisions = mNumCollisions + (simd4f(_1) & mask); } Simd4f mDeltaX, mDeltaY, mDeltaZ; Simd4f mVelX, mVelY, mVelZ; Simd4f mNumCollisions; }; template FORCE_INLINE void cloth::SwCollision::collideSpheres(const Simd4i& sphereMask, const Simd4f* positions, ImpulseAccumulator& accum) const { const float* __restrict spherePtr = array(mCurData.mSpheres->center); bool frictionEnabled = mClothData.mFrictionScale > 0.0f; Simd4i mask4 = horizontalOr(sphereMask); uint32_t mask = uint32_t(simdi::array(mask4)[0]); while(mask) { uint32_t test = mask - 1; uint32_t offset = findBitSet(mask & ~test) * sizeof(SphereData); mask = mask & test; Simd4f sphere = loadAligned(spherePtr, offset); Simd4f deltaX = positions[0] - splat<0>(sphere); Simd4f deltaY = positions[1] - splat<1>(sphere); Simd4f deltaZ = positions[2] - splat<2>(sphere); Simd4f sqrDistance = sEpsilon + deltaX * deltaX + deltaY * deltaY + deltaZ * deltaZ; Simd4f negativeScale = simd4f(_1) - rsqrt(sqrDistance) * splat<3>(sphere); Simd4f contactMask; if(!anyGreater(simd4f(_0), negativeScale, contactMask)) continue; accum.subtract(deltaX, deltaY, deltaZ, negativeScale, contactMask); if(frictionEnabled) { // load previous sphere pos const float* __restrict prevSpherePtr = array(mPrevData.mSpheres->center); Simd4f prevSphere = loadAligned(prevSpherePtr, offset); Simd4f velocity = sphere - prevSphere; accum.addVelocity(splat<0>(velocity), splat<1>(velocity), splat<2>(velocity), contactMask); } } } template FORCE_INLINE typename cloth::SwCollision::Simd4i cloth::SwCollision::collideCones(const Simd4f* __restrict positions, ImpulseAccumulator& accum) const { const float* __restrict centerPtr = array(mCurData.mCones->center); const float* __restrict axisPtr = array(mCurData.mCones->axis); const float* __restrict auxiliaryPtr = &mCurData.mCones->sqrCosine; bool frictionEnabled = mClothData.mFrictionScale > 0.0f; ShapeMask shapeMask = getShapeMask(positions); Simd4i mask4 = horizontalOr(shapeMask.mCones); uint32_t mask = uint32_t(simdi::array(mask4)[0]); while(mask) { uint32_t test = mask - 1; uint32_t coneIndex = findBitSet(mask & ~test); uint32_t offset = coneIndex * sizeof(ConeData); mask = mask & test; Simd4i test4 = simdi::operator-(mask4, simd4i(_1)); Simd4f culled = simd4f(andNotIsZero(shapeMask.mCones, test4)); mask4 = mask4 & test4; Simd4f center = loadAligned(centerPtr, offset); Simd4f deltaX = positions[0] - splat<0>(center); Simd4f deltaY = positions[1] - splat<1>(center); Simd4f deltaZ = positions[2] - splat<2>(center); Simd4f axis = loadAligned(axisPtr, offset); Simd4f axisX = splat<0>(axis); Simd4f axisY = splat<1>(axis); Simd4f axisZ = splat<2>(axis); Simd4f slope = splat<3>(axis); Simd4f dot = deltaX * axisX + deltaY * axisY + deltaZ * axisZ; Simd4f radius = dot * slope + splat<3>(center); // set radius to zero if cone is culled radius = max(radius, sZero) & ~culled; Simd4f sqrDistance = deltaX * deltaX + deltaY * deltaY + deltaZ * deltaZ - dot * dot; Simd4i auxiliary = simd4i((Simd4f)loadAligned(auxiliaryPtr, offset)); Simd4i bothMask = splat<3>(auxiliary); Simd4f contactMask; if(!anyGreater(radius * radius, sqrDistance, contactMask)) { // cone only culled when spheres culled, ok to clear those too shapeMask.mSpheres = shapeMask.mSpheres & ~bothMask; continue; } // clamp to a small positive epsilon to avoid numerical error // making sqrDistance negative when point lies on the cone axis sqrDistance = max(sqrDistance, sEpsilon); Simd4f invDistance = rsqrt(sqrDistance); Simd4f base = dot + slope * sqrDistance * invDistance; // force left/rightMask to false if not inside cone base = base & contactMask; Simd4f halfLength = splat<1>(simd4f(auxiliary)); Simd4i leftMask = simd4i(base < -halfLength); Simd4i rightMask = simd4i(base > halfLength); // we use both mask because of the early out above. Simd4i firstMask = splat<2>(auxiliary); Simd4i secondMask = firstMask ^ bothMask; shapeMask.mSpheres = shapeMask.mSpheres & ~(firstMask & ~leftMask); shapeMask.mSpheres = shapeMask.mSpheres & ~(secondMask & ~rightMask); deltaX = deltaX - base * axisX; deltaY = deltaY - base * axisY; deltaZ = deltaZ - base * axisZ; Simd4f sqrCosine = splat<0>(simd4f(auxiliary)); Simd4f scale = radius * invDistance * sqrCosine - sqrCosine; contactMask = contactMask & ~simd4f(leftMask | rightMask); if(!anyTrue(contactMask)) continue; accum.add(deltaX, deltaY, deltaZ, scale, contactMask); if(frictionEnabled) { uint32_t s0 = mClothData.mCapsuleIndices[coneIndex].first; uint32_t s1 = mClothData.mCapsuleIndices[coneIndex].second; float* prevSpheres = reinterpret_cast(mPrevData.mSpheres); float* curSpheres = reinterpret_cast(mCurData.mSpheres); // todo: could pre-compute sphere velocities or it might be // faster to compute cur/prev sphere positions directly Simd4f s0p0 = loadAligned(prevSpheres, s0 * sizeof(SphereData)); Simd4f s0p1 = loadAligned(curSpheres, s0 * sizeof(SphereData)); Simd4f s1p0 = loadAligned(prevSpheres, s1 * sizeof(SphereData)); Simd4f s1p1 = loadAligned(curSpheres, s1 * sizeof(SphereData)); Simd4f v0 = s0p1 - s0p0; Simd4f v1 = s1p1 - s1p0; Simd4f vd = v1 - v0; // dot is in the range -1 to 1, scale and bias to 0 to 1 dot = dot * sHalf + sHalf; // interpolate velocity at contact points Simd4f vx = splat<0>(v0) + dot * splat<0>(vd); Simd4f vy = splat<1>(v0) + dot * splat<1>(vd); Simd4f vz = splat<2>(v0) + dot * splat<2>(vd); accum.addVelocity(vx, vy, vz, contactMask); } } return shapeMask.mSpheres; } template FORCE_INLINE void cloth::SwCollision::collideSpheres(const Simd4i& sphereMask, const Simd4f* __restrict prevPos, Simd4f* __restrict curPos, ImpulseAccumulator& accum) const { const float* __restrict prevSpheres = array(mPrevData.mSpheres->center); const float* __restrict curSpheres = array(mCurData.mSpheres->center); bool frictionEnabled = mClothData.mFrictionScale > 0.0f; Simd4i mask4 = horizontalOr(sphereMask); uint32_t mask = uint32_t(simdi::array(mask4)[0]); while(mask) { uint32_t test = mask - 1; uint32_t offset = findBitSet(mask & ~test) * sizeof(SphereData); mask = mask & test; Simd4f prevSphere = loadAligned(prevSpheres, offset); Simd4f prevX = prevPos[0] - splat<0>(prevSphere); Simd4f prevY = prevPos[1] - splat<1>(prevSphere); Simd4f prevZ = prevPos[2] - splat<2>(prevSphere); Simd4f prevRadius = splat<3>(prevSphere); Simd4f curSphere = loadAligned(curSpheres, offset); Simd4f curX = curPos[0] - splat<0>(curSphere); Simd4f curY = curPos[1] - splat<1>(curSphere); Simd4f curZ = curPos[2] - splat<2>(curSphere); Simd4f curRadius = splat<3>(curSphere); Simd4f sqrDistance = sEpsilon + curX * curX + curY * curY + curZ * curZ; Simd4f dotPrevPrev = prevX * prevX + prevY * prevY + prevZ * prevZ - prevRadius * prevRadius; Simd4f dotPrevCur = prevX * curX + prevY * curY + prevZ * curZ - prevRadius * curRadius; Simd4f dotCurCur = sqrDistance - curRadius * curRadius; Simd4f discriminant = dotPrevCur * dotPrevCur - dotCurCur * dotPrevPrev; Simd4f sqrtD = sqrt(discriminant); Simd4f halfB = dotPrevCur - dotPrevPrev; Simd4f minusA = dotPrevCur - dotCurCur + halfB; // time of impact or 0 if prevPos inside sphere Simd4f toi = recip(minusA) * min(simd4f(_0), halfB + sqrtD); Simd4f collisionMask = (toi < simd4f(_1)) & (halfB < sqrtD); // skip continuous collision if the (un-clamped) particle // trajectory only touches the outer skin of the cone. Simd4f rMin = prevRadius + halfB * minusA * (curRadius - prevRadius); collisionMask = collisionMask & (discriminant > minusA * rMin * rMin * sSkeletonWidth); // a is negative when one sphere is contained in the other, // which is already handled by discrete collision. collisionMask = collisionMask & (minusA < -(Simd4f)sEpsilon); if(!allEqual(collisionMask, simd4f(_0))) { Simd4f deltaX = prevX - curX; Simd4f deltaY = prevY - curY; Simd4f deltaZ = prevZ - curZ; Simd4f oneMinusToi = (simd4f(_1) - toi) & collisionMask; // reduce ccd impulse if (clamped) particle trajectory stays in sphere skin, // i.e. scale by exp2(-k) or 1/(1+k) with k = (tmin - toi) / (1 - toi) Simd4f minusK = sqrtD * recip(minusA * oneMinusToi) & (oneMinusToi > sEpsilon); oneMinusToi = oneMinusToi * recip(sOne - minusK); curX = curX + deltaX * oneMinusToi; curY = curY + deltaY * oneMinusToi; curZ = curZ + deltaZ * oneMinusToi; curPos[0] = splat<0>(curSphere) + curX; curPos[1] = splat<1>(curSphere) + curY; curPos[2] = splat<2>(curSphere) + curZ; sqrDistance = sEpsilon + curX * curX + curY * curY + curZ * curZ; } Simd4f negativeScale = simd4f(_1) - rsqrt(sqrDistance) * curRadius; Simd4f contactMask; if(!anyGreater(simd4f(_0), negativeScale, contactMask)) continue; accum.subtract(curX, curY, curZ, negativeScale, contactMask); if(frictionEnabled) { Simd4f velocity = curSphere - prevSphere; accum.addVelocity(splat<0>(velocity), splat<1>(velocity), splat<2>(velocity), contactMask); } } } template FORCE_INLINE typename cloth::SwCollision::Simd4i cloth::SwCollision::collideCones(const Simd4f* __restrict prevPos, Simd4f* __restrict curPos, ImpulseAccumulator& accum) const { const float* __restrict prevCenterPtr = array(mPrevData.mCones->center); const float* __restrict prevAxisPtr = array(mPrevData.mCones->axis); const float* __restrict prevAuxiliaryPtr = &mPrevData.mCones->sqrCosine; const float* __restrict curCenterPtr = array(mCurData.mCones->center); const float* __restrict curAxisPtr = array(mCurData.mCones->axis); const float* __restrict curAuxiliaryPtr = &mCurData.mCones->sqrCosine; bool frictionEnabled = mClothData.mFrictionScale > 0.0f; ShapeMask shapeMask = getShapeMask(prevPos, curPos); Simd4i mask4 = horizontalOr(shapeMask.mCones); uint32_t mask = uint32_t(simdi::array(mask4)[0]); while(mask) { uint32_t test = mask - 1; uint32_t coneIndex = findBitSet(mask & ~test); uint32_t offset = coneIndex * sizeof(ConeData); mask = mask & test; Simd4i test4 = simdi::operator-(mask4, simd4i(_1)); Simd4f culled = simd4f(andNotIsZero(shapeMask.mCones, test4)); mask4 = mask4 & test4; Simd4f prevCenter = loadAligned(prevCenterPtr, offset); Simd4f prevAxis = loadAligned(prevAxisPtr, offset); Simd4f prevAxisX = splat<0>(prevAxis); Simd4f prevAxisY = splat<1>(prevAxis); Simd4f prevAxisZ = splat<2>(prevAxis); Simd4f prevSlope = splat<3>(prevAxis); Simd4f prevX = prevPos[0] - splat<0>(prevCenter); Simd4f prevY = prevPos[1] - splat<1>(prevCenter); Simd4f prevZ = prevPos[2] - splat<2>(prevCenter); Simd4f prevT = prevY * prevAxisZ - prevZ * prevAxisY; Simd4f prevU = prevZ * prevAxisX - prevX * prevAxisZ; Simd4f prevV = prevX * prevAxisY - prevY * prevAxisX; Simd4f prevDot = prevX * prevAxisX + prevY * prevAxisY + prevZ * prevAxisZ; Simd4f prevRadius = prevDot * prevSlope + splat<3>(prevCenter); Simd4f curCenter = loadAligned(curCenterPtr, offset); Simd4f curAxis = loadAligned(curAxisPtr, offset); Simd4f curAxisX = splat<0>(curAxis); Simd4f curAxisY = splat<1>(curAxis); Simd4f curAxisZ = splat<2>(curAxis); Simd4f curSlope = splat<3>(curAxis); Simd4i curAuxiliary = simd4i((Simd4f)loadAligned(curAuxiliaryPtr, offset)); Simd4f curX = curPos[0] - splat<0>(curCenter); Simd4f curY = curPos[1] - splat<1>(curCenter); Simd4f curZ = curPos[2] - splat<2>(curCenter); Simd4f curT = curY * curAxisZ - curZ * curAxisY; Simd4f curU = curZ * curAxisX - curX * curAxisZ; Simd4f curV = curX * curAxisY - curY * curAxisX; Simd4f curDot = curX * curAxisX + curY * curAxisY + curZ * curAxisZ; Simd4f curRadius = curDot * curSlope + splat<3>(curCenter); Simd4f curSqrDistance = sEpsilon + curT * curT + curU * curU + curV * curV; // set radius to zero if cone is culled prevRadius = max(prevRadius, simd4f(_0)) & ~culled; curRadius = max(curRadius, simd4f(_0)) & ~culled; Simd4f dotPrevPrev = prevT * prevT + prevU * prevU + prevV * prevV - prevRadius * prevRadius; Simd4f dotPrevCur = prevT * curT + prevU * curU + prevV * curV - prevRadius * curRadius; Simd4f dotCurCur = curSqrDistance - curRadius * curRadius; Simd4f discriminant = dotPrevCur * dotPrevCur - dotCurCur * dotPrevPrev; Simd4f sqrtD = sqrt(discriminant); Simd4f halfB = dotPrevCur - dotPrevPrev; Simd4f minusA = dotPrevCur - dotCurCur + halfB; // time of impact or 0 if prevPos inside cone Simd4f toi = recip(minusA) * min(simd4f(_0), halfB + sqrtD); Simd4f collisionMask = (toi < simd4f(_1)) & (halfB < sqrtD); // skip continuous collision if the (un-clamped) particle // trajectory only touches the outer skin of the cone. Simd4f rMin = prevRadius + halfB * minusA * (curRadius - prevRadius); collisionMask = collisionMask & (discriminant > minusA * rMin * rMin * sSkeletonWidth); // a is negative when one cone is contained in the other, // which is already handled by discrete collision. collisionMask = collisionMask & (minusA < -(Simd4f)sEpsilon); // test if any particle hits infinite cone (and 0