diff options
| author | mtamis <[email protected]> | 2017-02-15 16:06:25 +0100 |
|---|---|---|
| committer | mtamis <[email protected]> | 2017-02-15 16:06:25 +0100 |
| commit | 85305930aeeb1d513e23522bd91f29ba81aa6d14 (patch) | |
| tree | 45f1bb20a45a300d1fef107e436cac95602a0e57 /NvCloth/src/SwInterCollision.cpp | |
| download | nvcloth-85305930aeeb1d513e23522bd91f29ba81aa6d14.tar.xz nvcloth-85305930aeeb1d513e23522bd91f29ba81aa6d14.zip | |
NvCloth library v1.0.0
Diffstat (limited to 'NvCloth/src/SwInterCollision.cpp')
| -rw-r--r-- | NvCloth/src/SwInterCollision.cpp | 703 |
1 files changed, 703 insertions, 0 deletions
diff --git a/NvCloth/src/SwInterCollision.cpp b/NvCloth/src/SwInterCollision.cpp new file mode 100644 index 0000000..6d5e013 --- /dev/null +++ b/NvCloth/src/SwInterCollision.cpp @@ -0,0 +1,703 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#include "NvCloth/Callbacks.h" +#include "SwInterCollision.h" +#include "SwCollisionHelpers.h" +#include "BoundingBox.h" +#include <foundation/PxMat44.h> +#include <foundation/PxBounds3.h> +#include <algorithm> +#include <PsSort.h> +#include "NvCloth/Allocator.h" + +using namespace nv; +using namespace physx; + +namespace +{ + +const Simd4fTupleFactory sMaskXYZ = simd4f(simd4i(~0, ~0, ~0, 0)); +const Simd4fTupleFactory sMaskW = simd4f(simd4i(0, 0, 0, ~0)); +const Simd4fScalarFactory sEpsilon = simd4f(FLT_EPSILON); +const Simd4fTupleFactory sZeroW = simd4f(-FLT_MAX, -FLT_MAX, -FLT_MAX, 0.0f); + +// returns sorted indices, output needs to be at least 2*(last - first) + 1024 +void radixSort(const uint32_t* first, const uint32_t* last, uint32_t* out) +{ + uint32_t n = uint32_t(last - first); + + uint32_t* buffer = out + 2 * n; + uint32_t* __restrict histograms[] = { buffer, buffer + 256, buffer + 512, buffer + 768 }; + + memset(buffer, 0, 1024 * sizeof(uint32_t)); + + // build 3 histograms in one pass + for (const uint32_t* __restrict it = first; it != last; ++it) + { + uint32_t key = *it; + ++histograms[0][0xff & key]; + ++histograms[1][0xff & (key >> 8)]; + ++histograms[2][0xff & (key >> 16)]; + ++histograms[3][key >> 24]; + } + + // convert histograms to offset tables in-place + uint32_t sums[4] = {}; + for (uint32_t i = 0; i < 256; ++i) + { + uint32_t temp0 = histograms[0][i] + sums[0]; + histograms[0][i] = sums[0], sums[0] = temp0; + + uint32_t temp1 = histograms[1][i] + sums[1]; + histograms[1][i] = sums[1], sums[1] = temp1; + + uint32_t temp2 = histograms[2][i] + sums[2]; + histograms[2][i] = sums[2], sums[2] = temp2; + + uint32_t temp3 = histograms[3][i] + sums[3]; + histograms[3][i] = sums[3], sums[3] = temp3; + } + + NV_CLOTH_ASSERT(sums[0] == n && sums[1] == n && sums[2] == n && sums[3] == n); + +#if PX_DEBUG + memset(out, 0xff, 2 * n * sizeof(uint32_t)); +#endif + + // sort 8 bits per pass + + uint32_t* __restrict indices[] = { out, out + n }; + + for (uint32_t i = 0; i != n; ++i) + indices[1][histograms[0][0xff & first[i]]++] = i; + + for (uint32_t i = 0, index; index = indices[1][i], i != n; ++i) + indices[0][histograms[1][0xff & (first[index] >> 8)]++] = index; + + for (uint32_t i = 0, index; index = indices[0][i], i != n; ++i) + indices[1][histograms[2][0xff & (first[index] >> 16)]++] = index; + + for (uint32_t i = 0, index; index = indices[1][i], i != n; ++i) + indices[0][histograms[3][first[index] >> 24]++] = index; +} + +template <typename Simd4f> +uint32_t longestAxis(const Simd4f& edgeLength) +{ + const float* e = array(edgeLength); + + if (e[0] > e[1]) + return uint32_t(e[0] > e[2] ? 0 : 2); + else + return uint32_t(e[1] > e[2] ? 1 : 2); +} +} + +template <typename Simd4f> +cloth::SwInterCollision<Simd4f>::SwInterCollision(const cloth::SwInterCollisionData* instances, uint32_t n, + float colDist, float stiffness, uint32_t iterations, + InterCollisionFilter filter, cloth::SwKernelAllocator& alloc) +: mInstances(instances) +, mNumInstances(n) +, mClothIndices(NULL) +, mParticleIndices(NULL) +, mNumParticles(0) +, mTotalParticles(0) +, mFilter(filter) +, mAllocator(alloc) +{ + NV_CLOTH_ASSERT(mFilter); + + mCollisionDistance = simd4f(colDist, colDist, colDist, 0.0f); + mCollisionSquareDistance = mCollisionDistance * mCollisionDistance; + mStiffness = simd4f(stiffness); + mNumIterations = iterations; + + // calculate particle size + for (uint32_t i = 0; i < n; ++i) + mTotalParticles += instances[i].mNumParticles; +} + +template <typename Simd4f> +cloth::SwInterCollision<Simd4f>::~SwInterCollision() +{ +} + +namespace +{ +// multiple x by m leaving w component of x intact +template <typename Simd4f> +PX_INLINE Simd4f transform(const Simd4f m[4], const Simd4f& x) +{ + const Simd4f a = m[3] + splat<0>(x) * m[0] + splat<1>(x) * m[1] + splat<2>(x) * m[2]; + return select(sMaskXYZ, a, x); +} + +// rotate x by m leaving w component intact +template <typename Simd4f> +PX_INLINE Simd4f rotate(const Simd4f m[4], const Simd4f& x) +{ + const Simd4f a = splat<0>(x) * m[0] + splat<1>(x) * m[1] + splat<2>(x) * m[2]; + return select(sMaskXYZ, a, x); +} + +template <typename Simd4f> +struct ClothSorter +{ + typedef cloth::BoundingBox<Simd4f> BoundingBox; + + ClothSorter(BoundingBox* bounds, uint32_t n, uint32_t axis) : mBounds(bounds), mNumBounds(n), mAxis(axis) + { + } + + bool operator()(uint32_t i, uint32_t j) const + { + NV_CLOTH_ASSERT(i < mNumBounds); + NV_CLOTH_ASSERT(j < mNumBounds); + + return array(mBounds[i].mLower)[mAxis] < array(mBounds[j].mLower)[mAxis]; + } + + BoundingBox* mBounds; + uint32_t mNumBounds; + uint32_t mAxis; +}; + +// for the given cloth array this function calculates the set of particles +// which potentially interact, the potential colliders are returned with their +// cloth index and particle index in clothIndices and particleIndices, the +// function returns the number of potential colliders +template <typename Simd4f> +uint32_t calculatePotentialColliders(const cloth::SwInterCollisionData* cBegin, const cloth::SwInterCollisionData* cEnd, + const Simd4f& colDist, uint16_t* clothIndices, uint32_t* particleIndices, + cloth::BoundingBox<Simd4f>& bounds, uint32_t* overlapMasks, + cloth::InterCollisionFilter filter, cloth::SwKernelAllocator& allocator) +{ + using namespace cloth; + + typedef BoundingBox<Simd4f> BoundingBox; + + uint32_t numParticles = 0; + const uint32_t numCloths = uint32_t(cEnd - cBegin); + + // bounds of each cloth objects in world space + BoundingBox* const clothBounds = static_cast<BoundingBox*>(allocator.allocate(numCloths * sizeof(BoundingBox))); + BoundingBox* const overlapBounds = static_cast<BoundingBox*>(allocator.allocate(numCloths * sizeof(BoundingBox))); + + // union of all cloth world bounds + BoundingBox totalClothBounds = emptyBounds<Simd4f>(); + + uint32_t* sortedIndices = static_cast<uint32_t*>(allocator.allocate(numCloths * sizeof(uint32_t))); + + for (uint32_t i = 0; i < numCloths; ++i) + { + const SwInterCollisionData& c = cBegin[i]; + + // transform bounds from b local space to local space of a + PxBounds3 lcBounds = PxBounds3::centerExtents(c.mBoundsCenter, c.mBoundsHalfExtent + PxVec3(array(colDist)[0])); + NV_CLOTH_ASSERT(!lcBounds.isEmpty()); + PxBounds3 cWorld = PxBounds3::transformFast(c.mGlobalPose,lcBounds); + + BoundingBox cBounds = { simd4f(cWorld.minimum.x, cWorld.minimum.y, cWorld.minimum.z, 0.0f), + simd4f(cWorld.maximum.x, cWorld.maximum.y, cWorld.maximum.z, 0.0f) }; + + sortedIndices[i] = i; + clothBounds[i] = cBounds; + + totalClothBounds = expandBounds(totalClothBounds, cBounds); + } + + // sort indices by their minimum extent on the longest axis + const uint32_t sweepAxis = longestAxis(totalClothBounds.mUpper - totalClothBounds.mLower); + + ClothSorter<Simd4f> predicate(clothBounds, numCloths, sweepAxis); + shdfnd::sort(sortedIndices, numCloths, predicate, nv::cloth::NonTrackingAllocator()); + + for (uint32_t i = 0; i < numCloths; ++i) + { + NV_CLOTH_ASSERT(sortedIndices[i] < numCloths); + + const SwInterCollisionData& a = cBegin[sortedIndices[i]]; + + // local bounds + const Simd4f aCenter = load(reinterpret_cast<const float*>(&a.mBoundsCenter)); + const Simd4f aHalfExtent = load(reinterpret_cast<const float*>(&a.mBoundsHalfExtent)) + colDist; + const BoundingBox aBounds = { aCenter - aHalfExtent, aCenter + aHalfExtent }; + + const PxMat44 aToWorld = PxMat44(a.mGlobalPose); + const PxTransform aToLocal = a.mGlobalPose.getInverse(); + + const float axisMin = array(clothBounds[sortedIndices[i]].mLower)[sweepAxis]; + const float axisMax = array(clothBounds[sortedIndices[i]].mUpper)[sweepAxis]; + + uint32_t overlapMask = 0; + uint32_t numOverlaps = 0; + + // scan back to find first intersecting bounding box + uint32_t startIndex = i; + while (startIndex > 0 && array(clothBounds[sortedIndices[startIndex]].mUpper)[sweepAxis] > axisMin) + --startIndex; + + // compute all overlapping bounds + for (uint32_t j = startIndex; j < numCloths; ++j) + { + // ignore self-collision + if (i == j) + continue; + + // early out if no more cloths along axis intersect us + if (array(clothBounds[sortedIndices[j]].mLower)[sweepAxis] > axisMax) + break; + + const SwInterCollisionData& b = cBegin[sortedIndices[j]]; + + // check if collision between these shapes is filtered + if (!filter(a.mUserData, b.mUserData)) + continue; + + // set mask bit for this cloth + overlapMask |= 1 << sortedIndices[j]; + + // transform bounds from b local space to local space of a + PxBounds3 lcBounds = PxBounds3::centerExtents(b.mBoundsCenter, b.mBoundsHalfExtent + PxVec3(array(colDist)[0])); + NV_CLOTH_ASSERT(!lcBounds.isEmpty()); + PxBounds3 bLocal = PxBounds3::transformFast(aToLocal * b.mGlobalPose,lcBounds); + + BoundingBox bBounds = { simd4f(bLocal.minimum.x, bLocal.minimum.y, bLocal.minimum.z, 0.0f), + simd4f(bLocal.maximum.x, bLocal.maximum.y, bLocal.maximum.z, 0.0f) }; + + BoundingBox iBounds = intersectBounds(aBounds, bBounds); + + // setup bounding box w to make point containment test cheaper + Simd4f floatMax = gSimd4fFloatMax & static_cast<Simd4f>(sMaskW); + iBounds.mLower = (iBounds.mLower & sMaskXYZ) | -floatMax; + iBounds.mUpper = (iBounds.mUpper & sMaskXYZ) | floatMax; + + if (!isEmptyBounds(iBounds)) + overlapBounds[numOverlaps++] = iBounds; + } + + //---------------------------------------------------------------- + // cull all particles to overlapping bounds and transform particles to world space + + const uint32_t clothIndex = sortedIndices[i]; + overlapMasks[clothIndex] = overlapMask; + + Simd4f* pBegin = reinterpret_cast<Simd4f*>(a.mParticles); + Simd4f* qBegin = reinterpret_cast<Simd4f*>(a.mPrevParticles); + + const Simd4f xform[4] = { load(reinterpret_cast<const float*>(&aToWorld.column0)), + load(reinterpret_cast<const float*>(&aToWorld.column1)), + load(reinterpret_cast<const float*>(&aToWorld.column2)), + load(reinterpret_cast<const float*>(&aToWorld.column3)) }; + + Simd4f impulseInvScale = recip(Simd4f(simd4f(cBegin[clothIndex].mImpulseScale))); + + for (uint32_t k = 0; k < a.mNumParticles; ++k) + { + Simd4f* pIt = a.mIndices ? pBegin + a.mIndices[k] : pBegin + k; + Simd4f* qIt = a.mIndices ? qBegin + a.mIndices[k] : qBegin + k; + + const Simd4f p = *pIt; + + for (const BoundingBox* oIt = overlapBounds, *oEnd = overlapBounds + numOverlaps; oIt != oEnd; ++oIt) + { + // point in box test + if (anyGreater(oIt->mLower, p) != 0) + continue; + if (anyGreater(p, oIt->mUpper) != 0) + continue; + + // transform particle to world space in-place + // (will be transformed back after collision) + *pIt = transform(xform, p); + + Simd4f impulse = (p - *qIt) * impulseInvScale; + *qIt = rotate(xform, impulse); + + // update world bounds + bounds = expandBounds(bounds, pIt, pIt + 1); + + // add particle to output arrays + clothIndices[numParticles] = uint16_t(clothIndex); + particleIndices[numParticles] = uint32_t(pIt - pBegin); + + // output each particle only once + ++numParticles; + break; + } + } + } + + allocator.deallocate(sortedIndices); + allocator.deallocate(overlapBounds); + allocator.deallocate(clothBounds); + + return numParticles; +} +} + +template <typename Simd4f> +PX_INLINE Simd4f& cloth::SwInterCollision<Simd4f>::getParticle(uint32_t index) +{ + NV_CLOTH_ASSERT(index < mNumParticles); + + uint16_t clothIndex = mClothIndices[index]; + uint32_t particleIndex = mParticleIndices[index]; + + NV_CLOTH_ASSERT(clothIndex < mNumInstances); + + return reinterpret_cast<Simd4f&>(mInstances[clothIndex].mParticles[particleIndex]); +} + +template <typename Simd4f> +void cloth::SwInterCollision<Simd4f>::operator()() +{ + mNumTests = mNumCollisions = 0; + + mClothIndices = static_cast<uint16_t*>(mAllocator.allocate(sizeof(uint16_t) * mTotalParticles)); + mParticleIndices = static_cast<uint32_t*>(mAllocator.allocate(sizeof(uint32_t) * mTotalParticles)); + mOverlapMasks = static_cast<uint32_t*>(mAllocator.allocate(sizeof(uint32_t*) * mNumInstances)); + + for (uint32_t k = 0; k < mNumIterations; ++k) + { + // world bounds of particles + BoundingBox<Simd4f> bounds = emptyBounds<Simd4f>(); + + // calculate potentially colliding set + { + NV_CLOTH_PROFILE_ZONE("cloth::SwInterCollision::BroadPhase", /*ProfileContext::None*/ 0); + + mNumParticles = + calculatePotentialColliders(mInstances, mInstances + mNumInstances, mCollisionDistance, mClothIndices, + mParticleIndices, bounds, mOverlapMasks, mFilter, mAllocator); + } + + // collide + if (mNumParticles) + { + NV_CLOTH_PROFILE_ZONE("cloth::SwInterCollision::Collide", /*ProfileContext::None*/ 0); + + Simd4f lowerBound = bounds.mLower; + Simd4f edgeLength = max(bounds.mUpper - lowerBound, sEpsilon); + + // sweep along longest axis + uint32_t sweepAxis = longestAxis(edgeLength); + uint32_t hashAxis0 = (sweepAxis + 1) % 3; + uint32_t hashAxis1 = (sweepAxis + 2) % 3; + + // reserve 0, 127, and 65535 for sentinel + Simd4f cellSize = max(mCollisionDistance, simd4f(1.0f / 253) * edgeLength); + array(cellSize)[sweepAxis] = array(edgeLength)[sweepAxis] / 65533; + + Simd4f one = gSimd4fOne; + Simd4f gridSize = simd4f(254.0f); + array(gridSize)[sweepAxis] = 65534.0f; + + Simd4f gridScale = recip<1>(cellSize); + Simd4f gridBias = -lowerBound * gridScale + one; + + void* buffer = mAllocator.allocate(getBufferSize(mNumParticles)); + + uint32_t* __restrict sortedIndices = reinterpret_cast<uint32_t*>(buffer); + uint32_t* __restrict sortedKeys = sortedIndices + mNumParticles; + uint32_t* __restrict keys = std::max(sortedKeys + mNumParticles, sortedIndices + 2 * mNumParticles + 1024); + + typedef typename Simd4fToSimd4i<Simd4f>::Type Simd4i; + + // create keys + for (uint32_t i = 0; i < mNumParticles; ++i) + { + // grid coordinate + Simd4f indexf = getParticle(i) * gridScale + gridBias; + + // need to clamp index because shape collision potentially + // pushes particles outside of their original bounds + Simd4i indexi = intFloor(max(one, min(indexf, gridSize))); + + const int32_t* ptr = array(indexi); + keys[i] = uint32_t(ptr[sweepAxis] | (ptr[hashAxis0] << 16) | (ptr[hashAxis1] << 24)); + } + + // compute sorted keys indices + radixSort(keys, keys + mNumParticles, sortedIndices); + + // snoop histogram: offset of first index with 8 msb > 1 (0 is sentinel) + uint32_t firstColumnSize = sortedIndices[2 * mNumParticles + 769]; + + // sort keys + for (uint32_t i = 0; i < mNumParticles; ++i) + sortedKeys[i] = keys[sortedIndices[i]]; + sortedKeys[mNumParticles] = uint32_t(-1); // sentinel + + // calculate the number of buckets we need to search forward + const Simd4i data = intFloor(gridScale * mCollisionDistance); + uint32_t collisionDistance = uint32_t(2 + array(data)[sweepAxis]); + + // collide particles + collideParticles(sortedKeys, firstColumnSize, sortedIndices, mNumParticles, collisionDistance); + + mAllocator.deallocate(buffer); + } + + /* + // verify against brute force (disable collision response when testing) + uint32_t numCollisions = mNumCollisions; + mNumCollisions = 0; + + for (uint32_t i = 0; i < mNumParticles; ++i) + for (uint32_t j = i + 1; j < mNumParticles; ++j) + if (mOverlapMasks[mClothIndices[i]] & (1 << mClothIndices[j])) + collideParticles(getParticle(i), getParticle(j)); + + static uint32_t iter = 0; ++iter; + if (numCollisions != mNumCollisions) + printf("%u: %u != %u\n", iter, numCollisions, mNumCollisions); + */ + + // transform back to local space + { + NV_CLOTH_PROFILE_ZONE("cloth::SwInterCollision::PostTransform", /*ProfileContext::None*/ 0); + + Simd4f toLocal[4], impulseScale; + uint16_t lastCloth = uint16_t(0xffff); + + for (uint32_t i = 0; i < mNumParticles; ++i) + { + uint16_t clothIndex = mClothIndices[i]; + const SwInterCollisionData* instance = mInstances + clothIndex; + + // todo: could pre-compute these inverses + if (clothIndex != lastCloth) + { + const PxMat44 xform = PxMat44(instance->mGlobalPose.getInverse()); + + toLocal[0] = load(reinterpret_cast<const float*>(&xform.column0)); + toLocal[1] = load(reinterpret_cast<const float*>(&xform.column1)); + toLocal[2] = load(reinterpret_cast<const float*>(&xform.column2)); + toLocal[3] = load(reinterpret_cast<const float*>(&xform.column3)); + + impulseScale = simd4f(instance->mImpulseScale); + + lastCloth = mClothIndices[i]; + } + + uint32_t particleIndex = mParticleIndices[i]; + Simd4f& particle = reinterpret_cast<Simd4f&>(instance->mParticles[particleIndex]); + Simd4f& impulse = reinterpret_cast<Simd4f&>(instance->mPrevParticles[particleIndex]); + + particle = transform(toLocal, particle); + // avoid w becoming negative due to numerical inaccuracies + impulse = max(sZeroW, particle - rotate(toLocal, Simd4f(impulse * impulseScale))); + } + } + } + + mAllocator.deallocate(mOverlapMasks); + mAllocator.deallocate(mParticleIndices); + mAllocator.deallocate(mClothIndices); +} + +template <typename Simd4f> +size_t cloth::SwInterCollision<Simd4f>::estimateTemporaryMemory(SwInterCollisionData* cloths, uint32_t n) +{ + // count total particles + uint32_t numParticles = 0; + for (uint32_t i = 0; i < n; ++i) + numParticles += cloths[i].mNumParticles; + + uint32_t boundsSize = 2 * n * sizeof(BoundingBox<Simd4f>) + n * sizeof(uint32_t); + uint32_t clothIndicesSize = numParticles * sizeof(uint16_t); + uint32_t particleIndicesSize = numParticles * sizeof(uint32_t); + uint32_t masksSize = n * sizeof(uint32_t); + + return boundsSize + clothIndicesSize + particleIndicesSize + masksSize + getBufferSize(numParticles); +} + +template <typename Simd4f> +size_t cloth::SwInterCollision<Simd4f>::getBufferSize(uint32_t numParticles) +{ + uint32_t keysSize = numParticles * sizeof(uint32_t); + uint32_t indicesSize = numParticles * sizeof(uint32_t); + uint32_t histogramSize = 1024 * sizeof(uint32_t); + + return keysSize + indicesSize + std::max(indicesSize + histogramSize, keysSize); +} + +template <typename Simd4f> +void cloth::SwInterCollision<Simd4f>::collideParticle(uint32_t index) +{ + uint16_t clothIndex = mClothIndices[index]; + + if ((1 << clothIndex) & ~mClothMask) + return; + + const SwInterCollisionData* instance = mInstances + clothIndex; + + uint32_t particleIndex = mParticleIndices[index]; + Simd4f& particle = reinterpret_cast<Simd4f&>(instance->mParticles[particleIndex]); + + Simd4f diff = particle - mParticle; + Simd4f distSqr = dot3(diff, diff); + +#if PX_DEBUG + ++mNumTests; +#endif + + if (allGreater(distSqr, mCollisionSquareDistance)) + return; + + Simd4f w0 = splat<3>(mParticle); + Simd4f w1 = splat<3>(particle); + + Simd4f ratio = mCollisionDistance * rsqrt<1>(distSqr); + Simd4f scale = mStiffness * recip<1>(sEpsilon + w0 + w1); + Simd4f delta = (scale * (diff - diff * ratio)) & sMaskXYZ; + + mParticle = mParticle + delta * w0; + particle = particle - delta * w1; + + Simd4f& impulse = reinterpret_cast<Simd4f&>(instance->mPrevParticles[particleIndex]); + + mImpulse = mImpulse + delta * w0; + impulse = impulse - delta * w1; + +#if PX_DEBUG || PX_PROFILE + ++mNumCollisions; +#endif +} + +template <typename Simd4f> +void cloth::SwInterCollision<Simd4f>::collideParticles(const uint32_t* keys, uint32_t firstColumnSize, + const uint32_t* indices, uint32_t numParticles, + uint32_t collisionDistance) +{ + const uint32_t bucketMask = uint16_t(-1); + + const uint32_t keyOffsets[] = { 0, 0x00010000, 0x00ff0000, 0x01000000, 0x01010000 }; + + const uint32_t* __restrict kFirst[5]; + const uint32_t* __restrict kLast[5]; + + { + // optimization: scan forward iterator starting points once instead of 9 times + const uint32_t* __restrict kIt = keys; + + uint32_t key = *kIt; + uint32_t firstKey = key - std::min(collisionDistance, key & bucketMask); + uint32_t lastKey = std::min(key + collisionDistance, key | bucketMask); + + kFirst[0] = kIt; + while (*kIt < lastKey) + ++kIt; + kLast[0] = kIt; + + for (uint32_t k = 1; k < 5; ++k) + { + for (uint32_t n = firstKey + keyOffsets[k]; *kIt < n;) + ++kIt; + kFirst[k] = kIt; + + for (uint32_t n = lastKey + keyOffsets[k]; *kIt < n;) + ++kIt; + kLast[k] = kIt; + + // jump forward once to second column + kIt = keys + firstColumnSize; + firstColumnSize = 0; + } + } + + const uint32_t* __restrict iIt = indices; + const uint32_t* __restrict iEnd = indices + numParticles; + + const uint32_t* __restrict jIt; + const uint32_t* __restrict jEnd; + + for (; iIt != iEnd; ++iIt, ++kFirst[0]) + { + // load current particle once outside of inner loop + uint32_t index = *iIt; + NV_CLOTH_ASSERT(index < mNumParticles); + mClothIndex = mClothIndices[index]; + NV_CLOTH_ASSERT(mClothIndex < mNumInstances); + mClothMask = mOverlapMasks[mClothIndex]; + + const SwInterCollisionData* instance = mInstances + mClothIndex; + + mParticleIndex = mParticleIndices[index]; + mParticle = reinterpret_cast<const Simd4f&>(instance->mParticles[mParticleIndex]); + mImpulse = reinterpret_cast<const Simd4f&>(instance->mPrevParticles[mParticleIndex]); + + uint32_t key = *kFirst[0]; + + // range of keys we need to check against for this particle + uint32_t firstKey = key - std::min(collisionDistance, key & bucketMask); + uint32_t lastKey = std::min(key + collisionDistance, key | bucketMask); + + // scan forward end point + while (*kLast[0] < lastKey) + ++kLast[0]; + + // process potential colliders of same cell + jEnd = indices + (kLast[0] - keys); + for (jIt = iIt + 1; jIt != jEnd; ++jIt) + collideParticle(*jIt); + + // process neighbor cells + for (uint32_t k = 1; k < 5; ++k) + { + // scan forward start point + for (uint32_t n = firstKey + keyOffsets[k]; *kFirst[k] < n;) + ++kFirst[k]; + + // scan forward end point + for (uint32_t n = lastKey + keyOffsets[k]; *kLast[k] < n;) + ++kLast[k]; + + // process potential colliders + jEnd = indices + (kLast[k] - keys); + for (jIt = indices + (kFirst[k] - keys); jIt != jEnd; ++jIt) + collideParticle(*jIt); + } + + // write back particle and impulse + reinterpret_cast<Simd4f&>(instance->mParticles[mParticleIndex]) = mParticle; + reinterpret_cast<Simd4f&>(instance->mPrevParticles[mParticleIndex]) = mImpulse; + } +} + +// explicit template instantiation +#if NV_SIMD_SIMD +template class cloth::SwInterCollision<Simd4f>; +#endif +#if NV_SIMD_SCALAR +template class cloth::SwInterCollision<Scalar4f>; +#endif |