diff options
| author | mtamis <[email protected]> | 2017-02-15 16:06:25 +0100 |
|---|---|---|
| committer | mtamis <[email protected]> | 2017-02-15 16:06:25 +0100 |
| commit | 85305930aeeb1d513e23522bd91f29ba81aa6d14 (patch) | |
| tree | 45f1bb20a45a300d1fef107e436cac95602a0e57 /NvCloth/src/cuda/CuCollision.h | |
| download | nvcloth-85305930aeeb1d513e23522bd91f29ba81aa6d14.tar.xz nvcloth-85305930aeeb1d513e23522bd91f29ba81aa6d14.zip | |
NvCloth library v1.0.0
Diffstat (limited to 'NvCloth/src/cuda/CuCollision.h')
| -rw-r--r-- | NvCloth/src/cuda/CuCollision.h | 1572 |
1 files changed, 1572 insertions, 0 deletions
diff --git a/NvCloth/src/cuda/CuCollision.h b/NvCloth/src/cuda/CuCollision.h new file mode 100644 index 0000000..aeb2bda --- /dev/null +++ b/NvCloth/src/cuda/CuCollision.h @@ -0,0 +1,1572 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#pragma once + +#ifndef CU_SOLVER_KERNEL_CU +#error include CuCollision.h only from CuSolverKernel.cu +#endif + +#include "../IndexPair.h" + +namespace +{ +struct CuCollision +{ + struct ShapeMask + { + uint32_t mSpheres; + uint32_t mCones; + + __device__ friend ShapeMask& operator &= (ShapeMask& left, const ShapeMask& right) + { + left.mSpheres = left.mSpheres & right.mSpheres; + left.mCones = left.mCones & right.mCones; + return left; + } + }; + + struct CollisionData + { + Pointer<Shared, float> mSphereX; + Pointer<Shared, float> mSphereY; + Pointer<Shared, float> mSphereZ; + Pointer<Shared, float> mSphereW; + + Pointer<Shared, float> mConeCenterX; + Pointer<Shared, float> mConeCenterY; + Pointer<Shared, float> mConeCenterZ; + Pointer<Shared, float> mConeRadius; + Pointer<Shared, float> mConeAxisX; + Pointer<Shared, float> mConeAxisY; + Pointer<Shared, float> mConeAxisZ; + Pointer<Shared, float> mConeSlope; + Pointer<Shared, float> mConeSqrCosine; + Pointer<Shared, float> mConeHalfLength; + }; + + public: + __device__ CuCollision(Pointer<Shared, uint32_t>); + + template <typename CurrentT, typename PreviousT> + __device__ void operator()(CurrentT& current, PreviousT& previous, float alpha); + + private: + __device__ void buildSphereAcceleration(const CollisionData&); + __device__ void buildConeAcceleration(); + __device__ void mergeAcceleration(); + + template <typename CurrentT> + __device__ bool buildAcceleration(const CurrentT&, float); + + __device__ static ShapeMask readShapeMask(const float&, Pointer<Shared, const uint32_t>); + template <typename CurPos> + __device__ ShapeMask getShapeMask(const CurPos&) const; + template <typename PrevPos, typename CurPos> + __device__ ShapeMask getShapeMask(const PrevPos&, const CurPos&) const; + + template <typename CurPos> + __device__ int32_t collideCapsules(const CurPos&, float3&, float3&) const; + template <typename PrevPos, typename CurPos> + __device__ int32_t collideCapsules(const PrevPos&, CurPos&, float3&, float3&) const; + + template <typename CurrentT, typename PreviousT> + __device__ void collideCapsules(CurrentT& current, PreviousT& previous) const; + template <typename CurrentT, typename PreviousT> + __device__ void collideVirtualCapsules(CurrentT& current, PreviousT& previous) const; + template <typename CurrentT, typename PreviousT> + __device__ void collideContinuousCapsules(CurrentT& current, PreviousT& previous) const; + + template <typename CurrentT, typename PreviousT> + __device__ void collideConvexes(CurrentT& current, PreviousT& previous, float alpha); + template <typename CurPos> + __device__ int32_t collideConvexes(const CurPos&, float3&) const; + + template <typename CurrentT> + __device__ void collideTriangles(CurrentT& current, float alpha); + template <typename CurrentT> + __device__ void collideTriangles(CurrentT& current, int32_t i); + + public: + Pointer<Shared, uint32_t> mCapsuleIndices; + Pointer<Shared, uint32_t> mCapsuleMasks; + Pointer<Shared, uint32_t> mConvexMasks; + + CollisionData mPrevData; + CollisionData mCurData; + + // acceleration structure + Pointer<Shared, uint32_t> mShapeGrid; + float mGridScale[3]; + float mGridBias[3]; + static const uint32_t sGridSize = 8; +}; + +template <typename T> +__device__ void swap(T& a, T& b) +{ + T c = a; + a = b; + b = c; +} +} + +__shared__ uninitialized<CuCollision> gCollideParticles; + +namespace +{ +// initializes one pointer past data! +__device__ void allocate(CuCollision::CollisionData& data) +{ + if (threadIdx.x < 15) + { + Pointer<Shared, float>* ptr = &data.mSphereX; + ptr[threadIdx.x] = *ptr + threadIdx.x * gClothData.mNumCapsules + + min(threadIdx.x, 4) * (gClothData.mNumSpheres - gClothData.mNumCapsules); + } +} + +__device__ void generateSpheres(CuCollision::CollisionData& data, float alpha) +{ + // interpolate spheres and transpose + if (threadIdx.x < gClothData.mNumSpheres * 4) + { + float start = __ldg(gFrameData.mStartCollisionSpheres + threadIdx.x); + float target = __ldg(gFrameData.mTargetCollisionSpheres + threadIdx.x); + float value = start + (target - start) * alpha; + if (threadIdx.x % 4 == 3) + value = max(value, 0.0f); + int32_t j = threadIdx.x % 4 * gClothData.mNumSpheres + threadIdx.x / 4; + data.mSphereX[j] = value; + } + + __syncthreads(); +} + +__device__ void generateCones(CuCollision::CollisionData& data, Pointer<Shared, const uint32_t> iIt) +{ + // generate cones + if (threadIdx.x < gClothData.mNumCapsules) + { + uint32_t firstIndex = iIt[0]; + uint32_t secondIndex = iIt[1]; + + float firstX = data.mSphereX[firstIndex]; + float firstY = data.mSphereY[firstIndex]; + float firstZ = data.mSphereZ[firstIndex]; + float firstW = data.mSphereW[firstIndex]; + + float secondX = data.mSphereX[secondIndex]; + float secondY = data.mSphereY[secondIndex]; + float secondZ = data.mSphereZ[secondIndex]; + float secondW = data.mSphereW[secondIndex]; + + float axisX = (secondX - firstX) * 0.5f; + float axisY = (secondY - firstY) * 0.5f; + float axisZ = (secondZ - firstZ) * 0.5f; + float axisW = (secondW - firstW) * 0.5f; + + float sqrAxisLength = axisX * axisX + axisY * axisY + axisZ * axisZ; + float sqrConeLength = sqrAxisLength - axisW * axisW; + + float invAxisLength = rsqrtf(sqrAxisLength); + float invConeLength = rsqrtf(sqrConeLength); + + if (sqrConeLength <= 0.0f) + invAxisLength = invConeLength = 0.0f; + + float axisLength = sqrAxisLength * invAxisLength; + + data.mConeCenterX[threadIdx.x] = (secondX + firstX) * 0.5f; + data.mConeCenterY[threadIdx.x] = (secondY + firstY) * 0.5f; + data.mConeCenterZ[threadIdx.x] = (secondZ + firstZ) * 0.5f; + data.mConeRadius[threadIdx.x] = (axisW + firstW) * invConeLength * axisLength; + + data.mConeAxisX[threadIdx.x] = axisX * invAxisLength; + data.mConeAxisY[threadIdx.x] = axisY * invAxisLength; + data.mConeAxisZ[threadIdx.x] = axisZ * invAxisLength; + data.mConeSlope[threadIdx.x] = axisW * invConeLength; + + float sine = axisW * invAxisLength; + data.mConeSqrCosine[threadIdx.x] = 1 - sine * sine; + data.mConeHalfLength[threadIdx.x] = axisLength; + } + + __syncthreads(); +} +} + +__device__ CuCollision::CuCollision(Pointer<Shared, uint32_t> scratchPtr) +{ + int32_t numCapsules2 = 2 * gClothData.mNumCapsules; + int32_t numCapsules4 = 4 * gClothData.mNumCapsules; + int32_t numConvexes = gClothData.mNumConvexes; + + if (threadIdx.x < 3) + { + (&mCapsuleIndices)[threadIdx.x] = scratchPtr + threadIdx.x * numCapsules2; + (&mShapeGrid)[-14 * int32_t(threadIdx.x)] = scratchPtr + numCapsules4 + numConvexes; + } + + Pointer<Shared, uint32_t> indexPtr = scratchPtr + threadIdx.x; + if (threadIdx.x < numCapsules2) + { + uint32_t index = (&gClothData.mCapsuleIndices->first)[threadIdx.x]; + *indexPtr = index; + + volatile uint32_t* maskPtr = generic(indexPtr + numCapsules2); + *maskPtr = 1u << index; + *maskPtr |= maskPtr[-int32_t(threadIdx.x & 1)]; + } + indexPtr += numCapsules4; + + if (threadIdx.x < numConvexes) + *indexPtr = gClothData.mConvexMasks[threadIdx.x]; + + if (gClothData.mEnableContinuousCollision || gClothData.mFrictionScale > 0.0f) + { + allocate(mPrevData); + + __syncthreads(); // mPrevData raw hazard + + generateSpheres(mPrevData, 0.0f); + generateCones(mPrevData, mCapsuleIndices + 2 * threadIdx.x); + } + + allocate(mCurData); // also initializes mShapeGrid (!) +} + +template <typename CurrentT, typename PreviousT> +__device__ void CuCollision::operator()(CurrentT& current, PreviousT& previous, float alpha) +{ + ProfileDetailZone zone(cloth::CuProfileZoneIds::COLLIDE); + + // if (current.w > 0) current.w = previous.w (see SwSolverKernel::computeBounds()) + for (int32_t i = threadIdx.x; i < gClothData.mNumParticles; i += blockDim.x) + { + if (current(i, 3) > 0.0f) + current(i, 3) = previous(i, 3); + } + + collideConvexes(current, previous, alpha); + collideTriangles(current, alpha); + + if (buildAcceleration(current, alpha)) + { + if (gClothData.mEnableContinuousCollision) + collideContinuousCapsules(current, previous); + else + collideCapsules(current, previous); + + collideVirtualCapsules(current, previous); + } + + // sync otherwise first threads overwrite sphere data before + // remaining ones have had a chance to use it leading to incorrect + // velocity calculation for friction / ccd + + __syncthreads(); + + if (gClothData.mEnableContinuousCollision || gClothData.mFrictionScale > 0.0f) + { + // store current collision data for next iteration + Pointer<Shared, float> dstIt = mPrevData.mSphereX + threadIdx.x; + Pointer<Shared, const float> srcIt = mCurData.mSphereX + threadIdx.x; + for (; dstIt < mCurData.mSphereX; dstIt += blockDim.x, srcIt += blockDim.x) + *dstIt = *srcIt; + } + + // __syncthreads() called in updateSleepState() +} + +// build per-axis mask arrays of spheres on the right/left of grid cell +__device__ void CuCollision::buildSphereAcceleration(const CollisionData& data) +{ + if (threadIdx.x >= 192) + return; + + int32_t sphereIdx = threadIdx.x & 31; + int32_t axisIdx = threadIdx.x >> 6; // coordinate index (x, y, or z) + int32_t signi = threadIdx.x << 26 & 0x80000000; // sign bit (min or max) + + float signf = copysignf(1.0f, reinterpret_cast<const float&>(signi)); + float pos = signf * data.mSphereW[sphereIdx] + data.mSphereX[sphereIdx + gClothData.mNumSpheres * axisIdx]; + + // use overflow so we can test for non-positive + uint32_t index = signi - uint32_t(floorf(pos * mGridScale[axisIdx] + mGridBias[axisIdx])); + + axisIdx += (uint32_t(signi) >> 31) * 3; + Pointer<Shared, uint32_t> dst = mShapeGrid + sGridSize * axisIdx; + // #pragma unroll + for (int32_t i = 0; i < sGridSize; ++i, ++index) + dst[i] |= __ballot(int32_t(index) <= 0); +} + +// generate cone masks from sphere masks +__device__ void CuCollision::buildConeAcceleration() +{ + if (threadIdx.x >= 192) + return; + + int32_t coneIdx = threadIdx.x & 31; + + uint32_t sphereMask = + mCurData.mConeRadius[coneIdx] && coneIdx < gClothData.mNumCapsules ? mCapsuleMasks[2 * coneIdx + 1] : 0; + + int32_t offset = threadIdx.x / 32 * sGridSize; + Pointer<Shared, uint32_t> src = mShapeGrid + offset; + Pointer<Shared, uint32_t> dst = src + 6 * sGridSize; + + // #pragma unroll + for (int32_t i = 0; i < sGridSize; ++i) + dst[i] |= __ballot(src[i] & sphereMask); +} + +// convert right/left mask arrays into single overlap array +__device__ void CuCollision::mergeAcceleration() +{ + if (threadIdx.x < sGridSize * 12) + { + Pointer<Shared, uint32_t> dst = mShapeGrid + threadIdx.x; + if (!(gClothData.mEnableContinuousCollision || threadIdx.x * 43 & 1024)) + *dst &= dst[sGridSize * 3]; // above is same as 'threadIdx.x/24 & 1' + + // mask garbage bits from build * Acceleration + int32_t shapeIdx = threadIdx.x >= sGridSize * 6; // spheres=0, cones=1 + *dst &= (1 << (&gClothData.mNumSpheres)[shapeIdx]) - 1; + } +} + +namespace +{ +#if __CUDA_ARCH__ >= 300 +__device__ float mergeBounds(Pointer<Shared, float> buffer) +{ + float value = *buffer; + value = max(value, __shfl_down(value, 1)); + value = max(value, __shfl_down(value, 2)); + value = max(value, __shfl_down(value, 4)); + value = max(value, __shfl_down(value, 8)); + return max(value, __shfl_down(value, 16)); +} +#else +__device__ float mergeBounds(Pointer<Shared, float> buffer) +{ + // ensure that writes to buffer are visible to all threads + __threadfence_block(); + + volatile float* ptr = generic(buffer); + *ptr = max(*ptr, ptr[16]); + *ptr = max(*ptr, ptr[8]); + *ptr = max(*ptr, ptr[4]); + *ptr = max(*ptr, ptr[2]); + return max(*ptr, ptr[1]); +} +#endif +// computes maxX, -minX, maxY, ... with a stride of 32, threadIdx.x must be < 192 +__device__ float computeSphereBounds(const CuCollision::CollisionData& data, Pointer<Shared, float> buffer) +{ + assert(threadIdx.x < 192); + + int32_t sphereIdx = min(threadIdx.x & 31, gClothData.mNumSpheres - 1); // sphere index + int32_t axisIdx = threadIdx.x >> 6; // coordinate index (x, y, or z) + int32_t signi = threadIdx.x << 26; // sign bit (min or max) + float signf = copysignf(1.0f, reinterpret_cast<const float&>(signi)); + + *buffer = data.mSphereW[sphereIdx] + signf * data.mSphereX[sphereIdx + gClothData.mNumSpheres * axisIdx]; + + return mergeBounds(buffer); +} + +#if __CUDA_ARCH__ >= 300 +template <typename CurrentT> +__device__ float computeParticleBounds(const CurrentT& current, Pointer<Shared, float> buffer) +{ + int32_t numThreadsPerAxis = blockDim.x * 342 >> 10 & ~31; // same as / 3 + int32_t axis = (threadIdx.x >= numThreadsPerAxis) + (threadIdx.x >= 2 * numThreadsPerAxis); + int32_t threadIdxInAxis = threadIdx.x - axis * numThreadsPerAxis; + int laneIdx = threadIdx.x & 31; + + if (threadIdxInAxis < numThreadsPerAxis) + { + typename CurrentT::ConstPointerType posIt = current[axis]; + int32_t i = min(threadIdxInAxis, gClothData.mNumParticles - 1); + float minX = posIt[i], maxX = minX; + while (i += numThreadsPerAxis, i < gClothData.mNumParticles) + { + float posX = posIt[i]; + minX = min(minX, posX); + maxX = max(maxX, posX); + } + + minX = min(minX, __shfl_down(minX, 1)); + maxX = max(maxX, __shfl_down(maxX, 1)); + minX = min(minX, __shfl_down(minX, 2)); + maxX = max(maxX, __shfl_down(maxX, 2)); + minX = min(minX, __shfl_down(minX, 4)); + maxX = max(maxX, __shfl_down(maxX, 4)); + minX = min(minX, __shfl_down(minX, 8)); + maxX = max(maxX, __shfl_down(maxX, 8)); + minX = min(minX, __shfl_down(minX, 16)); + maxX = max(maxX, __shfl_down(maxX, 16)); + + if (!laneIdx) + { + Pointer<Shared, float> dst = buffer - threadIdx.x + (threadIdxInAxis >> 5) + (axis << 6); + dst[0] = maxX; + dst[32] = -minX; + } + } + + __syncthreads(); + + if (threadIdx.x >= 192) + return 0.0f; + + float value = *buffer; + if (laneIdx >= (numThreadsPerAxis >> 5)) + value = -FLT_MAX; + + // blockDim.x <= 3 * 512, increase to 3 * 1024 by adding a shfl by 16 + assert(numThreadsPerAxis <= 16 * 32); + + value = max(value, __shfl_down(value, 1)); + value = max(value, __shfl_down(value, 2)); + value = max(value, __shfl_down(value, 4)); + return max(value, __shfl_down(value, 8)); +} +#else +template <typename CurrentT> +__device__ float computeParticleBounds(const CurrentT& current, Pointer<Shared, float> buffer) +{ + if (threadIdx.x >= 192) + return 0.0f; + + int32_t axisIdx = threadIdx.x >> 6; // x, y, or z + int32_t signi = threadIdx.x << 26; // sign bit (min or max) + float signf = copysignf(1.0f, reinterpret_cast<const float&>(signi)); + + typename CurrentT::ConstPointerType pIt = current[axisIdx]; + typename CurrentT::ConstPointerType pEnd = pIt + gClothData.mNumParticles; + pIt += min(threadIdx.x & 31, gClothData.mNumParticles - 1); + + *buffer = *pIt * signf; + while (pIt += 32, pIt < pEnd) + *buffer = max(*buffer, *pIt * signf); + + return mergeBounds(buffer); +} +#endif +} + +// build mask of spheres/cones touching a regular grid along each axis +template <typename CurrentT> +__device__ bool CuCollision::buildAcceleration(const CurrentT& current, float alpha) +{ + ProfileDetailZone zone(cloth::CuProfileZoneIds::COLLIDE_ACCELERATION); + + // use still unused cone data as buffer for bounds computation + Pointer<Shared, float> buffer = mCurData.mConeCenterX + threadIdx.x; + float curParticleBounds = computeParticleBounds(current, buffer); + int32_t warpIdx = threadIdx.x >> 5; + + if (!gClothData.mNumSpheres) + { + if (threadIdx.x < 192 && !(threadIdx.x & 31)) + gFrameData.mParticleBounds[warpIdx] = curParticleBounds; + return false; + } + + generateSpheres(mCurData, alpha); + + if (threadIdx.x < 192) + { + float sphereBounds = computeSphereBounds(mCurData, buffer); + float particleBounds = curParticleBounds; + if (gClothData.mEnableContinuousCollision) + { + sphereBounds = max(sphereBounds, computeSphereBounds(mPrevData, buffer)); + float prevParticleBounds = gFrameData.mParticleBounds[warpIdx]; + particleBounds = max(particleBounds, prevParticleBounds); + } + + float bounds = min(sphereBounds, particleBounds); + float expandedBounds = bounds + abs(bounds) * 1e-4f; + + // store bounds data in shared memory + if (!(threadIdx.x & 31)) + { + mGridScale[warpIdx] = expandedBounds; + gFrameData.mParticleBounds[warpIdx] = curParticleBounds; + } + } + + __syncthreads(); // mGridScale raw hazard + + if (threadIdx.x < 3) + { + float negativeLower = mGridScale[threadIdx.x * 2 + 1]; + float edgeLength = mGridScale[threadIdx.x * 2] + negativeLower; + float divisor = max(edgeLength, FLT_EPSILON); + mGridScale[threadIdx.x] = __fdividef(sGridSize - 1e-3, divisor); + mGridBias[threadIdx.x] = negativeLower * mGridScale[threadIdx.x]; + if (edgeLength < 0.0f) + mGridScale[0] = 0.0f; // mark empty intersection + } + + // initialize sphere *and* cone grid to 0 + if (threadIdx.x < 2 * 6 * sGridSize) + mShapeGrid[threadIdx.x] = 0; + + __syncthreads(); // mGridScale raw hazard + + // generate cones even if test below fails because + // continuous collision might need it in next iteration + generateCones(mCurData, mCapsuleIndices + 2 * threadIdx.x); + + if (mGridScale[0] == 0.0f) + return false; // early out for empty intersection + + if (gClothData.mEnableContinuousCollision) + buildSphereAcceleration(mPrevData); + buildSphereAcceleration(mCurData); + __syncthreads(); // mCurData raw hazard + + buildConeAcceleration(); + __syncthreads(); // mShapeGrid raw hazard + + mergeAcceleration(); + __syncthreads(); // mShapeGrid raw hazard + + return true; +} + +__device__ CuCollision::ShapeMask CuCollision::readShapeMask(const float& position, + Pointer<Shared, const uint32_t> sphereGrid) +{ + ShapeMask result; + int32_t index = int32_t(floorf(position)); + uint32_t outMask = (index < sGridSize) - 1; + + Pointer<Shared, const uint32_t> gridPtr = sphereGrid + (index & sGridSize - 1); + result.mSpheres = gridPtr[0] & ~outMask; + result.mCones = gridPtr[sGridSize * 6] & ~outMask; + + return result; +} + +// lookup acceleration structure and return mask of potential intersectors +template <typename CurPos> +__device__ CuCollision::ShapeMask CuCollision::getShapeMask(const CurPos& positions) const +{ + ShapeMask result; + + result = readShapeMask(positions.x * mGridScale[0] + mGridBias[0], mShapeGrid); + result &= readShapeMask(positions.y * mGridScale[1] + mGridBias[1], mShapeGrid + 8); + result &= readShapeMask(positions.z * mGridScale[2] + mGridBias[2], mShapeGrid + 16); + + return result; +} + +template <typename PrevPos, typename CurPos> +__device__ CuCollision::ShapeMask CuCollision::getShapeMask(const PrevPos& prevPos, const CurPos& curPos) const +{ + ShapeMask result; + + float prevX = prevPos.x * mGridScale[0] + mGridBias[0]; + float prevY = prevPos.y * mGridScale[1] + mGridBias[1]; + float prevZ = prevPos.z * mGridScale[2] + mGridBias[2]; + + float curX = curPos.x * mGridScale[0] + mGridBias[0]; + float curY = curPos.y * mGridScale[1] + mGridBias[1]; + float curZ = curPos.z * mGridScale[2] + mGridBias[2]; + + float maxX = min(max(prevX, curX), 7.0f); + float maxY = min(max(prevY, curY), 7.0f); + float maxZ = min(max(prevZ, curZ), 7.0f); + + result = readShapeMask(maxX, mShapeGrid); + result &= readShapeMask(maxY, mShapeGrid + 8); + result &= readShapeMask(maxZ, mShapeGrid + 16); + + float minX = max(min(prevX, curX), 0.0f); + float minY = max(min(prevY, curY), 0.0f); + float minZ = max(min(prevZ, curZ), 0.0f); + + result &= readShapeMask(minX, mShapeGrid + 24); + result &= readShapeMask(minY, mShapeGrid + 32); + result &= readShapeMask(minZ, mShapeGrid + 40); + + return result; +} + +template <typename CurPos> +__device__ int32_t CuCollision::collideCapsules(const CurPos& curPos, float3& delta, float3& velocity) const +{ + ShapeMask shapeMask = getShapeMask(curPos); + + delta.x = delta.y = delta.z = 0.0f; + velocity.x = velocity.y = velocity.z = 0.0f; + + int32_t numCollisions = 0; + bool frictionEnabled = gClothData.mFrictionScale > 0.0f; + + // cone collision + for (; shapeMask.mCones; shapeMask.mCones &= shapeMask.mCones - 1) + { + int32_t j = __ffs(shapeMask.mCones) - 1; + + float deltaX = curPos.x - mCurData.mConeCenterX[j]; + float deltaY = curPos.y - mCurData.mConeCenterY[j]; + float deltaZ = curPos.z - mCurData.mConeCenterZ[j]; + + float axisX = mCurData.mConeAxisX[j]; + float axisY = mCurData.mConeAxisY[j]; + float axisZ = mCurData.mConeAxisZ[j]; + float slope = mCurData.mConeSlope[j]; + + float dot = deltaX * axisX + deltaY * axisY + deltaZ * axisZ; + float radius = max(dot * slope + mCurData.mConeRadius[j], 0.0f); + float sqrDistance = deltaX * deltaX + deltaY * deltaY + deltaZ * deltaZ - dot * dot; + + Pointer<Shared, const uint32_t> mIt = mCapsuleMasks + 2 * j; + uint32_t bothMask = mIt[1]; + + if (sqrDistance > radius * radius) + { + shapeMask.mSpheres &= ~bothMask; + continue; + } + + sqrDistance = max(sqrDistance, FLT_EPSILON); + float invDistance = rsqrtf(sqrDistance); + + float base = dot + slope * sqrDistance * invDistance; + + float halfLength = mCurData.mConeHalfLength[j]; + uint32_t leftMask = base < -halfLength; + uint32_t rightMask = base > halfLength; + + uint32_t firstMask = mIt[0]; + uint32_t secondMask = firstMask ^ bothMask; + + shapeMask.mSpheres &= ~(firstMask & leftMask - 1); + shapeMask.mSpheres &= ~(secondMask & rightMask - 1); + + if (!leftMask && !rightMask) + { + deltaX = deltaX - base * axisX; + deltaY = deltaY - base * axisY; + deltaZ = deltaZ - base * axisZ; + + float sqrCosine = mCurData.mConeSqrCosine[j]; + float scale = radius * invDistance * sqrCosine - sqrCosine; + + delta.x = delta.x + deltaX * scale; + delta.y = delta.y + deltaY * scale; + delta.z = delta.z + deltaZ * scale; + + if (frictionEnabled) + { + int32_t s0 = mCapsuleIndices[2 * j]; + int32_t s1 = mCapsuleIndices[2 * j + 1]; + + // load previous sphere pos + float s0vx = mCurData.mSphereX[s0] - mPrevData.mSphereX[s0]; + float s0vy = mCurData.mSphereY[s0] - mPrevData.mSphereY[s0]; + float s0vz = mCurData.mSphereZ[s0] - mPrevData.mSphereZ[s0]; + + float s1vx = mCurData.mSphereX[s1] - mPrevData.mSphereX[s1]; + float s1vy = mCurData.mSphereY[s1] - mPrevData.mSphereY[s1]; + float s1vz = mCurData.mSphereZ[s1] - mPrevData.mSphereZ[s1]; + + // interpolate velocity between the two spheres + float t = dot * 0.5f + 0.5f; + + velocity.x += s0vx + t * (s1vx - s0vx); + velocity.y += s0vy + t * (s1vy - s0vy); + velocity.z += s0vz + t * (s1vz - s0vz); + } + + ++numCollisions; + } + } + + // sphere collision + for (; shapeMask.mSpheres; shapeMask.mSpheres &= shapeMask.mSpheres - 1) + { + int32_t j = __ffs(shapeMask.mSpheres) - 1; + + float deltaX = curPos.x - mCurData.mSphereX[j]; + float deltaY = curPos.y - mCurData.mSphereY[j]; + float deltaZ = curPos.z - mCurData.mSphereZ[j]; + + float sqrDistance = FLT_EPSILON + deltaX * deltaX + deltaY * deltaY + deltaZ * deltaZ; + float relDistance = rsqrtf(sqrDistance) * mCurData.mSphereW[j]; + + if (relDistance > 1.0f) + { + float scale = relDistance - 1.0f; + + delta.x = delta.x + deltaX * scale; + delta.y = delta.y + deltaY * scale; + delta.z = delta.z + deltaZ * scale; + + if (frictionEnabled) + { + velocity.x += mCurData.mSphereX[j] - mPrevData.mSphereX[j]; + velocity.y += mCurData.mSphereY[j] - mPrevData.mSphereY[j]; + velocity.z += mCurData.mSphereZ[j] - mPrevData.mSphereZ[j]; + } + + ++numCollisions; + } + } + + return numCollisions; +} + +static const __device__ float gSkeletonWidth = (1 - 0.2f) * (1 - 0.2f) - 1; + +template <typename PrevPos, typename CurPos> +__device__ int32_t +CuCollision::collideCapsules(const PrevPos& prevPos, CurPos& curPos, float3& delta, float3& velocity) const +{ + ShapeMask shapeMask = getShapeMask(prevPos, curPos); + + delta.x = delta.y = delta.z = 0.0f; + velocity.x = velocity.y = velocity.z = 0.0f; + + int32_t numCollisions = 0; + bool frictionEnabled = gClothData.mFrictionScale > 0.0f; + + // cone collision + for (; shapeMask.mCones; shapeMask.mCones &= shapeMask.mCones - 1) + { + int32_t j = __ffs(shapeMask.mCones) - 1; + + float prevAxisX = mPrevData.mConeAxisX[j]; + float prevAxisY = mPrevData.mConeAxisY[j]; + float prevAxisZ = mPrevData.mConeAxisZ[j]; + float prevSlope = mPrevData.mConeSlope[j]; + + float prevX = prevPos.x - mPrevData.mConeCenterX[j]; + float prevY = prevPos.y - mPrevData.mConeCenterY[j]; + float prevZ = prevPos.z - mPrevData.mConeCenterZ[j]; + float prevT = prevY * prevAxisZ - prevZ * prevAxisY; + float prevU = prevZ * prevAxisX - prevX * prevAxisZ; + float prevV = prevX * prevAxisY - prevY * prevAxisX; + float prevDot = prevX * prevAxisX + prevY * prevAxisY + prevZ * prevAxisZ; + float prevRadius = max(prevDot * prevSlope + mCurData.mConeRadius[j], 0.0f); + + float curAxisX = mCurData.mConeAxisX[j]; + float curAxisY = mCurData.mConeAxisY[j]; + float curAxisZ = mCurData.mConeAxisZ[j]; + float curSlope = mCurData.mConeSlope[j]; + + float curX = curPos.x - mCurData.mConeCenterX[j]; + float curY = curPos.y - mCurData.mConeCenterY[j]; + float curZ = curPos.z - mCurData.mConeCenterZ[j]; + float curT = curY * curAxisZ - curZ * curAxisY; + float curU = curZ * curAxisX - curX * curAxisZ; + float curV = curX * curAxisY - curY * curAxisX; + float curDot = curX * curAxisX + curY * curAxisY + curZ * curAxisZ; + float curRadius = max(curDot * curSlope + mCurData.mConeRadius[j], 0.0f); + + float curSqrDistance = FLT_EPSILON + curT * curT + curU * curU + curV * curV; + + float dotPrevPrev = prevT * prevT + prevU * prevU + prevV * prevV - prevRadius * prevRadius; + float dotPrevCur = prevT * curT + prevU * curU + prevV * curV - prevRadius * curRadius; + float dotCurCur = curSqrDistance - curRadius * curRadius; + + float discriminant = dotPrevCur * dotPrevCur - dotCurCur * dotPrevPrev; + float sqrtD = sqrtf(discriminant); + float halfB = dotPrevCur - dotPrevPrev; + float minusA = dotPrevCur - dotCurCur + halfB; + + // time of impact or 0 if prevPos inside cone + float toi = __fdividef(min(0.0f, halfB + sqrtD), minusA); + bool hasCollision = toi < 1.0f && halfB < sqrtD; + + // skip continuous collision if the (un-clamped) particle + // trajectory only touches the outer skin of the cone. + float rMin = prevRadius + halfB * minusA * (curRadius - prevRadius); + hasCollision = hasCollision && (discriminant > minusA * rMin * rMin * gSkeletonWidth); + + // a is negative when one cone is contained in the other, + // which is already handled by discrete collision. + hasCollision = hasCollision && minusA < -FLT_EPSILON; + + if (hasCollision) + { + float deltaX = prevX - curX; + float deltaY = prevY - curY; + float deltaZ = prevZ - curZ; + + // interpolate delta at toi + float posX = prevX - deltaX * toi; + float posY = prevY - deltaY * toi; + float posZ = prevZ - deltaZ * toi; + + float curHalfLength = mCurData.mConeHalfLength[j]; + float curScaledAxisX = curAxisX * curHalfLength; + float curScaledAxisY = curAxisY * curHalfLength; + float curScaledAxisZ = curAxisZ * curHalfLength; + + float prevHalfLength = mPrevData.mConeHalfLength[j]; + float deltaScaledAxisX = curScaledAxisX - prevAxisX * prevHalfLength; + float deltaScaledAxisY = curScaledAxisY - prevAxisY * prevHalfLength; + float deltaScaledAxisZ = curScaledAxisZ - prevAxisZ * prevHalfLength; + + float oneMinusToi = 1.0f - toi; + + // interpolate axis at toi + float axisX = curScaledAxisX - deltaScaledAxisX * oneMinusToi; + float axisY = curScaledAxisY - deltaScaledAxisY * oneMinusToi; + float axisZ = curScaledAxisZ - deltaScaledAxisZ * oneMinusToi; + float slope = prevSlope * oneMinusToi + curSlope * toi; + + float sqrHalfLength = axisX * axisX + axisY * axisY + axisZ * axisZ; + float invHalfLength = rsqrtf(sqrHalfLength); + float dot = (posX * axisX + posY * axisY + posZ * axisZ) * invHalfLength; + + float sqrDistance = posX * posX + posY * posY + posZ * posZ - dot * dot; + float invDistance = sqrDistance > 0.0f ? rsqrtf(sqrDistance) : 0.0f; + + float base = dot + slope * sqrDistance * invDistance; + float scale = base * invHalfLength; + + if (abs(scale) < 1.0f) + { + deltaX = deltaX + deltaScaledAxisX * scale; + deltaY = deltaY + deltaScaledAxisY * scale; + deltaZ = deltaZ + deltaScaledAxisZ * scale; + + // reduce ccd impulse if (clamped) particle trajectory stays in cone skin, + // i.e. scale by exp2(-k) or 1/(1 + k) with k = (tmin - toi) / (1 - toi) + float minusK = __fdividef(sqrtD, minusA * oneMinusToi); + oneMinusToi = __fdividef(oneMinusToi, 1.f - minusK); + + curX = curX + deltaX * oneMinusToi; + curY = curY + deltaY * oneMinusToi; + curZ = curZ + deltaZ * oneMinusToi; + + curDot = curX * curAxisX + curY * curAxisY + curZ * curAxisZ; + curRadius = max(curDot * curSlope + mCurData.mConeRadius[j], 0.0f); + curSqrDistance = curX * curX + curY * curY + curZ * curZ - curDot * curDot; + + curPos.x = mCurData.mConeCenterX[j] + curX; + curPos.y = mCurData.mConeCenterY[j] + curY; + curPos.z = mCurData.mConeCenterZ[j] + curZ; + } + } + + // curPos inside cone (discrete collision) + bool hasContact = curRadius * curRadius > curSqrDistance; + + Pointer<Shared, const uint32_t> mIt = mCapsuleMasks + 2 * j; + uint32_t bothMask = mIt[1]; + + uint32_t cullMask = bothMask & (hasCollision | hasContact) - 1; + shapeMask.mSpheres &= ~cullMask; + + if (!hasContact) + continue; + + float invDistance = curSqrDistance > 0.0f ? rsqrtf(curSqrDistance) : 0.0f; + float base = curDot + curSlope * curSqrDistance * invDistance; + + float halfLength = mCurData.mConeHalfLength[j]; + uint32_t leftMask = base < -halfLength; + uint32_t rightMask = base > halfLength; + + // can only skip continuous sphere collision if post-ccd position + // is on code side *and* particle had cone-ccd collision. + uint32_t firstMask = mIt[0]; + uint32_t secondMask = firstMask ^ bothMask; + cullMask = (firstMask & leftMask - 1) | (secondMask & rightMask - 1); + shapeMask.mSpheres &= ~cullMask | hasCollision - 1; + + if (!leftMask && !rightMask) + { + float deltaX = curX - base * curAxisX; + float deltaY = curY - base * curAxisY; + float deltaZ = curZ - base * curAxisZ; + + float sqrCosine = mCurData.mConeSqrCosine[j]; + float scale = curRadius * invDistance * sqrCosine - sqrCosine; + + delta.x = delta.x + deltaX * scale; + delta.y = delta.y + deltaY * scale; + delta.z = delta.z + deltaZ * scale; + + if (frictionEnabled) + { + int32_t s0 = mCapsuleIndices[2 * j]; + int32_t s1 = mCapsuleIndices[2 * j + 1]; + + // load previous sphere pos + float s0vx = mCurData.mSphereX[s0] - mPrevData.mSphereX[s0]; + float s0vy = mCurData.mSphereY[s0] - mPrevData.mSphereY[s0]; + float s0vz = mCurData.mSphereZ[s0] - mPrevData.mSphereZ[s0]; + + float s1vx = mCurData.mSphereX[s1] - mPrevData.mSphereX[s1]; + float s1vy = mCurData.mSphereY[s1] - mPrevData.mSphereY[s1]; + float s1vz = mCurData.mSphereZ[s1] - mPrevData.mSphereZ[s1]; + + // interpolate velocity between the two spheres + float t = curDot * 0.5f + 0.5f; + + velocity.x += s0vx + t * (s1vx - s0vx); + velocity.y += s0vy + t * (s1vy - s0vy); + velocity.z += s0vz + t * (s1vz - s0vz); + } + + ++numCollisions; + } + } + + // sphere collision + for (; shapeMask.mSpheres; shapeMask.mSpheres &= shapeMask.mSpheres - 1) + { + int32_t j = __ffs(shapeMask.mSpheres) - 1; + + float prevX = prevPos.x - mPrevData.mSphereX[j]; + float prevY = prevPos.y - mPrevData.mSphereY[j]; + float prevZ = prevPos.z - mPrevData.mSphereZ[j]; + float prevRadius = mPrevData.mSphereW[j]; + + float curX = curPos.x - mCurData.mSphereX[j]; + float curY = curPos.y - mCurData.mSphereY[j]; + float curZ = curPos.z - mCurData.mSphereZ[j]; + float curRadius = mCurData.mSphereW[j]; + + float sqrDistance = FLT_EPSILON + curX * curX + curY * curY + curZ * curZ; + + float dotPrevPrev = prevX * prevX + prevY * prevY + prevZ * prevZ - prevRadius * prevRadius; + float dotPrevCur = prevX * curX + prevY * curY + prevZ * curZ - prevRadius * curRadius; + float dotCurCur = sqrDistance - curRadius * curRadius; + + float discriminant = dotPrevCur * dotPrevCur - dotCurCur * dotPrevPrev; + float sqrtD = sqrtf(discriminant); + float halfB = dotPrevCur - dotPrevPrev; + float minusA = dotPrevCur - dotCurCur + halfB; + + // time of impact or 0 if prevPos inside sphere + float toi = __fdividef(min(0.0f, halfB + sqrtD), minusA); + bool hasCollision = toi < 1.0f && halfB < sqrtD; + + // skip continuous collision if the (un-clamped) particle + // trajectory only touches the outer skin of the cone. + float rMin = prevRadius + halfB * minusA * (curRadius - prevRadius); + hasCollision = hasCollision && (discriminant > minusA * rMin * rMin * gSkeletonWidth); + + // a is negative when one cone is contained in the other, + // which is already handled by discrete collision. + hasCollision = hasCollision && minusA < -FLT_EPSILON; + + if (hasCollision) + { + float deltaX = prevX - curX; + float deltaY = prevY - curY; + float deltaZ = prevZ - curZ; + + float oneMinusToi = 1.0f - toi; + + // reduce ccd impulse if (clamped) particle trajectory stays in cone skin, + // i.e. scale by exp2(-k) or 1/(1 + k) with k = (tmin - toi) / (1 - toi) + float minusK = __fdividef(sqrtD, minusA * oneMinusToi); + oneMinusToi = __fdividef(oneMinusToi, 1 - minusK); + + curX = curX + deltaX * oneMinusToi; + curY = curY + deltaY * oneMinusToi; + curZ = curZ + deltaZ * oneMinusToi; + + curPos.x = mCurData.mSphereX[j] + curX; + curPos.y = mCurData.mSphereY[j] + curY; + curPos.z = mCurData.mSphereZ[j] + curZ; + + sqrDistance = FLT_EPSILON + curX * curX + curY * curY + curZ * curZ; + } + + float relDistance = rsqrtf(sqrDistance) * curRadius; + + if (relDistance > 1.0f) + { + float scale = relDistance - 1.0f; + + delta.x = delta.x + curX * scale; + delta.y = delta.y + curY * scale; + delta.z = delta.z + curZ * scale; + + if (frictionEnabled) + { + velocity.x += mCurData.mSphereX[j] - mPrevData.mSphereX[j]; + velocity.y += mCurData.mSphereY[j] - mPrevData.mSphereY[j]; + velocity.z += mCurData.mSphereZ[j] - mPrevData.mSphereZ[j]; + } + + ++numCollisions; + } + } + + return numCollisions; +} + +namespace +{ +template <typename PrevPos, typename CurPos> +__device__ inline float3 calcFrictionImpulse(const PrevPos& prevPos, const CurPos& curPos, const float3& shapeVelocity, + float scale, const float3& collisionImpulse) +{ + const float frictionScale = gClothData.mFrictionScale; + + // calculate collision normal + float deltaSq = collisionImpulse.x * collisionImpulse.x + collisionImpulse.y * collisionImpulse.y + + collisionImpulse.z * collisionImpulse.z; + + float rcpDelta = rsqrtf(deltaSq + FLT_EPSILON); + + float nx = collisionImpulse.x * rcpDelta; + float ny = collisionImpulse.y * rcpDelta; + float nz = collisionImpulse.z * rcpDelta; + + // calculate relative velocity scaled by number of collision + float rvx = curPos.x - prevPos.x - shapeVelocity.x * scale; + float rvy = curPos.y - prevPos.y - shapeVelocity.y * scale; + float rvz = curPos.z - prevPos.z - shapeVelocity.z * scale; + + // calculate magnitude of relative normal velocity + float rvn = rvx * nx + rvy * ny + rvz * nz; + + // calculate relative tangential velocity + float rvtx = rvx - rvn * nx; + float rvty = rvy - rvn * ny; + float rvtz = rvz - rvn * nz; + + // calculate magnitude of vt + float rcpVt = rsqrtf(rvtx * rvtx + rvty * rvty + rvtz * rvtz + FLT_EPSILON); + + // magnitude of friction impulse (cannot be larger than -|vt|) + float j = max(-frictionScale * deltaSq * rcpDelta * scale * rcpVt, -1.0f); + + return make_float3(rvtx * j, rvty * j, rvtz * j); +} +} + +template <typename CurrentT, typename PreviousT> +__device__ void CuCollision::collideCapsules(CurrentT& current, PreviousT& previous) const +{ + ProfileDetailZone zone(cloth::CuProfileZoneIds::COLLIDE_CAPSULES); + + bool frictionEnabled = gClothData.mFrictionScale > 0.0f; + bool massScaleEnabled = gClothData.mCollisionMassScale > 0.0f; + + for (int32_t i = threadIdx.x; i < gClothData.mNumParticles; i += blockDim.x) + { + typename CurrentT::VectorType curPos = current(i); + + float3 delta, velocity; + if (int32_t numCollisions = collideCapsules(curPos, delta, velocity)) + { + float scale = __fdividef(1.0f, numCollisions); + + if (frictionEnabled) + { + typename PreviousT::VectorType prevPos = previous(i); + float3 frictionImpulse = calcFrictionImpulse(prevPos, curPos, velocity, scale, delta); + + prevPos.x -= frictionImpulse.x; + prevPos.y -= frictionImpulse.y; + prevPos.z -= frictionImpulse.z; + + previous(i) = prevPos; + } + + curPos.x += delta.x * scale; + curPos.y += delta.y * scale; + curPos.z += delta.z * scale; + + current(i) = curPos; + + if (massScaleEnabled) + { + float deltaLengthSq = delta.x * delta.x + delta.y * delta.y + delta.z * delta.z; + float massScale = 1.0f + gClothData.mCollisionMassScale * deltaLengthSq; + current(i, 3) = __fdividef(current(i, 3), massScale); + } + } + } +} + +#define NEW_LERP_AND_APPLY 1 +namespace +{ +#if NEW_LERP_AND_APPLY +template <typename ParticleDataT> +__device__ float3 lerp(const ParticleDataT& particleData, const int4& indices, const float4& weights) +{ + typename ParticleDataT::VectorType posX, posY, posZ; + posX = particleData(indices.x); + posY = particleData(indices.y); + posZ = particleData(indices.z); + + float3 result; + result.x = posX.x * weights.x + posY.x * weights.y + posZ.x * weights.z; + result.y = posX.y * weights.x + posY.y * weights.y + posZ.y * weights.z; + result.z = posX.z * weights.x + posY.z * weights.y + posZ.z * weights.z; + return result; +} + +template <typename ParticleDataT> +__device__ void apply(ParticleDataT& particleData, const int4& indices, const float4& weights, float3 delta, float scale) +{ + typename ParticleDataT::VectorType posX, posY, posZ; + posX = particleData(indices.x); + posY = particleData(indices.y); + posZ = particleData(indices.z); + + delta.x *= scale; + delta.y *= scale; + delta.z *= scale; + + posX.x += delta.x * weights.x; posY.x += delta.x * weights.y; posZ.x += delta.x * weights.z; + posX.y += delta.y * weights.x; posY.y += delta.y * weights.y; posZ.y += delta.y * weights.z; + posX.z += delta.z * weights.x; posY.z += delta.z * weights.y; posZ.z += delta.z * weights.z; + + particleData(indices.x) = posX; + particleData(indices.y) = posY; + particleData(indices.z) = posZ; +} +#else +template <typename PointerT> +__device__ float lerp(PointerT pos, const int4& indices, const float4& weights) +{ + return pos[indices.x] * weights.x + pos[indices.y] * weights.y + pos[indices.z] * weights.z; +} + +template <typename PointerT> +__device__ void apply(PointerT pos, const int4& indices, const float4& weights, float delta) +{ + pos[indices.x] += delta * weights.x; + pos[indices.y] += delta * weights.y; + pos[indices.z] += delta * weights.z; +} +#endif +} + +template <typename CurrentT, typename PreviousT> +__device__ void CuCollision::collideVirtualCapsules(CurrentT& current, PreviousT& previous) const +{ + const uint32_t* __restrict setSizeIt = gClothData.mVirtualParticleSetSizesBegin; + + if (!setSizeIt) + return; + + if (gClothData.mEnableContinuousCollision) + { + // copied from mergeAcceleration + Pointer<Shared, uint32_t> dst = mShapeGrid + threadIdx.x; + if (!(threadIdx.x * 43 & 1024) && threadIdx.x < sGridSize * 12) + *dst &= dst[sGridSize * 3]; + __syncthreads(); // mShapeGrid raw hazard + } + + ProfileDetailZone zone(cloth::CuProfileZoneIds::COLLIDE_VIRTUAL_CAPSULES); + + const uint32_t* __restrict setSizeEnd = gClothData.mVirtualParticleSetSizesEnd; + const uint16_t* __restrict indicesEnd = gClothData.mVirtualParticleIndices; + const float4* __restrict weightsIt = reinterpret_cast<const float4*>(gClothData.mVirtualParticleWeights); + + bool frictionEnabled = gClothData.mFrictionScale > 0.0f; + bool massScaleEnabled = gClothData.mCollisionMassScale > 0.0f; + + for (; setSizeIt != setSizeEnd; ++setSizeIt) + { + __syncthreads(); + + const uint16_t* __restrict indicesIt = indicesEnd + threadIdx.x * 4; + for (indicesEnd += *setSizeIt * 4; indicesIt < indicesEnd; indicesIt += blockDim.x * 4) + { + int4 indices = make_int4(indicesIt[0], indicesIt[1], indicesIt[2], indicesIt[3]); + + float4 weights = weightsIt[indices.w]; + +#if NEW_LERP_AND_APPLY + float3 curPos = lerp(current, indices, weights); +#else + float3 curPos; + curPos.x = lerp(current[0], indices, weights); + curPos.y = lerp(current[1], indices, weights); + curPos.z = lerp(current[2], indices, weights); + +#endif + float3 delta, velocity; + if (int32_t numCollisions = collideCapsules(curPos, delta, velocity)) + { + float scale = __fdividef(1.0f, numCollisions); + float wscale = weights.w * scale; + +#if NEW_LERP_AND_APPLY + apply(current, indices, weights, delta, wscale); +#else + apply(current[0], indices, weights, delta.x * wscale); + apply(current[1], indices, weights, delta.y * wscale); + apply(current[2], indices, weights, delta.z * wscale); +#endif + if (frictionEnabled) + { +#if NEW_LERP_AND_APPLY + float3 prevPos = lerp(previous, indices, weights); +#else + float3 prevPos; + prevPos.x = lerp(previous[0], indices, weights); + prevPos.y = lerp(previous[1], indices, weights); + prevPos.z = lerp(previous[2], indices, weights); +#endif + + float3 frictionImpulse = calcFrictionImpulse(prevPos, curPos, velocity, scale, delta); + +#if NEW_LERP_AND_APPLY + apply(previous, indices, weights, frictionImpulse, -weights.w); +#else + apply(previous[0], indices, weights, frictionImpulse.x * -weights.w); + apply(previous[1], indices, weights, frictionImpulse.y * -weights.w); + apply(previous[2], indices, weights, frictionImpulse.z * -weights.w); +#endif + } + + if (massScaleEnabled) + { + float deltaLengthSq = (delta.x * delta.x + delta.y * delta.y + delta.z * delta.z) * scale * scale; + float invMassScale = __fdividef(1.0f, 1.0f + gClothData.mCollisionMassScale * deltaLengthSq); + + // not multiplying by weights[3] here because unlike applying velocity + // deltas where we want the interpolated position to obtain a particular + // value, we instead just require that the total change is equal to invMassScale + invMassScale = invMassScale - 1.0f; + current(indices.x, 3) *= 1.0f + weights.x * invMassScale; + current(indices.y, 3) *= 1.0f + weights.y * invMassScale; + current(indices.z, 3) *= 1.0f + weights.z * invMassScale; + } + } + } + } +} + +template <typename CurrentT, typename PreviousT> +__device__ void CuCollision::collideContinuousCapsules(CurrentT& current, PreviousT& previous) const +{ + ProfileDetailZone zone(cloth::CuProfileZoneIds::COLLIDE_CONTINUOUS_CAPSULES); + + bool frictionEnabled = gClothData.mFrictionScale > 0.0f; + bool massScaleEnabled = gClothData.mCollisionMassScale > 0.0f; + + for (int32_t i = threadIdx.x; i < gClothData.mNumParticles; i += blockDim.x) + { + typename PreviousT::VectorType prevPos = previous(i); + typename CurrentT::VectorType curPos = current(i); + + float3 delta, velocity; + if (int32_t numCollisions = collideCapsules(prevPos, curPos, delta, velocity)) + { + float scale = __fdividef(1.0f, numCollisions); + + if (frictionEnabled) + { + float3 frictionImpulse = calcFrictionImpulse(prevPos, curPos, velocity, scale, delta); + + prevPos.x -= frictionImpulse.x; + prevPos.y -= frictionImpulse.y; + prevPos.z -= frictionImpulse.z; + + previous(i) = prevPos; + } + + curPos.x += delta.x * scale; + curPos.y += delta.y * scale; + curPos.z += delta.z * scale; + + current(i) = curPos; + + if (massScaleEnabled) + { + float deltaLengthSq = delta.x * delta.x + delta.y * delta.y + delta.z * delta.z; + float massScale = 1.0f + gClothData.mCollisionMassScale * deltaLengthSq; + current(i, 3) = __fdividef(current(i, 3), massScale); + } + } + } +} + +template <typename CurPos> +__device__ int32_t CuCollision::collideConvexes(const CurPos& positions, float3& delta) const +{ + ProfileDetailZone zone(cloth::CuProfileZoneIds::COLLIDE_CONVEXES); + + delta.x = delta.y = delta.z = 0.0f; + + Pointer<Shared, const float> planeX = mCurData.mSphereX; + Pointer<Shared, const float> planeY = planeX + gClothData.mNumPlanes; + Pointer<Shared, const float> planeZ = planeY + gClothData.mNumPlanes; + Pointer<Shared, const float> planeW = planeZ + gClothData.mNumPlanes; + + int32_t numCollisions = 0; + Pointer<Shared, const uint32_t> cIt = mConvexMasks; + Pointer<Shared, const uint32_t> cEnd = cIt + gClothData.mNumConvexes; + for (; cIt != cEnd; ++cIt) + { + uint32_t mask = *cIt; + + int32_t maxIndex = __ffs(mask) - 1; + float maxDist = planeW[maxIndex] + positions.z * planeZ[maxIndex] + positions.y * planeY[maxIndex] + + positions.x * planeX[maxIndex]; + + while ((maxDist < 0.0f) && (mask &= mask - 1)) + { + int32_t i = __ffs(mask) - 1; + float dist = planeW[i] + positions.z * planeZ[i] + positions.y * planeY[i] + positions.x * planeX[i]; + if (dist > maxDist) + maxDist = dist, maxIndex = i; + } + + if (maxDist < 0.0f) + { + delta.x -= planeX[maxIndex] * maxDist; + delta.y -= planeY[maxIndex] * maxDist; + delta.z -= planeZ[maxIndex] * maxDist; + + ++numCollisions; + } + } + + return numCollisions; +} + +template <typename CurrentT, typename PreviousT> +__device__ void CuCollision::collideConvexes(CurrentT& current, PreviousT& previous, float alpha) +{ + if (!gClothData.mNumConvexes) + return; + + // interpolate planes and transpose + if (threadIdx.x < gClothData.mNumPlanes * 4) + { + float start = gFrameData.mStartCollisionPlanes[threadIdx.x]; + float target = gFrameData.mTargetCollisionPlanes[threadIdx.x]; + int32_t j = threadIdx.x % 4 * gClothData.mNumPlanes + threadIdx.x / 4; + mCurData.mSphereX[j] = start + (target - start) * alpha; + } + + __syncthreads(); + + bool frictionEnabled = gClothData.mFrictionScale > 0.0f; + + for (int32_t i = threadIdx.x; i < gClothData.mNumParticles; i += blockDim.x) + { + typename CurrentT::VectorType curPos = current(i); + + float3 delta; + if (int32_t numCollisions = collideConvexes(curPos, delta)) + { + float scale = __fdividef(1.0f, numCollisions); + + if (frictionEnabled) + { + typename PreviousT::VectorType prevPos = previous(i); + + float3 frictionImpulse = + calcFrictionImpulse(prevPos, curPos, make_float3(0.0f, 0.0f, 0.0f), scale, delta); + + prevPos.x -= frictionImpulse.x; + prevPos.y -= frictionImpulse.y; + prevPos.z -= frictionImpulse.z; + + previous(i) = prevPos; + } + + curPos.x += delta.x * scale; + curPos.y += delta.y * scale; + curPos.z += delta.z * scale; + + current(i) = curPos; + } + } + + __syncthreads(); +} + +namespace +{ +struct TriangleData +{ + float baseX, baseY, baseZ; + float edge0X, edge0Y, edge0Z; + float edge1X, edge1Y, edge1Z; + float normalX, normalY, normalZ; + + float edge0DotEdge1; + float edge0SqrLength; + float edge1SqrLength; + + float det; + float denom; + + float edge0InvSqrLength; + float edge1InvSqrLength; + + // initialize struct after vertices have been stored in first 9 members + __device__ void initialize() + { + edge0X -= baseX, edge0Y -= baseY, edge0Z -= baseZ; + edge1X -= baseX, edge1Y -= baseY, edge1Z -= baseZ; + + normalX = edge0Y * edge1Z - edge0Z * edge1Y; + normalY = edge0Z * edge1X - edge0X * edge1Z; + normalZ = edge0X * edge1Y - edge0Y * edge1X; + + float normalInvLength = rsqrtf(normalX * normalX + normalY * normalY + normalZ * normalZ); + normalX *= normalInvLength; + normalY *= normalInvLength; + normalZ *= normalInvLength; + + edge0DotEdge1 = edge0X * edge1X + edge0Y * edge1Y + edge0Z * edge1Z; + edge0SqrLength = edge0X * edge0X + edge0Y * edge0Y + edge0Z * edge0Z; + edge1SqrLength = edge1X * edge1X + edge1Y * edge1Y + edge1Z * edge1Z; + + det = __fdividef(1.0f, edge0SqrLength * edge1SqrLength - edge0DotEdge1 * edge0DotEdge1); + denom = __fdividef(1.0f, edge0SqrLength + edge1SqrLength - edge0DotEdge1 - edge0DotEdge1); + + edge0InvSqrLength = __fdividef(1.0f, edge0SqrLength); + edge1InvSqrLength = __fdividef(1.0f, edge1SqrLength); + } +}; +} + +template <typename CurrentT> +__device__ void CuCollision::collideTriangles(CurrentT& current, int32_t i) +{ + ProfileDetailZone zone(cloth::CuProfileZoneIds::COLLIDE_TRIANGLES); + + float posX = current(i, 0); + float posY = current(i, 1); + float posZ = current(i, 2); + + const TriangleData* __restrict tIt = reinterpret_cast<const TriangleData*>(generic(mCurData.mSphereX)); + const TriangleData* __restrict tEnd = tIt + gClothData.mNumCollisionTriangles; + + float normalX, normalY, normalZ, normalD = 0.0f; + float minSqrLength = FLT_MAX; + + for (; tIt != tEnd; ++tIt) + { + float dx = posX - tIt->baseX; + float dy = posY - tIt->baseY; + float dz = posZ - tIt->baseZ; + + float deltaDotEdge0 = dx * tIt->edge0X + dy * tIt->edge0Y + dz * tIt->edge0Z; + float deltaDotEdge1 = dx * tIt->edge1X + dy * tIt->edge1Y + dz * tIt->edge1Z; + float deltaDotNormal = dx * tIt->normalX + dy * tIt->normalY + dz * tIt->normalZ; + + float s = tIt->edge1SqrLength * deltaDotEdge0 - tIt->edge0DotEdge1 * deltaDotEdge1; + float t = tIt->edge0SqrLength * deltaDotEdge1 - tIt->edge0DotEdge1 * deltaDotEdge0; + + s = t > 0.0f ? s * tIt->det : deltaDotEdge0 * tIt->edge0InvSqrLength; + t = s > 0.0f ? t * tIt->det : deltaDotEdge1 * tIt->edge1InvSqrLength; + + if (s + t > 1.0f) + { + s = (tIt->edge1SqrLength - tIt->edge0DotEdge1 + deltaDotEdge0 - deltaDotEdge1) * tIt->denom; + } + + s = fmaxf(0.0f, fminf(1.0f, s)); + t = fmaxf(0.0f, fminf(1.0f - s, t)); + + dx = dx - tIt->edge0X * s - tIt->edge1X * t; + dy = dy - tIt->edge0Y * s - tIt->edge1Y * t; + dz = dz - tIt->edge0Z * s - tIt->edge1Z * t; + + float sqrLength = dx * dx + dy * dy + dz * dz; + + if (0.0f > deltaDotNormal) + sqrLength *= 1.0001f; + + if (sqrLength < minSqrLength) + { + normalX = tIt->normalX; + normalY = tIt->normalY; + normalZ = tIt->normalZ; + normalD = deltaDotNormal; + minSqrLength = sqrLength; + } + } + + if (normalD < 0.0f) + { + current(i, 0) = posX - normalX * normalD; + current(i, 1) = posY - normalY * normalD; + current(i, 2) = posZ - normalZ * normalD; + } +} + +namespace +{ +static const int32_t sTrianglePadding = sizeof(TriangleData) / sizeof(float) - 9; +} + +template <typename CurrentT> +__device__ void CuCollision::collideTriangles(CurrentT& current, float alpha) +{ + if (!gClothData.mNumCollisionTriangles) + return; + + // interpolate triangle vertices and store in shared memory + for (int32_t i = threadIdx.x, n = gClothData.mNumCollisionTriangles * 9; i < n; i += blockDim.x) + { + float start = gFrameData.mStartCollisionTriangles[i]; + float target = gFrameData.mTargetCollisionTriangles[i]; + int32_t idx = i * 7282 >> 16; // same as i/9 + int32_t offset = i + idx * sTrianglePadding; + mCurData.mSphereX[offset] = start + (target - start) * alpha; + } + + __syncthreads(); + + for (int32_t i = threadIdx.x; i < gClothData.mNumCollisionTriangles; i += blockDim.x) + { + reinterpret_cast<TriangleData*>(generic(mCurData.mSphereX))[i].initialize(); + } + + __syncthreads(); + + for (int32_t i = threadIdx.x; i < gClothData.mNumParticles; i += blockDim.x) + collideTriangles(current, i); + + __syncthreads(); +} |