aboutsummaryrefslogtreecommitdiff
path: root/PhysX_3.4/Source/LowLevelCloth/src/windows/CuSelfCollision.h
diff options
context:
space:
mode:
authorgit perforce import user <a@b>2016-10-25 12:29:14 -0600
committerSheikh Dawood Abdul Ajees <Sheikh Dawood Abdul Ajees>2016-10-25 18:56:37 -0500
commit3dfe2108cfab31ba3ee5527e217d0d8e99a51162 (patch)
treefa6485c169e50d7415a651bf838f5bcd0fd3bfbd /PhysX_3.4/Source/LowLevelCloth/src/windows/CuSelfCollision.h
downloadphysx-3.4-3dfe2108cfab31ba3ee5527e217d0d8e99a51162.tar.xz
physx-3.4-3dfe2108cfab31ba3ee5527e217d0d8e99a51162.zip
Initial commit:
PhysX 3.4.0 Update @ 21294896 APEX 1.4.0 Update @ 21275617 [CL 21300167]
Diffstat (limited to 'PhysX_3.4/Source/LowLevelCloth/src/windows/CuSelfCollision.h')
-rw-r--r--PhysX_3.4/Source/LowLevelCloth/src/windows/CuSelfCollision.h472
1 files changed, 472 insertions, 0 deletions
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/windows/CuSelfCollision.h b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuSelfCollision.h
new file mode 100644
index 00000000..fb0fd7af
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuSelfCollision.h
@@ -0,0 +1,472 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#ifndef CU_SOLVER_KERNEL_CU
+#error include CuSelfCollision.h only from CuSolverKernel.cu
+#endif
+
+#ifndef UINT16_MAX
+#define UINT16_MAX 0xffff
+#endif
+
+namespace
+{
+#if __CUDA_ARCH__ >= 300
+template <int>
+__device__ void scanWarp(Pointer<Shared, int32_t> counts)
+{
+ asm volatile("{"
+ " .reg .s32 tmp;"
+ " .reg .pred p;"
+ " shfl.up.b32 tmp|p, %0, 0x01, 0x0;"
+ "@p add.s32 %0, tmp, %0;"
+ " shfl.up.b32 tmp|p, %0, 0x02, 0x0;"
+ "@p add.s32 %0, tmp, %0;"
+ " shfl.up.b32 tmp|p, %0, 0x04, 0x0;"
+ "@p add.s32 %0, tmp, %0;"
+ " shfl.up.b32 tmp|p, %0, 0x08, 0x0;"
+ "@p add.s32 %0, tmp, %0;"
+ " shfl.up.b32 tmp|p, %0, 0x10, 0x0;"
+ "@p add.s32 %0, tmp, %0;"
+ "}"
+ : "+r"(*generic(counts))
+ :);
+}
+#else
+template <int stride>
+__device__ void scanWarp(Pointer<Shared, int32_t> counts)
+{
+ volatile int32_t* ptr = generic(counts);
+ const int32_t laneIdx = threadIdx.x & warpSize - 1;
+ if(laneIdx >= 1)
+ *ptr += ptr[-stride];
+ if(laneIdx >= 2)
+ *ptr += ptr[-2 * stride];
+ if(laneIdx >= 4)
+ *ptr += ptr[-4 * stride];
+ if(laneIdx >= 8)
+ *ptr += ptr[-8 * stride];
+ if(laneIdx >= 16)
+ *ptr += ptr[-16 * stride];
+}
+#endif
+
+// sorts array by upper 16bits
+// [keys] must be at least 2*n in length, in/out in first n elements
+// [histogram] must be at least 34*16 = 544 in length
+__device__ void radixSort(int32_t* keys, int32_t n, Pointer<Shared, int32_t> histogram)
+{
+ const int32_t numWarps = blockDim.x >> 5;
+ const int32_t warpIdx = threadIdx.x >> 5;
+ const int32_t laneIdx = threadIdx.x & warpSize - 1;
+
+ const uint32_t laneMask = (1u << laneIdx) - 1;
+ const uint32_t mask1 = (threadIdx.x & 1) - 1;
+ const uint32_t mask2 = !!(threadIdx.x & 2) - 1;
+ const uint32_t mask4 = !!(threadIdx.x & 4) - 1;
+ const uint32_t mask8 = !!(threadIdx.x & 8) - 1;
+
+ const int32_t tn = (n + blockDim.x - 1) / blockDim.x;
+ const int32_t startIndex = tn * (threadIdx.x - laneIdx) + laneIdx;
+ const int32_t endIndex = min(startIndex + tn * warpSize, n + 31 & ~31); // full warps for ballot
+
+ int32_t* srcKeys = keys;
+ int32_t* dstKeys = keys + n;
+
+ Pointer<Shared, int32_t> hIt = histogram + 16 * warpIdx;
+ Pointer<Shared, int32_t> pIt = histogram + 16 * laneIdx + 16;
+ Pointer<Shared, int32_t> tIt = histogram + 16 * numWarps + laneIdx;
+
+ for(int32_t p = 16; p < 32; p += 4) // radix passes (4 bits each)
+ {
+ // gather bucket histograms per warp
+ int32_t warpCount = 0;
+ for(int32_t i = startIndex; i < endIndex; i += 32)
+ {
+ int32_t key = i < n ? srcKeys[i] >> p : 15;
+ uint32_t ballot1 = __ballot(key & 1);
+ uint32_t ballot2 = __ballot(key & 2);
+ uint32_t ballot4 = __ballot(key & 4);
+ uint32_t ballot8 = __ballot(key & 8);
+ warpCount += __popc((mask1 ^ ballot1) & (mask2 ^ ballot2) & (mask4 ^ ballot4) & (mask8 ^ ballot8));
+ }
+
+ if(laneIdx >= 16)
+ hIt[laneIdx] = warpCount;
+
+ __syncthreads();
+
+ // prefix sum of histogram buckets
+ for(int32_t i = warpIdx; i < 16; i += numWarps)
+ scanWarp<16>(pIt + i);
+
+ __syncthreads();
+
+ // prefix sum of bucket totals (exclusive)
+ if(threadIdx.x < 16)
+ {
+ *tIt = tIt[-1] & !threadIdx.x - 1;
+ scanWarp<1>(tIt);
+ hIt[threadIdx.x] = 0;
+ }
+
+ __syncthreads();
+
+ if(laneIdx < 16)
+ hIt[laneIdx] += *tIt;
+
+ // split indices
+ for(int32_t i = startIndex; i < endIndex; i += 32)
+ {
+ int32_t key = i < n ? srcKeys[i] >> p : 15;
+ uint32_t ballot1 = __ballot(key & 1);
+ uint32_t ballot2 = __ballot(key & 2);
+ uint32_t ballot4 = __ballot(key & 4);
+ uint32_t ballot8 = __ballot(key & 8);
+ uint32_t bits = ((key & 1) - 1 ^ ballot1) & (!!(key & 2) - 1 ^ ballot2) & (!!(key & 4) - 1 ^ ballot4) &
+ (!!(key & 8) - 1 ^ ballot8);
+ int32_t index = hIt[key & 15] + __popc(bits & laneMask);
+
+ if(i < n)
+ dstKeys[index] = srcKeys[i];
+
+ if(laneIdx < 16)
+ hIt[laneIdx] += __popc((mask1 ^ ballot1) & (mask2 ^ ballot2) & (mask4 ^ ballot4) & (mask8 ^ ballot8));
+ }
+
+ __syncthreads();
+
+ ::swap(srcKeys, dstKeys);
+ }
+
+#ifndef NDEBUG
+ for(int32_t i = threadIdx.x; i < n; i += blockDim.x)
+ assert(!i || keys[i - 1] >> 16 <= keys[i] >> 16);
+#endif
+}
+}
+
+namespace
+{
+struct CuSelfCollision
+{
+ template <typename CurrentT>
+ __device__ void operator()(CurrentT& current);
+
+ private:
+ template <typename CurrentT>
+ __device__ void buildAcceleration(const CurrentT& current);
+ template <bool useRestPositions, typename CurrentT>
+ __device__ void collideParticles(CurrentT& current) const;
+
+ public:
+ float mPosBias[3];
+ float mPosScale[3];
+ const float* mPosPtr[3];
+};
+}
+
+__shared__ uninitialized<CuSelfCollision> gSelfCollideParticles;
+
+template <typename CurrentT>
+__device__ void CuSelfCollision::operator()(CurrentT& current)
+{
+ if(min(gClothData.mSelfCollisionDistance, gFrameData.mSelfCollisionStiffness) <= 0.0f)
+ return;
+
+ if(threadIdx.x < 3)
+ {
+ float upper = gFrameData.mParticleBounds[threadIdx.x * 2];
+ float negativeLower = gFrameData.mParticleBounds[threadIdx.x * 2 + 1];
+
+ // expand bounds
+ float eps = (upper + negativeLower) * 1e-4f;
+ float expandedUpper = upper + eps;
+ float expandedNegativeLower = negativeLower + eps;
+ float expandedEdgeLength = expandedUpper + expandedNegativeLower;
+
+ float* edgeLength = mPosBias; // use as temp
+ edgeLength[threadIdx.x] = expandedEdgeLength;
+
+ __threadfence_block();
+
+ // calculate shortest axis
+ int32_t shortestAxis = edgeLength[0] > edgeLength[1];
+ if(edgeLength[shortestAxis] > edgeLength[2])
+ shortestAxis = 2;
+
+ uint32_t writeAxis = threadIdx.x - shortestAxis;
+ writeAxis += writeAxis >> 30;
+
+ float maxInvCellSize = __fdividef(127.0f, expandedEdgeLength);
+ float invCollisionDistance = __fdividef(1.0f, gClothData.mSelfCollisionDistance);
+ float invCellSize = min(maxInvCellSize, invCollisionDistance);
+
+ mPosScale[writeAxis] = invCellSize;
+ mPosBias[writeAxis] = invCellSize * expandedNegativeLower;
+ mPosPtr[writeAxis] = generic(current[threadIdx.x]);
+ }
+
+ __syncthreads();
+
+ buildAcceleration(current);
+
+ if(gFrameData.mRestPositions)
+ collideParticles<true>(current);
+ else
+ collideParticles<false>(current);
+}
+
+template <typename CurrentT>
+__device__ void CuSelfCollision::buildAcceleration(const CurrentT& current)
+{
+ int32_t numIndices = gClothData.mNumSelfCollisionIndices;
+ const int32_t* indices = reinterpret_cast<const int32_t*>(gClothData.mSelfCollisionIndices);
+ int32_t* sortedKeys = reinterpret_cast<int32_t*>(gClothData.mSelfCollisionKeys);
+ int16_t* cellStart = reinterpret_cast<int16_t*>(gClothData.mSelfCollisionCellStart);
+
+ typedef typename CurrentT::ConstPointerType ConstPointerType;
+ ConstPointerType rowPtr = ConstPointerType(mPosPtr[1]);
+ ConstPointerType colPtr = ConstPointerType(mPosPtr[2]);
+
+ float rowScale = mPosScale[1], rowBias = mPosBias[1];
+ float colScale = mPosScale[2], colBias = mPosBias[2];
+
+ // calculate keys
+ for(int32_t i = threadIdx.x; i < numIndices; i += blockDim.x)
+ {
+ int32_t index = indices ? indices[i] : i;
+ assert(index < gClothData.mNumParticles);
+
+ int32_t rowIndex = int32_t(max(0.0f, min(rowPtr[index] * rowScale + rowBias, 127.5f)));
+ int32_t colIndex = int32_t(max(0.0f, min(colPtr[index] * colScale + colBias, 127.5f)));
+ assert(rowIndex >= 0 && rowIndex < 128 && colIndex >= 0 && colIndex < 128);
+
+ int32_t key = (colIndex << 7 | rowIndex) + 129; // + row and column sentinel
+ assert(key <= 0x4080);
+
+ sortedKeys[i] = key << 16 | index; // (key, index) pair in a single int32_t
+ }
+ __syncthreads();
+
+ // get scratch shared mem buffer used for radix sort(histogram)
+ Pointer<Shared, int32_t> buffer =
+ reinterpret_cast<Pointer<Shared, int32_t> const&>(gCollideParticles.get().mCurData.mSphereX);
+
+ // sort keys (__synchthreads inside radix sort)
+ radixSort(sortedKeys, numIndices, buffer);
+
+ // mark cell start if keys are different between neighboring threads
+ for(int32_t i = threadIdx.x; i < numIndices; i += blockDim.x)
+ {
+ int32_t key = sortedKeys[i] >> 16;
+ int32_t prevKey = i ? sortedKeys[i - 1] >> 16 : key - 1;
+ if(key != prevKey)
+ {
+ cellStart[key] = i;
+ cellStart[prevKey + 1] = i;
+ }
+ }
+ __syncthreads();
+}
+
+template <bool useRestPositions, typename CurrentT>
+__device__ void CuSelfCollision::collideParticles(CurrentT& current) const
+{
+ const int32_t* sortedKeys = reinterpret_cast<const int32_t*>(gClothData.mSelfCollisionKeys);
+ float* sortedParticles = gClothData.mSelfCollisionParticles;
+ int16_t* cellStart = reinterpret_cast<int16_t*>(gClothData.mSelfCollisionCellStart);
+
+ const float cdist = gClothData.mSelfCollisionDistance;
+ const float cdistSq = cdist * cdist;
+
+ const int32_t numIndices = gClothData.mNumSelfCollisionIndices;
+ const int32_t numParticles = gClothData.mNumParticles;
+
+ // point to particle copied in device memory that is being updated
+ float* xPtr = sortedParticles;
+ float* yPtr = sortedParticles + numParticles;
+ float* zPtr = sortedParticles + 2 * numParticles;
+ float* wPtr = sortedParticles + 3 * numParticles;
+
+ // copy current particles to temporary array
+ for(int32_t i = threadIdx.x; i < numParticles; i += blockDim.x)
+ {
+ xPtr[i] = current(i, 0);
+ yPtr[i] = current(i, 1);
+ zPtr[i] = current(i, 2);
+ wPtr[i] = current(i, 3);
+ }
+ __syncthreads();
+
+ // copy only sorted (indexed) particles to shared mem
+ for(int32_t i = threadIdx.x; i < numIndices; i += blockDim.x)
+ {
+ int32_t index = sortedKeys[i] & UINT16_MAX;
+ current(i, 0) = xPtr[index];
+ current(i, 1) = yPtr[index];
+ current(i, 2) = zPtr[index];
+ current(i, 3) = wPtr[index];
+ }
+ __syncthreads();
+
+ typedef typename CurrentT::ConstPointerType ConstPointerType;
+ ConstPointerType rowPtr = ConstPointerType(mPosPtr[1]);
+ ConstPointerType colPtr = ConstPointerType(mPosPtr[2]);
+
+ float rowScale = mPosScale[1], rowBias = mPosBias[1];
+ float colScale = mPosScale[2], colBias = mPosBias[2];
+
+ for(int32_t i = threadIdx.x; i < numIndices; i += blockDim.x)
+ {
+ const int32_t index = sortedKeys[i] & UINT16_MAX;
+ assert(index < gClothData.mNumParticles);
+
+ float restX, restY, restZ;
+ if(useRestPositions)
+ {
+ const float* restIt = gFrameData.mRestPositions + index * 4;
+ restX = restIt[0];
+ restY = restIt[1];
+ restZ = restIt[2];
+ }
+
+ float posX = current(i, 0);
+ float posY = current(i, 1);
+ float posZ = current(i, 2);
+ float posW = current(i, 3);
+
+ float deltaX = 0.0f;
+ float deltaY = 0.0f;
+ float deltaZ = 0.0f;
+ float deltaW = FLT_EPSILON;
+
+ // get cell index for this particle
+ int32_t rowIndex = int32_t(max(0.0f, min(rowPtr[i] * rowScale + rowBias, 127.5f)));
+ int32_t colIndex = int32_t(max(0.0f, min(colPtr[i] * colScale + colBias, 127.5f)));
+ assert(rowIndex >= 0 && rowIndex < 128 && colIndex >= 0 && colIndex < 128);
+
+ int32_t key = colIndex << 7 | rowIndex;
+ assert(key <= 0x4080);
+
+ // check cells in 3 columns
+ for(int32_t keyEnd = key + 256; key <= keyEnd; key += 128)
+ {
+ // cellStart keys of unoccupied cells have a value of -1
+ uint32_t startIndex; // min<unsigned>(cellStart[key+0..2])
+ uint32_t endIndex; // max<signed>(0, cellStart[key+1..3])
+
+ asm volatile("{\n\t"
+ " .reg .u32 start1, start2;\n\t"
+ " ld.global.s16 %1, [%2+6];\n\t"
+ " ld.global.s16 %0, [%2+0];\n\t"
+ " ld.global.s16 start1, [%2+2];\n\t"
+ " ld.global.s16 start2, [%2+4];\n\t"
+ " max.s32 %1, %1, 0;\n\t"
+ " min.u32 %0, %0, start1;\n\t"
+ " max.s32 %1, %1, start1;\n\t"
+ " min.u32 %0, %0, start2;\n\t"
+ " max.s32 %1, %1, start2;\n\t"
+ "}\n\t"
+ : "=r"(startIndex), "=r"(endIndex)
+ : POINTER_CONSTRAINT(cellStart + key));
+
+ // comparison must be unsigned to skip cells with negative startIndex
+ for(uint32_t j = startIndex; j < endIndex; ++j)
+ {
+ if(j != i) // avoid same particle
+ {
+ float dx = posX - current(j, 0);
+ float dy = posY - current(j, 1);
+ float dz = posZ - current(j, 2);
+
+ float distSqr = dx * dx + dy * dy + dz * dz;
+ if(distSqr > cdistSq)
+ continue;
+
+ if(useRestPositions)
+ {
+ const int32_t jndex = sortedKeys[j] & UINT16_MAX;
+ assert(jndex < gClothData.mNumParticles);
+
+ // calculate distance in rest configuration
+ const float* restJt = gFrameData.mRestPositions + jndex * 4;
+ float rx = restX - restJt[0];
+ float ry = restY - restJt[1];
+ float rz = restZ - restJt[2];
+
+ if(rx * rx + ry * ry + rz * rz <= cdistSq)
+ continue;
+ }
+
+ // premultiply ratio for weighted average
+ float ratio = fmaxf(0.0f, cdist * rsqrtf(FLT_EPSILON + distSqr) - 1.0f);
+ float scale = __fdividef(ratio * ratio, FLT_EPSILON + posW + current(j, 3));
+
+ deltaX += scale * dx;
+ deltaY += scale * dy;
+ deltaZ += scale * dz;
+ deltaW += ratio;
+ }
+ }
+ }
+
+ const float stiffness = gFrameData.mSelfCollisionStiffness * posW;
+ float scale = __fdividef(stiffness, deltaW);
+
+ // apply collision impulse
+ xPtr[index] += deltaX * scale;
+ yPtr[index] += deltaY * scale;
+ zPtr[index] += deltaZ * scale;
+
+ assert(!isnan(xPtr[index] + yPtr[index] + zPtr[index]));
+ }
+ __syncthreads();
+
+ // copy temporary particle array back to shared mem
+ // (need to copy whole array)
+ for(int32_t i = threadIdx.x; i < numParticles; i += blockDim.x)
+ {
+ current(i, 0) = xPtr[i];
+ current(i, 1) = yPtr[i];
+ current(i, 2) = zPtr[i];
+ current(i, 3) = wPtr[i];
+ }
+
+ // unmark occupied cells to empty again (faster than clearing all the cells)
+ for(int32_t i = threadIdx.x; i < numIndices; i += blockDim.x)
+ {
+ int32_t key = sortedKeys[i] >> 16;
+ cellStart[key] = 0xffff;
+ cellStart[key + 1] = 0xffff;
+ }
+ __syncthreads();
+}