diff options
| author | Marijn Tamis <[email protected]> | 2019-04-01 14:21:09 +0200 |
|---|---|---|
| committer | Marijn Tamis <[email protected]> | 2019-04-01 14:21:09 +0200 |
| commit | d243404d4ba88bcf53f7310cc8980b4efe38c19f (patch) | |
| tree | dcc8ce2904e9f813e03f71f825c4d3c9ec565d91 /NvCloth/src/cuda | |
| parent | Add new SetSpheres and SetPlanes api's to bring them in line with setTriangles. (diff) | |
| download | nvcloth-1.1.6.tar.xz nvcloth-1.1.6.zip | |
1.1.6 Release.1.1.6
Diffstat (limited to 'NvCloth/src/cuda')
| -rw-r--r-- | NvCloth/src/cuda/CuCollision.h | 58 | ||||
| -rw-r--r-- | NvCloth/src/cuda/CuDeviceVector.h | 4 | ||||
| -rw-r--r-- | NvCloth/src/cuda/CuFabric.cpp | 2 | ||||
| -rw-r--r-- | NvCloth/src/cuda/CuFactory.h | 2 | ||||
| -rw-r--r-- | NvCloth/src/cuda/CuSelfCollision.h | 22 | ||||
| -rw-r--r-- | NvCloth/src/cuda/CuSolver.cpp | 7 | ||||
| -rw-r--r-- | NvCloth/src/cuda/CuSolverKernel.cu | 3 |
7 files changed, 52 insertions, 46 deletions
diff --git a/NvCloth/src/cuda/CuCollision.h b/NvCloth/src/cuda/CuCollision.h index f9b69f7..d17e735 100644 --- a/NvCloth/src/cuda/CuCollision.h +++ b/NvCloth/src/cuda/CuCollision.h @@ -310,6 +310,7 @@ __device__ void CuCollision::buildSphereAcceleration(const CollisionData& data) { if (threadIdx.x >= 192) return; + unsigned int mask = __ballot_sync(0xffffffff, !(threadIdx.x >= 192)); int32_t sphereIdx = threadIdx.x & 31; int32_t axisIdx = threadIdx.x >> 6; // coordinate index (x, y, or z) @@ -325,7 +326,7 @@ __device__ void CuCollision::buildSphereAcceleration(const CollisionData& data) Pointer<Shared, uint32_t> dst = mShapeGrid + sGridSize * axisIdx; // #pragma unroll for (int32_t i = 0; i < sGridSize; ++i, ++index) - dst[i] |= __ballot(int32_t(index) <= 0); + dst[i] |= __ballot_sync(mask, int32_t(index) <= 0); } // generate cone masks from sphere masks @@ -333,7 +334,7 @@ __device__ void CuCollision::buildConeAcceleration() { if (threadIdx.x >= 192) return; - + unsigned int mask = __ballot_sync(0xffffffff, !(threadIdx.x >= 192)); int32_t coneIdx = threadIdx.x & 31; uint32_t sphereMask = @@ -345,7 +346,7 @@ __device__ void CuCollision::buildConeAcceleration() // #pragma unroll for (int32_t i = 0; i < sGridSize; ++i) - dst[i] |= __ballot(src[i] & sphereMask); + dst[i] |= __ballot_sync(mask,src[i] & sphereMask); } // convert right/left mask arrays into single overlap array @@ -366,17 +367,17 @@ __device__ void CuCollision::mergeAcceleration() namespace { #if __CUDA_ARCH__ >= 300 -__device__ float mergeBounds(Pointer<Shared, float> buffer) +__device__ float mergeBounds(unsigned int mask, Pointer<Shared, float> buffer) { float value = *buffer; - value = max(value, __shfl_down(value, 1)); - value = max(value, __shfl_down(value, 2)); - value = max(value, __shfl_down(value, 4)); - value = max(value, __shfl_down(value, 8)); - return max(value, __shfl_down(value, 16)); + value = max(value, __shfl_down_sync(mask, value, 1)); + value = max(value, __shfl_down_sync(mask, value, 2)); + value = max(value, __shfl_down_sync(mask, value, 4)); + value = max(value, __shfl_down_sync(mask, value, 8)); + return max(value, __shfl_down_sync(mask, value, 16)); } #else -__device__ float mergeBounds(Pointer<Shared, float> buffer) +__device__ float mergeBounds(unsigned int mask, Pointer<Shared, float> buffer) { // ensure that writes to buffer are visible to all threads __threadfence_block(); @@ -393,6 +394,7 @@ __device__ float mergeBounds(Pointer<Shared, float> buffer) __device__ float computeSphereBounds(const CuCollision::CollisionData& data, Pointer<Shared, float> buffer) { assert(threadIdx.x < 192); + unsigned int mask = __ballot_sync(0xffffffff, threadIdx.x < 192); int32_t sphereIdx = min(threadIdx.x & 31, gClothData.mNumSpheres - 1); // sphere index int32_t axisIdx = threadIdx.x >> 6; // coordinate index (x, y, or z) @@ -401,7 +403,7 @@ __device__ float computeSphereBounds(const CuCollision::CollisionData& data, Poi *buffer = data.mSphereW[sphereIdx] + signf * data.mSphereX[sphereIdx + gClothData.mNumSpheres * axisIdx]; - return mergeBounds(buffer); + return mergeBounds(mask, buffer); } #if __CUDA_ARCH__ >= 300 @@ -415,6 +417,7 @@ __device__ float computeParticleBounds(const CurrentT& current, Pointer<Shared, if (threadIdxInAxis < numThreadsPerAxis) { + unsigned int mask = __ballot_sync(0xffffffff, threadIdxInAxis < numThreadsPerAxis); typename CurrentT::ConstPointerType posIt = current[axis]; int32_t i = min(threadIdxInAxis, gClothData.mNumParticles - 1); float minX = posIt[i], maxX = minX; @@ -425,16 +428,16 @@ __device__ float computeParticleBounds(const CurrentT& current, Pointer<Shared, maxX = max(maxX, posX); } - minX = min(minX, __shfl_down(minX, 1)); - maxX = max(maxX, __shfl_down(maxX, 1)); - minX = min(minX, __shfl_down(minX, 2)); - maxX = max(maxX, __shfl_down(maxX, 2)); - minX = min(minX, __shfl_down(minX, 4)); - maxX = max(maxX, __shfl_down(maxX, 4)); - minX = min(minX, __shfl_down(minX, 8)); - maxX = max(maxX, __shfl_down(maxX, 8)); - minX = min(minX, __shfl_down(minX, 16)); - maxX = max(maxX, __shfl_down(maxX, 16)); + minX = min(minX, __shfl_down_sync(mask,minX, 1)); + maxX = max(maxX, __shfl_down_sync(mask,maxX, 1)); + minX = min(minX, __shfl_down_sync(mask,minX, 2)); + maxX = max(maxX, __shfl_down_sync(mask,maxX, 2)); + minX = min(minX, __shfl_down_sync(mask,minX, 4)); + maxX = max(maxX, __shfl_down_sync(mask,maxX, 4)); + minX = min(minX, __shfl_down_sync(mask,minX, 8)); + maxX = max(maxX, __shfl_down_sync(mask,maxX, 8)); + minX = min(minX, __shfl_down_sync(mask,minX, 16)); + maxX = max(maxX, __shfl_down_sync(mask,maxX, 16)); if (!laneIdx) { @@ -448,7 +451,7 @@ __device__ float computeParticleBounds(const CurrentT& current, Pointer<Shared, if (threadIdx.x >= 192) return 0.0f; - + unsigned int mask = __ballot_sync(0xffffffff, !(threadIdx.x >= 192)); float value = *buffer; if (laneIdx >= (numThreadsPerAxis >> 5)) value = -FLT_MAX; @@ -456,10 +459,10 @@ __device__ float computeParticleBounds(const CurrentT& current, Pointer<Shared, // blockDim.x <= 3 * 512, increase to 3 * 1024 by adding a shfl by 16 assert(numThreadsPerAxis <= 16 * 32); - value = max(value, __shfl_down(value, 1)); - value = max(value, __shfl_down(value, 2)); - value = max(value, __shfl_down(value, 4)); - return max(value, __shfl_down(value, 8)); + value = max(value, __shfl_down_sync(mask,value, 1)); + value = max(value, __shfl_down_sync(mask,value, 2)); + value = max(value, __shfl_down_sync(mask,value, 4)); + return max(value, __shfl_down_sync(mask,value, 8)); } #else template <typename CurrentT> @@ -467,6 +470,7 @@ __device__ float computeParticleBounds(const CurrentT& current, Pointer<Shared, { if (threadIdx.x >= 192) return 0.0f; + unsigned int mask = __ballot_sync(0xffffffff, !(threadIdx.x >= 192)); int32_t axisIdx = threadIdx.x >> 6; // x, y, or z int32_t signi = threadIdx.x << 26; // sign bit (min or max) @@ -480,7 +484,7 @@ __device__ float computeParticleBounds(const CurrentT& current, Pointer<Shared, while (pIt += 32, pIt < pEnd) *buffer = max(*buffer, *pIt * signf); - return mergeBounds(buffer); + return mergeBounds(mask, buffer); } #endif } diff --git a/NvCloth/src/cuda/CuDeviceVector.h b/NvCloth/src/cuda/CuDeviceVector.h index 4d97e5d..0fabb66 100644 --- a/NvCloth/src/cuda/CuDeviceVector.h +++ b/NvCloth/src/cuda/CuDeviceVector.h @@ -30,8 +30,8 @@ #pragma once #include "CuDevicePointer.h" -#include "PsArray.h" -#include "PsUtilities.h" +#include "NvCloth/ps/PsArray.h" +#include "../ps/PsUtilities.h" #include <algorithm> namespace nv diff --git a/NvCloth/src/cuda/CuFabric.cpp b/NvCloth/src/cuda/CuFabric.cpp index 6794fa5..48cc0ba 100644 --- a/NvCloth/src/cuda/CuFabric.cpp +++ b/NvCloth/src/cuda/CuFabric.cpp @@ -30,7 +30,7 @@ #include "CuFabric.h" #include "CuContextLock.h" #include "CuFactory.h" -#include <PsUtilities.h> +#include "../ps/PsUtilities.h" #include <limits.h> using namespace physx; diff --git a/NvCloth/src/cuda/CuFactory.h b/NvCloth/src/cuda/CuFactory.h index 3ff5752..8fae913 100644 --- a/NvCloth/src/cuda/CuFactory.h +++ b/NvCloth/src/cuda/CuFactory.h @@ -30,7 +30,7 @@ #pragma once #include "NvCloth/Factory.h" -#include <PsArray.h> +#include "NvCloth/ps/PsArray.h" #include <foundation/PxVec4.h> #include <foundation/PxVec3.h> #include <cuda.h> diff --git a/NvCloth/src/cuda/CuSelfCollision.h b/NvCloth/src/cuda/CuSelfCollision.h index 19d2723..32aa139 100644 --- a/NvCloth/src/cuda/CuSelfCollision.h +++ b/NvCloth/src/cuda/CuSelfCollision.h @@ -39,7 +39,7 @@ namespace { -#if __CUDA_ARCH__ >= 300 +/*#if __CUDA_ARCH__ >= 300 template <int> __device__ void scanWarp(Pointer<Shared, int32_t> counts) { @@ -60,7 +60,7 @@ __device__ void scanWarp(Pointer<Shared, int32_t> counts) : "+r"(*generic(counts)) :); } -#else +#else*/ template <int stride> __device__ void scanWarp(Pointer<Shared, int32_t> counts) { @@ -77,7 +77,7 @@ __device__ void scanWarp(Pointer<Shared, int32_t> counts) if (laneIdx >= 16) *ptr += ptr[-16 * stride]; } -#endif +//#endif // sorts array by upper 16bits // [keys] must be at least 2 * n in length, in/out in first n elements @@ -112,10 +112,10 @@ __device__ void radixSort(int32_t* keys, int32_t n, Pointer<Shared, int32_t> his for (int32_t i = startIndex; i < endIndex; i += 32) { int32_t key = i < n ? srcKeys[i] >> p : 15; - uint32_t ballot1 = __ballot(key & 1); - uint32_t ballot2 = __ballot(key & 2); - uint32_t ballot4 = __ballot(key & 4); - uint32_t ballot8 = __ballot(key & 8); + uint32_t ballot1 = __ballot_sync(0xffffffff,key & 1); + uint32_t ballot2 = __ballot_sync(0xffffffff,key & 2); + uint32_t ballot4 = __ballot_sync(0xffffffff,key & 4); + uint32_t ballot8 = __ballot_sync(0xffffffff,key & 8); warpCount += __popc((mask1 ^ ballot1) & (mask2 ^ ballot2) & (mask4 ^ ballot4) & (mask8 ^ ballot8)); } @@ -147,10 +147,10 @@ __device__ void radixSort(int32_t* keys, int32_t n, Pointer<Shared, int32_t> his for (int32_t i = startIndex; i < endIndex; i += 32) { int32_t key = i < n ? srcKeys[i] >> p : 15; - uint32_t ballot1 = __ballot(key & 1); - uint32_t ballot2 = __ballot(key & 2); - uint32_t ballot4 = __ballot(key & 4); - uint32_t ballot8 = __ballot(key & 8); + uint32_t ballot1 = __ballot_sync(0xffffffff,key & 1); + uint32_t ballot2 = __ballot_sync(0xffffffff,key & 2); + uint32_t ballot4 = __ballot_sync(0xffffffff,key & 4); + uint32_t ballot8 = __ballot_sync(0xffffffff,key & 8); uint32_t bits = ((key & 1) - 1 ^ ballot1) & (!!(key & 2) - 1 ^ ballot2) & (!!(key & 4) - 1 ^ ballot4) & (!!(key & 8) - 1 ^ ballot8); int32_t index = hIt[key & 15] + __popc(bits & laneMask); diff --git a/NvCloth/src/cuda/CuSolver.cpp b/NvCloth/src/cuda/CuSolver.cpp index 7ef1d32..86f32b3 100644 --- a/NvCloth/src/cuda/CuSolver.cpp +++ b/NvCloth/src/cuda/CuSolver.cpp @@ -36,7 +36,7 @@ #include "CuContextLock.h" #include "CuCheckSuccess.h" #include "../IterationState.h" -#include <PsSort.h> +#include "../Ps/PsSort.h" #include <foundation/PxProfiler.h> #if NV_NVTX @@ -489,8 +489,9 @@ void cloth::CuSolver::beginFrame() uint32_t numThreadsPerBlock = mFactory.mMaxThreadsPerBlock / numClothsPerSM & ~31; if (mFactory.mNumThreadsPerBlock != numThreadsPerBlock) { - checkSuccess( - cuFuncSetBlockShape(mKernelFunction, int(mFactory.mNumThreadsPerBlock = numThreadsPerBlock), 1, 1)); + mFactory.mNumThreadsPerBlock = numThreadsPerBlock; + //checkSuccess( + // cuFuncSetBlockShape(mKernelFunction, int(mFactory.mNumThreadsPerBlock = numThreadsPerBlock), 1, 1)); } // remember num cloths per SM in terms of max shared memory per block diff --git a/NvCloth/src/cuda/CuSolverKernel.cu b/NvCloth/src/cuda/CuSolverKernel.cu index 1ad5896..86f038d 100644 --- a/NvCloth/src/cuda/CuSolverKernel.cu +++ b/NvCloth/src/cuda/CuSolverKernel.cu @@ -851,7 +851,7 @@ __device__ void applyImpulse(SharedParticleData::ParticleReferenceType pos, cons float scale = -pos.mReferences[3]; #if CONVERT_ADDRESSES - //Use this instead of atomicAdd function to work around compiler issue treating the pointer as global memory instead of shared memory + // Use this instead of atomicAdd function to work around compiler issue treating the pointer as global memory instead of shared memory asm("red.shared.add.f32 [%0], %1;" ::POINTER_CONSTRAINT(pos.mReferences[0].mPtr), "f"(impulse.x * scale)); asm("red.shared.add.f32 [%0], %1;" ::POINTER_CONSTRAINT(pos.mReferences[1].mPtr), "f"(impulse.y * scale)); asm("red.shared.add.f32 [%0], %1;" ::POINTER_CONSTRAINT(pos.mReferences[2].mPtr), "f"(impulse.z * scale)); @@ -1394,6 +1394,7 @@ __launch_bounds__(512, 1) for (int32_t i = threadIdx.x; i < configDataSize; i += blockDim.x) gSharedUnsigned[i] = reinterpret_cast<const uint32_t*>(gClothData.mPhaseConfigs)[i]; + Pointer<Shared, uint32_t> scratchPtr = Pointer<Shared, uint32_t>( gSharedUnsigned + configDataSize + 4 * gFrameData.mNumSharedPositions * gClothData.mNumParticles); |