From d243404d4ba88bcf53f7310cc8980b4efe38c19f Mon Sep 17 00:00:00 2001 From: Marijn Tamis Date: Mon, 1 Apr 2019 14:21:09 +0200 Subject: 1.1.6 Release. --- NvCloth/src/cuda/CuCollision.h | 58 ++++++++++++++++++++++-------------------- 1 file changed, 31 insertions(+), 27 deletions(-) (limited to 'NvCloth/src/cuda/CuCollision.h') diff --git a/NvCloth/src/cuda/CuCollision.h b/NvCloth/src/cuda/CuCollision.h index f9b69f7..d17e735 100644 --- a/NvCloth/src/cuda/CuCollision.h +++ b/NvCloth/src/cuda/CuCollision.h @@ -310,6 +310,7 @@ __device__ void CuCollision::buildSphereAcceleration(const CollisionData& data) { if (threadIdx.x >= 192) return; + unsigned int mask = __ballot_sync(0xffffffff, !(threadIdx.x >= 192)); int32_t sphereIdx = threadIdx.x & 31; int32_t axisIdx = threadIdx.x >> 6; // coordinate index (x, y, or z) @@ -325,7 +326,7 @@ __device__ void CuCollision::buildSphereAcceleration(const CollisionData& data) Pointer dst = mShapeGrid + sGridSize * axisIdx; // #pragma unroll for (int32_t i = 0; i < sGridSize; ++i, ++index) - dst[i] |= __ballot(int32_t(index) <= 0); + dst[i] |= __ballot_sync(mask, int32_t(index) <= 0); } // generate cone masks from sphere masks @@ -333,7 +334,7 @@ __device__ void CuCollision::buildConeAcceleration() { if (threadIdx.x >= 192) return; - + unsigned int mask = __ballot_sync(0xffffffff, !(threadIdx.x >= 192)); int32_t coneIdx = threadIdx.x & 31; uint32_t sphereMask = @@ -345,7 +346,7 @@ __device__ void CuCollision::buildConeAcceleration() // #pragma unroll for (int32_t i = 0; i < sGridSize; ++i) - dst[i] |= __ballot(src[i] & sphereMask); + dst[i] |= __ballot_sync(mask,src[i] & sphereMask); } // convert right/left mask arrays into single overlap array @@ -366,17 +367,17 @@ __device__ void CuCollision::mergeAcceleration() namespace { #if __CUDA_ARCH__ >= 300 -__device__ float mergeBounds(Pointer buffer) +__device__ float mergeBounds(unsigned int mask, Pointer buffer) { float value = *buffer; - value = max(value, __shfl_down(value, 1)); - value = max(value, __shfl_down(value, 2)); - value = max(value, __shfl_down(value, 4)); - value = max(value, __shfl_down(value, 8)); - return max(value, __shfl_down(value, 16)); + value = max(value, __shfl_down_sync(mask, value, 1)); + value = max(value, __shfl_down_sync(mask, value, 2)); + value = max(value, __shfl_down_sync(mask, value, 4)); + value = max(value, __shfl_down_sync(mask, value, 8)); + return max(value, __shfl_down_sync(mask, value, 16)); } #else -__device__ float mergeBounds(Pointer buffer) +__device__ float mergeBounds(unsigned int mask, Pointer buffer) { // ensure that writes to buffer are visible to all threads __threadfence_block(); @@ -393,6 +394,7 @@ __device__ float mergeBounds(Pointer buffer) __device__ float computeSphereBounds(const CuCollision::CollisionData& data, Pointer buffer) { assert(threadIdx.x < 192); + unsigned int mask = __ballot_sync(0xffffffff, threadIdx.x < 192); int32_t sphereIdx = min(threadIdx.x & 31, gClothData.mNumSpheres - 1); // sphere index int32_t axisIdx = threadIdx.x >> 6; // coordinate index (x, y, or z) @@ -401,7 +403,7 @@ __device__ float computeSphereBounds(const CuCollision::CollisionData& data, Poi *buffer = data.mSphereW[sphereIdx] + signf * data.mSphereX[sphereIdx + gClothData.mNumSpheres * axisIdx]; - return mergeBounds(buffer); + return mergeBounds(mask, buffer); } #if __CUDA_ARCH__ >= 300 @@ -415,6 +417,7 @@ __device__ float computeParticleBounds(const CurrentT& current, Pointer= 192) return 0.0f; - + unsigned int mask = __ballot_sync(0xffffffff, !(threadIdx.x >= 192)); float value = *buffer; if (laneIdx >= (numThreadsPerAxis >> 5)) value = -FLT_MAX; @@ -456,10 +459,10 @@ __device__ float computeParticleBounds(const CurrentT& current, Pointer @@ -467,6 +470,7 @@ __device__ float computeParticleBounds(const CurrentT& current, Pointer= 192) return 0.0f; + unsigned int mask = __ballot_sync(0xffffffff, !(threadIdx.x >= 192)); int32_t axisIdx = threadIdx.x >> 6; // x, y, or z int32_t signi = threadIdx.x << 26; // sign bit (min or max) @@ -480,7 +484,7 @@ __device__ float computeParticleBounds(const CurrentT& current, Pointer