7 files changed, 52 insertions, 46 deletions
diff --git a/NvCloth/src/cuda/CuCollision.h b/NvCloth/src/cuda/CuCollision.h
index f9b69f7..d17e735 100644
--- a/NvCloth/src/cuda/CuCollision.h
+++ b/NvCloth/src/cuda/CuCollision.h
@@ -310,6 +310,7 @@ __device__ void CuCollision::buildSphereAcceleration(const CollisionData& data)
 {
 	if (threadIdx.x >= 192)
 		return;
+	unsigned int mask = __ballot_sync(0xffffffff, !(threadIdx.x >= 192));
 
 	int32_t sphereIdx = threadIdx.x & 31;
 	int32_t axisIdx = threadIdx.x >> 6;             // coordinate index (x, y, or z)
@@ -325,7 +326,7 @@ __device__ void CuCollision::buildSphereAcceleration(const CollisionData& data)
 	Pointer<Shared, uint32_t> dst = mShapeGrid + sGridSize * axisIdx;
 	// #pragma unroll
 	for (int32_t i = 0; i < sGridSize; ++i, ++index)
-		dst[i] |= __ballot(int32_t(index) <= 0);
+		dst[i] |= __ballot_sync(mask, int32_t(index) <= 0);
 }
 
 // generate cone masks from sphere masks
@@ -333,7 +334,7 @@ __device__ void CuCollision::buildConeAcceleration()
 {
 	if (threadIdx.x >= 192)
 		return;
-
+	unsigned int mask = __ballot_sync(0xffffffff, !(threadIdx.x >= 192));
 	int32_t coneIdx = threadIdx.x & 31;
 
 	uint32_t sphereMask =
@@ -345,7 +346,7 @@ __device__ void CuCollision::buildConeAcceleration()
 
 	// #pragma unroll
 	for (int32_t i = 0; i < sGridSize; ++i)
-		dst[i] |= __ballot(src[i] & sphereMask);
+		dst[i] |= __ballot_sync(mask,src[i] & sphereMask);
 }
 
 // convert right/left mask arrays into single overlap array
@@ -366,17 +367,17 @@ __device__ void CuCollision::mergeAcceleration()
 namespace
 {
 #if __CUDA_ARCH__ >= 300
-__device__ float mergeBounds(Pointer<Shared, float> buffer)
+__device__ float mergeBounds(unsigned int mask, Pointer<Shared, float> buffer)
 {
 	float value = *buffer;
-	value = max(value, __shfl_down(value, 1));
-	value = max(value, __shfl_down(value, 2));
-	value = max(value, __shfl_down(value, 4));
-	value = max(value, __shfl_down(value, 8));
-	return max(value, __shfl_down(value, 16));
+	value = max(value, __shfl_down_sync(mask, value, 1));
+	value = max(value, __shfl_down_sync(mask, value, 2));
+	value = max(value, __shfl_down_sync(mask, value, 4));
+	value = max(value, __shfl_down_sync(mask, value, 8));
+	return max(value, __shfl_down_sync(mask, value, 16));
 }
 #else
-__device__ float mergeBounds(Pointer<Shared, float> buffer)
+__device__ float mergeBounds(unsigned int mask, Pointer<Shared, float> buffer)
 {
 	// ensure that writes to buffer are visible to all threads
 	__threadfence_block();
@@ -393,6 +394,7 @@ __device__ float mergeBounds(Pointer<Shared, float> buffer)
 __device__ float computeSphereBounds(const CuCollision::CollisionData& data, Pointer<Shared, float> buffer)
 {
 	assert(threadIdx.x < 192);
+	unsigned int mask = __ballot_sync(0xffffffff, threadIdx.x < 192);
 
 	int32_t sphereIdx = min(threadIdx.x & 31, gClothData.mNumSpheres - 1); // sphere index
 	int32_t axisIdx = threadIdx.x >> 6;                                    // coordinate index (x, y, or z)
@@ -401,7 +403,7 @@ __device__ float computeSphereBounds(const CuCollision::CollisionData& data, Poi
 
 	*buffer = data.mSphereW[sphereIdx] + signf * data.mSphereX[sphereIdx + gClothData.mNumSpheres * axisIdx];
 
-	return mergeBounds(buffer);
+	return mergeBounds(mask, buffer);
 }
 
 #if __CUDA_ARCH__ >= 300
@@ -415,6 +417,7 @@ __device__ float computeParticleBounds(const CurrentT& current, Pointer<Shared,
 
 	if (threadIdxInAxis < numThreadsPerAxis)
 	{
+		unsigned int mask = __ballot_sync(0xffffffff, threadIdxInAxis < numThreadsPerAxis);
 		typename CurrentT::ConstPointerType posIt = current[axis];
 		int32_t i = min(threadIdxInAxis, gClothData.mNumParticles - 1);
 		float minX = posIt[i], maxX = minX;
@@ -425,16 +428,16 @@ __device__ float computeParticleBounds(const CurrentT& current, Pointer<Shared,
 			maxX = max(maxX, posX);
 		}
 
-		minX = min(minX, __shfl_down(minX, 1));
-		maxX = max(maxX, __shfl_down(maxX, 1));
-		minX = min(minX, __shfl_down(minX, 2));
-		maxX = max(maxX, __shfl_down(maxX, 2));
-		minX = min(minX, __shfl_down(minX, 4));
-		maxX = max(maxX, __shfl_down(maxX, 4));
-		minX = min(minX, __shfl_down(minX, 8));
-		maxX = max(maxX, __shfl_down(maxX, 8));
-		minX = min(minX, __shfl_down(minX, 16));
-		maxX = max(maxX, __shfl_down(maxX, 16));
+		minX = min(minX, __shfl_down_sync(mask,minX, 1));
+		maxX = max(maxX, __shfl_down_sync(mask,maxX, 1));
+		minX = min(minX, __shfl_down_sync(mask,minX, 2));
+		maxX = max(maxX, __shfl_down_sync(mask,maxX, 2));
+		minX = min(minX, __shfl_down_sync(mask,minX, 4));
+		maxX = max(maxX, __shfl_down_sync(mask,maxX, 4));
+		minX = min(minX, __shfl_down_sync(mask,minX, 8));
+		maxX = max(maxX, __shfl_down_sync(mask,maxX, 8));
+		minX = min(minX, __shfl_down_sync(mask,minX, 16));
+		maxX = max(maxX, __shfl_down_sync(mask,maxX, 16));
 
 		if (!laneIdx)
 		{
@@ -448,7 +451,7 @@ __device__ float computeParticleBounds(const CurrentT& current, Pointer<Shared,
 
 	if (threadIdx.x >= 192)
 		return 0.0f;
-
+	unsigned int mask = __ballot_sync(0xffffffff, !(threadIdx.x >= 192));
 	float value = *buffer;
 	if (laneIdx >= (numThreadsPerAxis >> 5))
 		value = -FLT_MAX;
@@ -456,10 +459,10 @@ __device__ float computeParticleBounds(const CurrentT& current, Pointer<Shared,
 	// blockDim.x <= 3 * 512, increase to 3 * 1024 by adding a shfl by 16
 	assert(numThreadsPerAxis <= 16 * 32);
 
-	value = max(value, __shfl_down(value, 1));
-	value = max(value, __shfl_down(value, 2));
-	value = max(value, __shfl_down(value, 4));
-	return max(value, __shfl_down(value, 8));
+	value = max(value, __shfl_down_sync(mask,value, 1));
+	value = max(value, __shfl_down_sync(mask,value, 2));
+	value = max(value, __shfl_down_sync(mask,value, 4));
+	return max(value,  __shfl_down_sync(mask,value, 8));
 }
 #else
 template <typename CurrentT>
@@ -467,6 +470,7 @@ __device__ float computeParticleBounds(const CurrentT& current, Pointer<Shared,
 {
 	if (threadIdx.x >= 192)
 		return 0.0f;
+	unsigned int mask = __ballot_sync(0xffffffff, !(threadIdx.x >= 192));
 
 	int32_t axisIdx = threadIdx.x >> 6; // x, y, or z
 	int32_t signi = threadIdx.x << 26;  // sign bit (min or max)
@@ -480,7 +484,7 @@ __device__ float computeParticleBounds(const CurrentT& current, Pointer<Shared,
 	while (pIt += 32, pIt < pEnd)
 		*buffer = max(*buffer, *pIt * signf);
 
-	return mergeBounds(buffer);
+	return mergeBounds(mask, buffer);
 }
 #endif
 }
diff --git a/NvCloth/src/cuda/CuDeviceVector.h b/NvCloth/src/cuda/CuDeviceVector.h
index 4d97e5d..0fabb66 100644
--- a/NvCloth/src/cuda/CuDeviceVector.h
+++ b/NvCloth/src/cuda/CuDeviceVector.h
@@ -30,8 +30,8 @@
 #pragma once
 
 #include "CuDevicePointer.h"
-#include "PsArray.h"
-#include "PsUtilities.h"
+#include "NvCloth/ps/PsArray.h"
+#include "../ps/PsUtilities.h"
 #include <algorithm>
 
 namespace nv
diff --git a/NvCloth/src/cuda/CuFabric.cpp b/NvCloth/src/cuda/CuFabric.cpp
index 6794fa5..48cc0ba 100644
--- a/NvCloth/src/cuda/CuFabric.cpp
+++ b/NvCloth/src/cuda/CuFabric.cpp
@@ -30,7 +30,7 @@
 #include "CuFabric.h"
 #include "CuContextLock.h"
 #include "CuFactory.h"
-#include <PsUtilities.h>
+#include "../ps/PsUtilities.h"
 #include <limits.h>
 
 using namespace physx;
diff --git a/NvCloth/src/cuda/CuFactory.h b/NvCloth/src/cuda/CuFactory.h
index 3ff5752..8fae913 100644
--- a/NvCloth/src/cuda/CuFactory.h
+++ b/NvCloth/src/cuda/CuFactory.h
@@ -30,7 +30,7 @@
 #pragma once
 
 #include "NvCloth/Factory.h"
-#include <PsArray.h>
+#include "NvCloth/ps/PsArray.h"
 #include <foundation/PxVec4.h>
 #include <foundation/PxVec3.h>
 #include <cuda.h>
diff --git a/NvCloth/src/cuda/CuSelfCollision.h b/NvCloth/src/cuda/CuSelfCollision.h
index 19d2723..32aa139 100644
--- a/NvCloth/src/cuda/CuSelfCollision.h
+++ b/NvCloth/src/cuda/CuSelfCollision.h
@@ -39,7 +39,7 @@
 
 namespace
 {
-#if __CUDA_ARCH__ >= 300
+/*#if __CUDA_ARCH__ >= 300
 template <int>
 __device__ void scanWarp(Pointer<Shared, int32_t> counts)
 {
@@ -60,7 +60,7 @@ __device__ void scanWarp(Pointer<Shared, int32_t> counts)
 	             : "+r"(*generic(counts))
 	             :);
 }
-#else
+#else*/
 template <int stride>
 __device__ void scanWarp(Pointer<Shared, int32_t> counts)
 {
@@ -77,7 +77,7 @@ __device__ void scanWarp(Pointer<Shared, int32_t> counts)
 	if (laneIdx >= 16)
 		*ptr += ptr[-16 * stride];
 }
-#endif
+//#endif
 
 // sorts array by upper 16bits
 // [keys] must be at least 2 * n in length, in/out in first n elements
@@ -112,10 +112,10 @@ __device__ void radixSort(int32_t* keys, int32_t n, Pointer<Shared, int32_t> his
 		for (int32_t i = startIndex; i < endIndex; i += 32)
 		{
 			int32_t key = i < n ? srcKeys[i] >> p : 15;
-			uint32_t ballot1 = __ballot(key & 1);
-			uint32_t ballot2 = __ballot(key & 2);
-			uint32_t ballot4 = __ballot(key & 4);
-			uint32_t ballot8 = __ballot(key & 8);
+			uint32_t ballot1 = __ballot_sync(0xffffffff,key & 1);
+			uint32_t ballot2 = __ballot_sync(0xffffffff,key & 2);
+			uint32_t ballot4 = __ballot_sync(0xffffffff,key & 4);
+			uint32_t ballot8 = __ballot_sync(0xffffffff,key & 8);
 			warpCount += __popc((mask1 ^ ballot1) & (mask2 ^ ballot2) & (mask4 ^ ballot4) & (mask8 ^ ballot8));
 		}
 
@@ -147,10 +147,10 @@ __device__ void radixSort(int32_t* keys, int32_t n, Pointer<Shared, int32_t> his
 		for (int32_t i = startIndex; i < endIndex; i += 32)
 		{
 			int32_t key = i < n ? srcKeys[i] >> p : 15;
-			uint32_t ballot1 = __ballot(key & 1);
-			uint32_t ballot2 = __ballot(key & 2);
-			uint32_t ballot4 = __ballot(key & 4);
-			uint32_t ballot8 = __ballot(key & 8);
+			uint32_t ballot1 = __ballot_sync(0xffffffff,key & 1);
+			uint32_t ballot2 = __ballot_sync(0xffffffff,key & 2);
+			uint32_t ballot4 = __ballot_sync(0xffffffff,key & 4);
+			uint32_t ballot8 = __ballot_sync(0xffffffff,key & 8);
 			uint32_t bits = ((key & 1) - 1 ^ ballot1) & (!!(key & 2) - 1 ^ ballot2) & (!!(key & 4) - 1 ^ ballot4) &
 			                (!!(key & 8) - 1 ^ ballot8);
 			int32_t index = hIt[key & 15] + __popc(bits & laneMask);
diff --git a/NvCloth/src/cuda/CuSolver.cpp b/NvCloth/src/cuda/CuSolver.cpp
index 7ef1d32..86f32b3 100644
--- a/NvCloth/src/cuda/CuSolver.cpp
+++ b/NvCloth/src/cuda/CuSolver.cpp
@@ -36,7 +36,7 @@
 #include "CuContextLock.h"
 #include "CuCheckSuccess.h"
 #include "../IterationState.h"
-#include <PsSort.h>
+#include "../Ps/PsSort.h"
 #include <foundation/PxProfiler.h>
 
 #if NV_NVTX
@@ -489,8 +489,9 @@ void cloth::CuSolver::beginFrame()
 		uint32_t numThreadsPerBlock = mFactory.mMaxThreadsPerBlock / numClothsPerSM & ~31;
 		if (mFactory.mNumThreadsPerBlock != numThreadsPerBlock)
 		{
-			checkSuccess(
-			    cuFuncSetBlockShape(mKernelFunction, int(mFactory.mNumThreadsPerBlock = numThreadsPerBlock), 1, 1));
+			mFactory.mNumThreadsPerBlock = numThreadsPerBlock;
+			//checkSuccess(
+			//    cuFuncSetBlockShape(mKernelFunction, int(mFactory.mNumThreadsPerBlock = numThreadsPerBlock), 1, 1));
 		}
 
 		// remember num cloths per SM in terms of max shared memory per block
diff --git a/NvCloth/src/cuda/CuSolverKernel.cu b/NvCloth/src/cuda/CuSolverKernel.cu
index 1ad5896..86f038d 100644
--- a/NvCloth/src/cuda/CuSolverKernel.cu
+++ b/NvCloth/src/cuda/CuSolverKernel.cu
@@ -851,7 +851,7 @@ __device__ void applyImpulse(SharedParticleData::ParticleReferenceType pos, cons
 	float scale = -pos.mReferences[3];
 
 #if CONVERT_ADDRESSES
-	//Use this instead of atomicAdd function to work around compiler issue treating the pointer as global memory instead of shared memory
+	// Use this instead of atomicAdd function to work around compiler issue treating the pointer as global memory instead of shared memory
 	asm("red.shared.add.f32 [%0], %1;" ::POINTER_CONSTRAINT(pos.mReferences[0].mPtr), "f"(impulse.x * scale));
 	asm("red.shared.add.f32 [%0], %1;" ::POINTER_CONSTRAINT(pos.mReferences[1].mPtr), "f"(impulse.y * scale));
 	asm("red.shared.add.f32 [%0], %1;" ::POINTER_CONSTRAINT(pos.mReferences[2].mPtr), "f"(impulse.z * scale));
@@ -1394,6 +1394,7 @@ __launch_bounds__(512, 1)
 	for (int32_t i = threadIdx.x; i < configDataSize; i += blockDim.x)
 		gSharedUnsigned[i] = reinterpret_cast<const uint32_t*>(gClothData.mPhaseConfigs)[i];
 
+
 	Pointer<Shared, uint32_t> scratchPtr = Pointer<Shared, uint32_t>(
 	    gSharedUnsigned + configDataSize + 4 * gFrameData.mNumSharedPositions * gClothData.mNumParticles);