aboutsummaryrefslogtreecommitdiff
path: root/NvCloth/src
diff options
context:
space:
mode:
Diffstat (limited to 'NvCloth/src')
-rw-r--r--NvCloth/src/ClothImpl.h2
-rw-r--r--NvCloth/src/IterationState.h2
-rw-r--r--NvCloth/src/PhaseConfig.cpp2
-rw-r--r--NvCloth/src/SwClothData.cpp2
-rw-r--r--NvCloth/src/SwCollision.cpp2
-rw-r--r--NvCloth/src/SwFabric.cpp4
-rw-r--r--NvCloth/src/SwFabric.h2
-rw-r--r--NvCloth/src/SwFactory.cpp6
-rw-r--r--NvCloth/src/SwFactory.h2
-rw-r--r--NvCloth/src/SwInterCollision.cpp2
-rw-r--r--NvCloth/src/SwSolver.cpp4
-rw-r--r--NvCloth/src/TripletScheduler.cpp2
-rw-r--r--NvCloth/src/cuda/CuCollision.h58
-rw-r--r--NvCloth/src/cuda/CuDeviceVector.h4
-rw-r--r--NvCloth/src/cuda/CuFabric.cpp2
-rw-r--r--NvCloth/src/cuda/CuFactory.h2
-rw-r--r--NvCloth/src/cuda/CuSelfCollision.h22
-rw-r--r--NvCloth/src/cuda/CuSolver.cpp7
-rw-r--r--NvCloth/src/cuda/CuSolverKernel.cu3
-rw-r--r--NvCloth/src/dx/DxFabric.cpp2
-rw-r--r--NvCloth/src/dx/DxSolver.cpp2
-rw-r--r--NvCloth/src/neon/NeonSolverKernel.cpp2
-rw-r--r--NvCloth/src/ps/PsAlloca.h76
-rw-r--r--NvCloth/src/ps/PsFPU.h103
-rw-r--r--NvCloth/src/ps/PsSort.h130
-rw-r--r--NvCloth/src/ps/PsSortInternals.h187
-rw-r--r--NvCloth/src/ps/PsUtilities.h169
-rw-r--r--NvCloth/src/ps/PxIntrinsics.h47
-rw-r--r--NvCloth/src/ps/android/cpu-features.c1082
-rw-r--r--NvCloth/src/ps/android/cpu-features.h208
-rw-r--r--NvCloth/src/ps/unix/PsUnixAtomic.cpp102
-rw-r--r--NvCloth/src/ps/unix/PsUnixFPU.h69
-rw-r--r--NvCloth/src/ps/unix/PsUnixMutex.cpp170
-rw-r--r--NvCloth/src/ps/windows/PsWindowsAtomic.cpp96
-rw-r--r--NvCloth/src/ps/windows/PsWindowsFPU.h51
-rw-r--r--NvCloth/src/ps/windows/PsWindowsInclude.h96
-rw-r--r--NvCloth/src/scalar/SwCollisionHelpers.h2
37 files changed, 2658 insertions, 66 deletions
diff --git a/NvCloth/src/ClothImpl.h b/NvCloth/src/ClothImpl.h
index 6686cd7..de8ac97 100644
--- a/NvCloth/src/ClothImpl.h
+++ b/NvCloth/src/ClothImpl.h
@@ -35,7 +35,7 @@
#include <foundation/PxVec3.h>
#include "IndexPair.h"
#include "MovingAverage.h"
-#include <PsMathUtils.h>
+#include "NvCloth/ps/PsMathUtils.h"
#include <cmath>
namespace nv
diff --git a/NvCloth/src/IterationState.h b/NvCloth/src/IterationState.h
index be046b5..85bfacc 100644
--- a/NvCloth/src/IterationState.h
+++ b/NvCloth/src/IterationState.h
@@ -34,9 +34,9 @@
#include <foundation/PxVec3.h>
#include <foundation/PxMat44.h>
#include <foundation/PxMat33.h>
-#include <PsMathUtils.h>
#include "Vec4T.h"
#include <algorithm>
+#include "NvCloth/ps/PsMathUtils.h"
namespace nv
{
diff --git a/NvCloth/src/PhaseConfig.cpp b/NvCloth/src/PhaseConfig.cpp
index b5db3b0..3397d5d 100644
--- a/NvCloth/src/PhaseConfig.cpp
+++ b/NvCloth/src/PhaseConfig.cpp
@@ -28,7 +28,7 @@
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#include "NvCloth/PhaseConfig.h"
-#include "PsMathUtils.h"
+#include "NvCloth/ps/PsMathUtils.h"
#include <algorithm>
#include "ClothImpl.h"
diff --git a/NvCloth/src/SwClothData.cpp b/NvCloth/src/SwClothData.cpp
index 84d85d4..d346d98 100644
--- a/NvCloth/src/SwClothData.cpp
+++ b/NvCloth/src/SwClothData.cpp
@@ -31,7 +31,7 @@
#include "SwCloth.h"
#include "SwFabric.h"
#include <foundation/Px.h>
-#include <PsUtilities.h>
+#include "ps/PsUtilities.h"
using namespace physx;
using namespace nv;
diff --git a/NvCloth/src/SwCollision.cpp b/NvCloth/src/SwCollision.cpp
index bd6cf7a..5c1466c 100644
--- a/NvCloth/src/SwCollision.cpp
+++ b/NvCloth/src/SwCollision.cpp
@@ -36,7 +36,7 @@
#include "SwCollisionHelpers.h"
#include <foundation/PxProfiler.h>
#include <cstring> // for memset
-#include <PsSort.h>
+#include "ps/PsSort.h"
using namespace nv;
using namespace physx;
diff --git a/NvCloth/src/SwFabric.cpp b/NvCloth/src/SwFabric.cpp
index b8d617f..bf4b110 100644
--- a/NvCloth/src/SwFabric.cpp
+++ b/NvCloth/src/SwFabric.cpp
@@ -29,10 +29,10 @@
#include "SwFabric.h"
#include "SwFactory.h"
-#include "PsSort.h"
+#include "ps/PsSort.h"
#include "limits.h" // for USHRT_MAX
#include <algorithm>
-#include "PsUtilities.h"
+#include "ps/PsUtilities.h"
using namespace nv;
using namespace physx;
diff --git a/NvCloth/src/SwFabric.h b/NvCloth/src/SwFabric.h
index 29d0921..da2b9b5 100644
--- a/NvCloth/src/SwFabric.h
+++ b/NvCloth/src/SwFabric.h
@@ -57,7 +57,7 @@ class SwFabric : public Fabric
#endif
SwFabric(SwFactory& factory, uint32_t numParticles, Range<const uint32_t> phasesIndices, Range<const uint32_t> sets,
- Range<const float> restvalues, Range<const float> stiffnessValues, Range<const uint32_t> indices, Range<const uint32_t> anchors,
+ Range<const float> restvalues, Range<const float> stiffnessValues, Range<const uint32_t> indices, Range<const uint32_t> anchors,
Range<const float> tetherLengths, Range<const uint32_t> triangles, uint32_t id);
SwFabric& operator = (const SwFabric&);
diff --git a/NvCloth/src/SwFactory.cpp b/NvCloth/src/SwFactory.cpp
index 5c4b625..de4ff3b 100644
--- a/NvCloth/src/SwFactory.cpp
+++ b/NvCloth/src/SwFactory.cpp
@@ -55,12 +55,12 @@ cloth::SwFactory::~SwFactory()
}
cloth::Fabric* cloth::SwFactory::createFabric(uint32_t numParticles, Range<const uint32_t> phaseIndices,
- Range<const uint32_t> sets, Range<const float> restvalues, Range<const float> stiffnessValues,
- Range<const uint32_t> indices, Range<const uint32_t> anchors,
+ Range<const uint32_t> sets, Range<const float> restvalues, Range<const float> stiffnessValues,
+ Range<const uint32_t> indices, Range<const uint32_t> anchors,
Range<const float> tetherLengths, Range<const uint32_t> triangles)
{
return NV_CLOTH_NEW(SwFabric)(*this, numParticles, phaseIndices, sets, restvalues, stiffnessValues, indices, anchors, tetherLengths, triangles,
- getNextFabricId());
+ getNextFabricId());
}
cloth::Cloth* cloth::SwFactory::createCloth(Range<const PxVec4> particles, Fabric& fabric)
diff --git a/NvCloth/src/SwFactory.h b/NvCloth/src/SwFactory.h
index 4cbee0c..183efb9 100644
--- a/NvCloth/src/SwFactory.h
+++ b/NvCloth/src/SwFactory.h
@@ -56,7 +56,7 @@ class SwFactory : public Factory
virtual Platform getPlatform() const { return Platform::CPU; }
virtual Fabric* createFabric(uint32_t numParticles, Range<const uint32_t> phaseIndices, Range<const uint32_t> sets,
- Range<const float> restvalues, Range<const float> stiffnessValues, Range<const uint32_t> indices,
+ Range<const float> restvalues, Range<const float> stiffnessValues, Range<const uint32_t> indices,
Range<const uint32_t> anchors, Range<const float> tetherLengths,
Range<const uint32_t> triangles);
diff --git a/NvCloth/src/SwInterCollision.cpp b/NvCloth/src/SwInterCollision.cpp
index bc46ea6..efa57ba 100644
--- a/NvCloth/src/SwInterCollision.cpp
+++ b/NvCloth/src/SwInterCollision.cpp
@@ -34,7 +34,7 @@
#include <foundation/PxMat44.h>
#include <foundation/PxBounds3.h>
#include <algorithm>
-#include <PsSort.h>
+#include "ps/PsSort.h"
#include "NvCloth/Allocator.h"
using namespace nv;
diff --git a/NvCloth/src/SwSolver.cpp b/NvCloth/src/SwSolver.cpp
index f0f9152..a1f0cf8 100644
--- a/NvCloth/src/SwSolver.cpp
+++ b/NvCloth/src/SwSolver.cpp
@@ -36,8 +36,8 @@
#include "SwClothData.h"
#include "SwSolverKernel.h"
#include "SwInterCollision.h"
-#include <PsFPU.h>
-#include <PsSort.h>
+#include "ps/PsFPU.h"
+#include "ps/PsSort.h"
using namespace physx;
diff --git a/NvCloth/src/TripletScheduler.cpp b/NvCloth/src/TripletScheduler.cpp
index 0116200..10ad430 100644
--- a/NvCloth/src/TripletScheduler.cpp
+++ b/NvCloth/src/TripletScheduler.cpp
@@ -29,7 +29,7 @@
#include "TripletScheduler.h"
#include <algorithm>
-#include <PsUtilities.h>
+#include "ps/PsUtilities.h"
using namespace physx;
using namespace nv;
diff --git a/NvCloth/src/cuda/CuCollision.h b/NvCloth/src/cuda/CuCollision.h
index f9b69f7..d17e735 100644
--- a/NvCloth/src/cuda/CuCollision.h
+++ b/NvCloth/src/cuda/CuCollision.h
@@ -310,6 +310,7 @@ __device__ void CuCollision::buildSphereAcceleration(const CollisionData& data)
{
if (threadIdx.x >= 192)
return;
+ unsigned int mask = __ballot_sync(0xffffffff, !(threadIdx.x >= 192));
int32_t sphereIdx = threadIdx.x & 31;
int32_t axisIdx = threadIdx.x >> 6; // coordinate index (x, y, or z)
@@ -325,7 +326,7 @@ __device__ void CuCollision::buildSphereAcceleration(const CollisionData& data)
Pointer<Shared, uint32_t> dst = mShapeGrid + sGridSize * axisIdx;
// #pragma unroll
for (int32_t i = 0; i < sGridSize; ++i, ++index)
- dst[i] |= __ballot(int32_t(index) <= 0);
+ dst[i] |= __ballot_sync(mask, int32_t(index) <= 0);
}
// generate cone masks from sphere masks
@@ -333,7 +334,7 @@ __device__ void CuCollision::buildConeAcceleration()
{
if (threadIdx.x >= 192)
return;
-
+ unsigned int mask = __ballot_sync(0xffffffff, !(threadIdx.x >= 192));
int32_t coneIdx = threadIdx.x & 31;
uint32_t sphereMask =
@@ -345,7 +346,7 @@ __device__ void CuCollision::buildConeAcceleration()
// #pragma unroll
for (int32_t i = 0; i < sGridSize; ++i)
- dst[i] |= __ballot(src[i] & sphereMask);
+ dst[i] |= __ballot_sync(mask,src[i] & sphereMask);
}
// convert right/left mask arrays into single overlap array
@@ -366,17 +367,17 @@ __device__ void CuCollision::mergeAcceleration()
namespace
{
#if __CUDA_ARCH__ >= 300
-__device__ float mergeBounds(Pointer<Shared, float> buffer)
+__device__ float mergeBounds(unsigned int mask, Pointer<Shared, float> buffer)
{
float value = *buffer;
- value = max(value, __shfl_down(value, 1));
- value = max(value, __shfl_down(value, 2));
- value = max(value, __shfl_down(value, 4));
- value = max(value, __shfl_down(value, 8));
- return max(value, __shfl_down(value, 16));
+ value = max(value, __shfl_down_sync(mask, value, 1));
+ value = max(value, __shfl_down_sync(mask, value, 2));
+ value = max(value, __shfl_down_sync(mask, value, 4));
+ value = max(value, __shfl_down_sync(mask, value, 8));
+ return max(value, __shfl_down_sync(mask, value, 16));
}
#else
-__device__ float mergeBounds(Pointer<Shared, float> buffer)
+__device__ float mergeBounds(unsigned int mask, Pointer<Shared, float> buffer)
{
// ensure that writes to buffer are visible to all threads
__threadfence_block();
@@ -393,6 +394,7 @@ __device__ float mergeBounds(Pointer<Shared, float> buffer)
__device__ float computeSphereBounds(const CuCollision::CollisionData& data, Pointer<Shared, float> buffer)
{
assert(threadIdx.x < 192);
+ unsigned int mask = __ballot_sync(0xffffffff, threadIdx.x < 192);
int32_t sphereIdx = min(threadIdx.x & 31, gClothData.mNumSpheres - 1); // sphere index
int32_t axisIdx = threadIdx.x >> 6; // coordinate index (x, y, or z)
@@ -401,7 +403,7 @@ __device__ float computeSphereBounds(const CuCollision::CollisionData& data, Poi
*buffer = data.mSphereW[sphereIdx] + signf * data.mSphereX[sphereIdx + gClothData.mNumSpheres * axisIdx];
- return mergeBounds(buffer);
+ return mergeBounds(mask, buffer);
}
#if __CUDA_ARCH__ >= 300
@@ -415,6 +417,7 @@ __device__ float computeParticleBounds(const CurrentT& current, Pointer<Shared,
if (threadIdxInAxis < numThreadsPerAxis)
{
+ unsigned int mask = __ballot_sync(0xffffffff, threadIdxInAxis < numThreadsPerAxis);
typename CurrentT::ConstPointerType posIt = current[axis];
int32_t i = min(threadIdxInAxis, gClothData.mNumParticles - 1);
float minX = posIt[i], maxX = minX;
@@ -425,16 +428,16 @@ __device__ float computeParticleBounds(const CurrentT& current, Pointer<Shared,
maxX = max(maxX, posX);
}
- minX = min(minX, __shfl_down(minX, 1));
- maxX = max(maxX, __shfl_down(maxX, 1));
- minX = min(minX, __shfl_down(minX, 2));
- maxX = max(maxX, __shfl_down(maxX, 2));
- minX = min(minX, __shfl_down(minX, 4));
- maxX = max(maxX, __shfl_down(maxX, 4));
- minX = min(minX, __shfl_down(minX, 8));
- maxX = max(maxX, __shfl_down(maxX, 8));
- minX = min(minX, __shfl_down(minX, 16));
- maxX = max(maxX, __shfl_down(maxX, 16));
+ minX = min(minX, __shfl_down_sync(mask,minX, 1));
+ maxX = max(maxX, __shfl_down_sync(mask,maxX, 1));
+ minX = min(minX, __shfl_down_sync(mask,minX, 2));
+ maxX = max(maxX, __shfl_down_sync(mask,maxX, 2));
+ minX = min(minX, __shfl_down_sync(mask,minX, 4));
+ maxX = max(maxX, __shfl_down_sync(mask,maxX, 4));
+ minX = min(minX, __shfl_down_sync(mask,minX, 8));
+ maxX = max(maxX, __shfl_down_sync(mask,maxX, 8));
+ minX = min(minX, __shfl_down_sync(mask,minX, 16));
+ maxX = max(maxX, __shfl_down_sync(mask,maxX, 16));
if (!laneIdx)
{
@@ -448,7 +451,7 @@ __device__ float computeParticleBounds(const CurrentT& current, Pointer<Shared,
if (threadIdx.x >= 192)
return 0.0f;
-
+ unsigned int mask = __ballot_sync(0xffffffff, !(threadIdx.x >= 192));
float value = *buffer;
if (laneIdx >= (numThreadsPerAxis >> 5))
value = -FLT_MAX;
@@ -456,10 +459,10 @@ __device__ float computeParticleBounds(const CurrentT& current, Pointer<Shared,
// blockDim.x <= 3 * 512, increase to 3 * 1024 by adding a shfl by 16
assert(numThreadsPerAxis <= 16 * 32);
- value = max(value, __shfl_down(value, 1));
- value = max(value, __shfl_down(value, 2));
- value = max(value, __shfl_down(value, 4));
- return max(value, __shfl_down(value, 8));
+ value = max(value, __shfl_down_sync(mask,value, 1));
+ value = max(value, __shfl_down_sync(mask,value, 2));
+ value = max(value, __shfl_down_sync(mask,value, 4));
+ return max(value, __shfl_down_sync(mask,value, 8));
}
#else
template <typename CurrentT>
@@ -467,6 +470,7 @@ __device__ float computeParticleBounds(const CurrentT& current, Pointer<Shared,
{
if (threadIdx.x >= 192)
return 0.0f;
+ unsigned int mask = __ballot_sync(0xffffffff, !(threadIdx.x >= 192));
int32_t axisIdx = threadIdx.x >> 6; // x, y, or z
int32_t signi = threadIdx.x << 26; // sign bit (min or max)
@@ -480,7 +484,7 @@ __device__ float computeParticleBounds(const CurrentT& current, Pointer<Shared,
while (pIt += 32, pIt < pEnd)
*buffer = max(*buffer, *pIt * signf);
- return mergeBounds(buffer);
+ return mergeBounds(mask, buffer);
}
#endif
}
diff --git a/NvCloth/src/cuda/CuDeviceVector.h b/NvCloth/src/cuda/CuDeviceVector.h
index 4d97e5d..0fabb66 100644
--- a/NvCloth/src/cuda/CuDeviceVector.h
+++ b/NvCloth/src/cuda/CuDeviceVector.h
@@ -30,8 +30,8 @@
#pragma once
#include "CuDevicePointer.h"
-#include "PsArray.h"
-#include "PsUtilities.h"
+#include "NvCloth/ps/PsArray.h"
+#include "../ps/PsUtilities.h"
#include <algorithm>
namespace nv
diff --git a/NvCloth/src/cuda/CuFabric.cpp b/NvCloth/src/cuda/CuFabric.cpp
index 6794fa5..48cc0ba 100644
--- a/NvCloth/src/cuda/CuFabric.cpp
+++ b/NvCloth/src/cuda/CuFabric.cpp
@@ -30,7 +30,7 @@
#include "CuFabric.h"
#include "CuContextLock.h"
#include "CuFactory.h"
-#include <PsUtilities.h>
+#include "../ps/PsUtilities.h"
#include <limits.h>
using namespace physx;
diff --git a/NvCloth/src/cuda/CuFactory.h b/NvCloth/src/cuda/CuFactory.h
index 3ff5752..8fae913 100644
--- a/NvCloth/src/cuda/CuFactory.h
+++ b/NvCloth/src/cuda/CuFactory.h
@@ -30,7 +30,7 @@
#pragma once
#include "NvCloth/Factory.h"
-#include <PsArray.h>
+#include "NvCloth/ps/PsArray.h"
#include <foundation/PxVec4.h>
#include <foundation/PxVec3.h>
#include <cuda.h>
diff --git a/NvCloth/src/cuda/CuSelfCollision.h b/NvCloth/src/cuda/CuSelfCollision.h
index 19d2723..32aa139 100644
--- a/NvCloth/src/cuda/CuSelfCollision.h
+++ b/NvCloth/src/cuda/CuSelfCollision.h
@@ -39,7 +39,7 @@
namespace
{
-#if __CUDA_ARCH__ >= 300
+/*#if __CUDA_ARCH__ >= 300
template <int>
__device__ void scanWarp(Pointer<Shared, int32_t> counts)
{
@@ -60,7 +60,7 @@ __device__ void scanWarp(Pointer<Shared, int32_t> counts)
: "+r"(*generic(counts))
:);
}
-#else
+#else*/
template <int stride>
__device__ void scanWarp(Pointer<Shared, int32_t> counts)
{
@@ -77,7 +77,7 @@ __device__ void scanWarp(Pointer<Shared, int32_t> counts)
if (laneIdx >= 16)
*ptr += ptr[-16 * stride];
}
-#endif
+//#endif
// sorts array by upper 16bits
// [keys] must be at least 2 * n in length, in/out in first n elements
@@ -112,10 +112,10 @@ __device__ void radixSort(int32_t* keys, int32_t n, Pointer<Shared, int32_t> his
for (int32_t i = startIndex; i < endIndex; i += 32)
{
int32_t key = i < n ? srcKeys[i] >> p : 15;
- uint32_t ballot1 = __ballot(key & 1);
- uint32_t ballot2 = __ballot(key & 2);
- uint32_t ballot4 = __ballot(key & 4);
- uint32_t ballot8 = __ballot(key & 8);
+ uint32_t ballot1 = __ballot_sync(0xffffffff,key & 1);
+ uint32_t ballot2 = __ballot_sync(0xffffffff,key & 2);
+ uint32_t ballot4 = __ballot_sync(0xffffffff,key & 4);
+ uint32_t ballot8 = __ballot_sync(0xffffffff,key & 8);
warpCount += __popc((mask1 ^ ballot1) & (mask2 ^ ballot2) & (mask4 ^ ballot4) & (mask8 ^ ballot8));
}
@@ -147,10 +147,10 @@ __device__ void radixSort(int32_t* keys, int32_t n, Pointer<Shared, int32_t> his
for (int32_t i = startIndex; i < endIndex; i += 32)
{
int32_t key = i < n ? srcKeys[i] >> p : 15;
- uint32_t ballot1 = __ballot(key & 1);
- uint32_t ballot2 = __ballot(key & 2);
- uint32_t ballot4 = __ballot(key & 4);
- uint32_t ballot8 = __ballot(key & 8);
+ uint32_t ballot1 = __ballot_sync(0xffffffff,key & 1);
+ uint32_t ballot2 = __ballot_sync(0xffffffff,key & 2);
+ uint32_t ballot4 = __ballot_sync(0xffffffff,key & 4);
+ uint32_t ballot8 = __ballot_sync(0xffffffff,key & 8);
uint32_t bits = ((key & 1) - 1 ^ ballot1) & (!!(key & 2) - 1 ^ ballot2) & (!!(key & 4) - 1 ^ ballot4) &
(!!(key & 8) - 1 ^ ballot8);
int32_t index = hIt[key & 15] + __popc(bits & laneMask);
diff --git a/NvCloth/src/cuda/CuSolver.cpp b/NvCloth/src/cuda/CuSolver.cpp
index 7ef1d32..86f32b3 100644
--- a/NvCloth/src/cuda/CuSolver.cpp
+++ b/NvCloth/src/cuda/CuSolver.cpp
@@ -36,7 +36,7 @@
#include "CuContextLock.h"
#include "CuCheckSuccess.h"
#include "../IterationState.h"
-#include <PsSort.h>
+#include "../ps/PsSort.h"
#include <foundation/PxProfiler.h>
#if NV_NVTX
@@ -489,8 +489,9 @@ void cloth::CuSolver::beginFrame()
uint32_t numThreadsPerBlock = mFactory.mMaxThreadsPerBlock / numClothsPerSM & ~31;
if (mFactory.mNumThreadsPerBlock != numThreadsPerBlock)
{
- checkSuccess(
- cuFuncSetBlockShape(mKernelFunction, int(mFactory.mNumThreadsPerBlock = numThreadsPerBlock), 1, 1));
+ mFactory.mNumThreadsPerBlock = numThreadsPerBlock;
+ //checkSuccess(
+ // cuFuncSetBlockShape(mKernelFunction, int(mFactory.mNumThreadsPerBlock = numThreadsPerBlock), 1, 1));
}
// remember num cloths per SM in terms of max shared memory per block
diff --git a/NvCloth/src/cuda/CuSolverKernel.cu b/NvCloth/src/cuda/CuSolverKernel.cu
index 1ad5896..86f038d 100644
--- a/NvCloth/src/cuda/CuSolverKernel.cu
+++ b/NvCloth/src/cuda/CuSolverKernel.cu
@@ -851,7 +851,7 @@ __device__ void applyImpulse(SharedParticleData::ParticleReferenceType pos, cons
float scale = -pos.mReferences[3];
#if CONVERT_ADDRESSES
- //Use this instead of atomicAdd function to work around compiler issue treating the pointer as global memory instead of shared memory
+ // Use this instead of atomicAdd function to work around compiler issue treating the pointer as global memory instead of shared memory
asm("red.shared.add.f32 [%0], %1;" ::POINTER_CONSTRAINT(pos.mReferences[0].mPtr), "f"(impulse.x * scale));
asm("red.shared.add.f32 [%0], %1;" ::POINTER_CONSTRAINT(pos.mReferences[1].mPtr), "f"(impulse.y * scale));
asm("red.shared.add.f32 [%0], %1;" ::POINTER_CONSTRAINT(pos.mReferences[2].mPtr), "f"(impulse.z * scale));
@@ -1394,6 +1394,7 @@ __launch_bounds__(512, 1)
for (int32_t i = threadIdx.x; i < configDataSize; i += blockDim.x)
gSharedUnsigned[i] = reinterpret_cast<const uint32_t*>(gClothData.mPhaseConfigs)[i];
+
Pointer<Shared, uint32_t> scratchPtr = Pointer<Shared, uint32_t>(
gSharedUnsigned + configDataSize + 4 * gFrameData.mNumSharedPositions * gClothData.mNumParticles);
diff --git a/NvCloth/src/dx/DxFabric.cpp b/NvCloth/src/dx/DxFabric.cpp
index cf6865a..f5adeaf 100644
--- a/NvCloth/src/dx/DxFabric.cpp
+++ b/NvCloth/src/dx/DxFabric.cpp
@@ -31,7 +31,7 @@
#include "DxContextLock.h"
#include "DxFactory.h"
#include <algorithm>
-#include <PsUtilities.h>
+#include "ps/PsUtilities.h"
#if NV_CLOTH_ENABLE_DX11
diff --git a/NvCloth/src/dx/DxSolver.cpp b/NvCloth/src/dx/DxSolver.cpp
index 21caa0b..c341aa8 100644
--- a/NvCloth/src/dx/DxSolver.cpp
+++ b/NvCloth/src/dx/DxSolver.cpp
@@ -34,7 +34,7 @@
#include "DxFactory.h"
#include "DxContextLock.h"
#include "../IterationState.h"
-#include <PsSort.h>
+#include "../ps/PsSort.h"
#include <foundation/PxProfiler.h>
#if NV_CLOTH_ENABLE_DX11
diff --git a/NvCloth/src/neon/NeonSolverKernel.cpp b/NvCloth/src/neon/NeonSolverKernel.cpp
index 3e16b6f..6eb26cf 100644
--- a/NvCloth/src/neon/NeonSolverKernel.cpp
+++ b/NvCloth/src/neon/NeonSolverKernel.cpp
@@ -33,7 +33,7 @@
#include "SwSolverKernel.cpp"
-#include <cpu-features.h>
+#include "../ps/android/cpu-features.h"
namespace
{
diff --git a/NvCloth/src/ps/PsAlloca.h b/NvCloth/src/ps/PsAlloca.h
new file mode 100644
index 0000000..75a7bb2
--- /dev/null
+++ b/NvCloth/src/ps/PsAlloca.h
@@ -0,0 +1,76 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef PSFOUNDATION_PSALLOCA_H
+#define PSFOUNDATION_PSALLOCA_H
+
+#include "NvCloth/Allocator.h"
+
+namespace physx
+{
+namespace shdfnd
+{
+template <typename T, typename Unused = void>
+class ScopedPointer
+{
+ public:
+ ~ScopedPointer()
+ {
+ if(mOwned)
+ GetNvClothAllocator()->deallocate(mPointer);
+ }
+
+ operator T*() const
+ {
+ return mPointer;
+ }
+
+ T* mPointer;
+ bool mOwned;
+};
+
+} // namespace shdfnd
+} // namespace physx
+
+/*! Stack allocation for \c count instances of \c type. Falling back to temp allocator if using more than 1kB. */
+#ifdef __SPU__
+#define PX_ALLOCA(var, type, count) type* var = reinterpret_cast<type*>(PxAlloca(sizeof(type) * (count)))
+#else
+#define PX_ALLOCA(var, type, count) \
+ physx::shdfnd::ScopedPointer<type> var; \
+ { \
+ uint32_t size = sizeof(type) * (count); \
+ var.mOwned = size > 1024; \
+ if(var.mOwned) \
+ var.mPointer = reinterpret_cast<type*>(GetNvClothAllocator()->allocate(size,#type,__FILE__,__LINE__)); \
+ else \
+ var.mPointer = reinterpret_cast<type*>(PxAlloca(size)); \
+ }
+#endif
+#endif // #ifndef PSFOUNDATION_PSALLOCA_H
diff --git a/NvCloth/src/ps/PsFPU.h b/NvCloth/src/ps/PsFPU.h
new file mode 100644
index 0000000..82b7ff2
--- /dev/null
+++ b/NvCloth/src/ps/PsFPU.h
@@ -0,0 +1,103 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef PSFOUNDATION_PSFPU_H
+#define PSFOUNDATION_PSFPU_H
+
+#include "NvCloth/ps/Ps.h"
+#include "NvCloth/ps/PsIntrinsics.h"
+
+#define PX_IR(x) ((uint32_t&)(x))
+#define PX_SIR(x) ((int32_t&)(x))
+#define PX_FR(x) ((float&)(x))
+
+// signed integer representation of a floating-point value.
+
+// Floating-point representation of a integer value.
+
+#define PX_SIGN_BITMASK 0x80000000
+
+#define PX_FPU_GUARD shdfnd::FPUGuard scopedFpGuard;
+#define PX_SIMD_GUARD shdfnd::SIMDGuard scopedFpGuard;
+
+#define PX_SUPPORT_GUARDS (PX_WINDOWS_FAMILY || PX_XBOXONE || (PX_LINUX && (PX_X86 || PX_X64)) || PX_PS4 || PX_OSX)
+
+namespace physx
+{
+namespace shdfnd
+{
+// sets the default SDK state for scalar and SIMD units
+class NV_CLOTH_IMPORT FPUGuard
+{
+ public:
+ FPUGuard(); // set fpu control word for PhysX
+ ~FPUGuard(); // restore fpu control word
+ private:
+ uint32_t mControlWords[8];
+};
+
+// sets default SDK state for simd unit only, lighter weight than FPUGuard
+class SIMDGuard
+{
+ public:
+ PX_INLINE SIMDGuard(); // set simd control word for PhysX
+ PX_INLINE ~SIMDGuard(); // restore simd control word
+ private:
+#if PX_SUPPORT_GUARDS
+ uint32_t mControlWord;
+#endif
+};
+
+/**
+\brief Enables floating point exceptions for the scalar and SIMD unit
+*/
+NV_CLOTH_IMPORT void enableFPExceptions();
+
+/**
+\brief Disables floating point exceptions for the scalar and SIMD unit
+*/
+NV_CLOTH_IMPORT void disableFPExceptions();
+
+} // namespace shdfnd
+} // namespace physx
+
+#if PX_WINDOWS_FAMILY || PX_XBOXONE
+#include "windows/PsWindowsFPU.h"
+#elif (PX_LINUX && PX_SSE2) || PX_PS4 || PX_OSX
+#include "unix/PsUnixFPU.h"
+#else
+PX_INLINE physx::shdfnd::SIMDGuard::SIMDGuard()
+{
+}
+PX_INLINE physx::shdfnd::SIMDGuard::~SIMDGuard()
+{
+}
+#endif
+
+#endif // #ifndef PSFOUNDATION_PSFPU_H
diff --git a/NvCloth/src/ps/PsSort.h b/NvCloth/src/ps/PsSort.h
new file mode 100644
index 0000000..8667244
--- /dev/null
+++ b/NvCloth/src/ps/PsSort.h
@@ -0,0 +1,130 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef PSFOUNDATION_PSSORT_H
+#define PSFOUNDATION_PSSORT_H
+
+/** \addtogroup foundation
+@{
+*/
+
+#include "PsSortInternals.h"
+#include "PsAlloca.h"
+
+#define PX_SORT_PARANOIA PX_DEBUG
+
+/**
+\brief Sorts an array of objects in ascending order, assuming
+that the predicate implements the < operator:
+
+\see Less, Greater
+*/
+
+#if PX_VC
+#pragma warning(push)
+#pragma warning(disable : 4706) // disable the warning that we did an assignment within a conditional expression, as
+// this was intentional.
+#endif
+
+namespace physx
+{
+namespace shdfnd
+{
+template <class T, class Predicate, class Allocator>
+void sort(T* elements, uint32_t count, const Predicate& compare, const Allocator& inAllocator,
+ const uint32_t initialStackSize = 32)
+{
+ static const uint32_t SMALL_SORT_CUTOFF = 5; // must be >= 3 since we need 3 for median
+
+ PX_ALLOCA(stackMem, int32_t, initialStackSize);
+ internal::Stack<Allocator> stack(stackMem, initialStackSize, inAllocator);
+
+ int32_t first = 0, last = int32_t(count - 1);
+ if(last > first)
+ {
+ for(;;)
+ {
+ while(last > first)
+ {
+ NV_CLOTH_ASSERT(first >= 0 && last < int32_t(count));
+ if(uint32_t(last - first) < SMALL_SORT_CUTOFF)
+ {
+ internal::smallSort(elements, first, last, compare);
+ break;
+ }
+ else
+ {
+ const int32_t partIndex = internal::partition(elements, first, last, compare);
+
+ // push smaller sublist to minimize stack usage
+ if((partIndex - first) < (last - partIndex))
+ {
+ stack.push(first, partIndex - 1);
+ first = partIndex + 1;
+ }
+ else
+ {
+ stack.push(partIndex + 1, last);
+ last = partIndex - 1;
+ }
+ }
+ }
+
+ if(stack.empty())
+ break;
+
+ stack.pop(first, last);
+ }
+ }
+#if PX_SORT_PARANOIA
+ for(uint32_t i = 1; i < count; i++)
+ NV_CLOTH_ASSERT(!compare(elements[i], elements[i - 1]));
+#endif
+}
+
+template <class T, class Predicate>
+void sort(T* elements, uint32_t count, const Predicate& compare)
+{
+ sort(elements, count, compare, typename shdfnd::AllocatorTraits<T>::Type());
+}
+
+template <class T>
+void sort(T* elements, uint32_t count)
+{
+ sort(elements, count, shdfnd::Less<T>(), typename shdfnd::AllocatorTraits<T>::Type());
+}
+
+} // namespace shdfnd
+} // namespace physx
+
+#if PX_VC
+#pragma warning(pop)
+#endif
+
+#endif // #ifndef PSFOUNDATION_PSSORT_H
diff --git a/NvCloth/src/ps/PsSortInternals.h b/NvCloth/src/ps/PsSortInternals.h
new file mode 100644
index 0000000..c7a7703
--- /dev/null
+++ b/NvCloth/src/ps/PsSortInternals.h
@@ -0,0 +1,187 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef PSFOUNDATION_PSSORTINTERNALS_H
+#define PSFOUNDATION_PSSORTINTERNALS_H
+
+/** \addtogroup foundation
+@{
+*/
+
+#include "ps/PxIntrinsics.h"
+#include "NvCloth/ps/PsBasicTemplates.h"
+#include "NvCloth/ps/PsUserAllocated.h"
+
+namespace physx
+{
+namespace shdfnd
+{
+namespace internal
+{
+template <class T, class Predicate>
+PX_INLINE void median3(T* elements, int32_t first, int32_t last, Predicate& compare)
+{
+ /*
+ This creates sentinels because we know there is an element at the start minimum(or equal)
+ than the pivot and an element at the end greater(or equal) than the pivot. Plus the
+	median of 3 reduces the chance of degenerate behaviour.
+ */
+
+ int32_t mid = (first + last) / 2;
+
+ if(compare(elements[mid], elements[first]))
+ swap(elements[first], elements[mid]);
+
+ if(compare(elements[last], elements[first]))
+ swap(elements[first], elements[last]);
+
+ if(compare(elements[last], elements[mid]))
+ swap(elements[mid], elements[last]);
+
+ // keep the pivot at last-1
+ swap(elements[mid], elements[last - 1]);
+}
+
+template <class T, class Predicate>
+PX_INLINE int32_t partition(T* elements, int32_t first, int32_t last, Predicate& compare)
+{
+ median3(elements, first, last, compare);
+
+ /*
+ WARNING: using the line:
+
+ T partValue = elements[last-1];
+
+ and changing the scan loops to:
+
+ while(comparator.greater(partValue, elements[++i]));
+ while(comparator.greater(elements[--j], partValue);
+
+ triggers a compiler optimizer bug on xenon where it stores a double to the stack for partValue
+ then loads it as a single...:-(
+ */
+
+ int32_t i = first; // we know first is less than pivot(but i gets pre incremented)
+ int32_t j = last - 1; // pivot is in last-1 (but j gets pre decremented)
+
+ for(;;)
+ {
+ while(compare(elements[++i], elements[last - 1]))
+ ;
+ while(compare(elements[last - 1], elements[--j]))
+ ;
+
+ if(i >= j)
+ break;
+
+ NV_CLOTH_ASSERT(i <= last && j >= first);
+ swap(elements[i], elements[j]);
+ }
+ // put the pivot in place
+
+ NV_CLOTH_ASSERT(i <= last && first <= (last - 1));
+ swap(elements[i], elements[last - 1]);
+
+ return i;
+}
+
+template <class T, class Predicate>
+PX_INLINE void smallSort(T* elements, int32_t first, int32_t last, Predicate& compare)
+{
+ // selection sort - could reduce to fsel on 360 with floats.
+
+ for(int32_t i = first; i < last; i++)
+ {
+ int32_t m = i;
+ for(int32_t j = i + 1; j <= last; j++)
+ if(compare(elements[j], elements[m]))
+ m = j;
+
+ if(m != i)
+ swap(elements[m], elements[i]);
+ }
+}
+
+template <class Allocator>
+class Stack
+{
+ Allocator mAllocator;
+ uint32_t mSize, mCapacity;
+ int32_t* mMemory;
+ bool mRealloc;
+
+ public:
+ Stack(int32_t* memory, uint32_t capacity, const Allocator& inAllocator)
+ : mAllocator(inAllocator), mSize(0), mCapacity(capacity), mMemory(memory), mRealloc(false)
+ {
+ }
+ ~Stack()
+ {
+ if(mRealloc)
+ mAllocator.deallocate(mMemory);
+ }
+
+ void grow()
+ {
+ mCapacity *= 2;
+ int32_t* newMem =
+ reinterpret_cast<int32_t*>(mAllocator.allocate(sizeof(int32_t) * mCapacity, __FILE__, __LINE__));
+ intrinsics::memCopy(newMem, mMemory, mSize * sizeof(int32_t));
+ if(mRealloc)
+ mAllocator.deallocate(mMemory);
+ mRealloc = true;
+ mMemory = newMem;
+ }
+
+ PX_INLINE void push(int32_t start, int32_t end)
+ {
+ if(mSize >= mCapacity - 1)
+ grow();
+ mMemory[mSize++] = start;
+ mMemory[mSize++] = end;
+ }
+
+ PX_INLINE void pop(int32_t& start, int32_t& end)
+ {
+ NV_CLOTH_ASSERT(!empty());
+ end = mMemory[--mSize];
+ start = mMemory[--mSize];
+ }
+
+ PX_INLINE bool empty()
+ {
+ return mSize == 0;
+ }
+};
+} // namespace internal
+
+} // namespace shdfnd
+} // namespace physx
+
+#endif // #ifndef PSFOUNDATION_PSSORTINTERNALS_H
diff --git a/NvCloth/src/ps/PsUtilities.h b/NvCloth/src/ps/PsUtilities.h
new file mode 100644
index 0000000..4e59ddc
--- /dev/null
+++ b/NvCloth/src/ps/PsUtilities.h
@@ -0,0 +1,169 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef PSFOUNDATION_PSUTILITIES_H
+#define PSFOUNDATION_PSUTILITIES_H
+
+#include "foundation/PxVec3.h"
+#include "NvCloth/ps/Ps.h"
+#include "NvCloth/ps/PsIntrinsics.h"
+#include "NvCloth/ps/PsBasicTemplates.h"
+#include "NvCloth/Callbacks.h"
+
+namespace physx
+{
+namespace shdfnd
+{
+PX_INLINE char littleEndian()
+{
+ int i = 1;
+ return *(reinterpret_cast<char*>(&i));
+}
+
+// PT: checked casts
+PX_CUDA_CALLABLE PX_FORCE_INLINE PxU32 to32(PxU64 value)
+{
+ NV_CLOTH_ASSERT(value <= 0xffffffff);
+ return PxU32(value);
+}
+PX_CUDA_CALLABLE PX_FORCE_INLINE PxU16 to16(PxU32 value)
+{
+ NV_CLOTH_ASSERT(value <= 0xffff);
+ return PxU16(value);
+}
+PX_CUDA_CALLABLE PX_FORCE_INLINE PxU8 to8(PxU16 value)
+{
+ NV_CLOTH_ASSERT(value <= 0xff);
+ return PxU8(value);
+}
+PX_CUDA_CALLABLE PX_FORCE_INLINE PxU8 to8(PxU32 value)
+{
+ NV_CLOTH_ASSERT(value <= 0xff);
+ return PxU8(value);
+}
+PX_CUDA_CALLABLE PX_FORCE_INLINE PxU8 to8(PxI32 value)
+{
+ NV_CLOTH_ASSERT(value <= 0xff);
+ NV_CLOTH_ASSERT(value >= 0);
+ return PxU8(value);
+}
+PX_CUDA_CALLABLE PX_FORCE_INLINE PxI8 toI8(PxU32 value)
+{
+ NV_CLOTH_ASSERT(value <= 0x7f);
+ return PxI8(value);
+}
+
+/*!
+Get number of elements in array
+*/
+template <typename T, size_t N>
+char (&ArraySizeHelper(T (&array)[N]))[N];
+#define PX_ARRAY_SIZE(_array) (sizeof(physx::shdfnd::ArraySizeHelper(_array)))
+
+/*!
+Sort two elements using operator<
+
+On return x will be the smaller of the two
+*/
+template <class T>
+PX_CUDA_CALLABLE PX_FORCE_INLINE void order(T& x, T& y)
+{
+ if(y < x)
+ swap(x, y);
+}
+
+// most architectures can do predication on real comparisons, and on VMX, it matters
+
+PX_CUDA_CALLABLE PX_FORCE_INLINE void order(PxReal& x, PxReal& y)
+{
+ PxReal newX = PxMin(x, y);
+ PxReal newY = PxMax(x, y);
+ x = newX;
+ y = newY;
+}
+
+/*!
+Sort two elements using operator< and also keep order
+of any extra data
+*/
+template <class T, class E1>
+PX_CUDA_CALLABLE PX_FORCE_INLINE void order(T& x, T& y, E1& xe1, E1& ye1)
+{
+ if(y < x)
+ {
+ swap(x, y);
+ swap(xe1, ye1);
+ }
+}
+
+#if PX_GCC_FAMILY && !PX_EMSCRIPTEN && !PX_LINUX
+__attribute__((noreturn))
+#endif
+ PX_INLINE void debugBreak()
+{
+#if PX_WINDOWS || PX_XBOXONE
+ __debugbreak();
+#elif PX_ANDROID
+ raise(SIGTRAP); // works better than __builtin_trap. Proper call stack and can be continued.
+#elif PX_LINUX
+ #if (PX_X64 || PX_X64)
+ asm("int $3");
+ #else
+ raise(SIGTRAP);
+ #endif
+#elif PX_GCC_FAMILY
+ __builtin_trap();
+#else
+ NV_CLOTH_ASSERT(false);
+#endif
+}
+
+bool checkValid(const float&);
+bool checkValid(const PxVec3&);
+bool checkValid(const PxQuat&);
+bool checkValid(const PxMat33&);
+bool checkValid(const PxTransform&);
+bool checkValid(const char*);
+
+// equivalent to std::max_element
+template <typename T>
+inline const T* maxElement(const T* first, const T* last)
+{
+ const T* m = first;
+ for(const T* it = first + 1; it < last; ++it)
+ if(*m < *it)
+ m = it;
+
+ return m;
+}
+
+} // namespace shdfnd
+} // namespace physx
+
+#endif
diff --git a/NvCloth/src/ps/PxIntrinsics.h b/NvCloth/src/ps/PxIntrinsics.h
new file mode 100644
index 0000000..b4aff28
--- /dev/null
+++ b/NvCloth/src/ps/PxIntrinsics.h
@@ -0,0 +1,47 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef PXFOUNDATION_PXINTRINSICS_H
+#define PXFOUNDATION_PXINTRINSICS_H
+
+#include "foundation/PxPreprocessor.h"
+
+#if PX_WINDOWS_FAMILY
+#include "foundation/windows/PxWindowsIntrinsics.h"
+#elif(PX_LINUX || PX_ANDROID || PX_APPLE_FAMILY || PX_PS4)
+#include "foundation/unix/PxUnixIntrinsics.h"
+#elif PX_XBOXONE
+#include "foundation/XboxOne/PxXboxOneIntrinsics.h"
+#elif PX_SWITCH
+#include "foundation/switch/PxSwitchIntrinsics.h"
+#else
+#error "Platform not supported!"
+#endif
+
+#endif // #ifndef PXFOUNDATION_PXINTRINSICS_H
diff --git a/NvCloth/src/ps/android/cpu-features.c b/NvCloth/src/ps/android/cpu-features.c
new file mode 100644
index 0000000..4754c46
--- /dev/null
+++ b/NvCloth/src/ps/android/cpu-features.c
@@ -0,0 +1,1082 @@
+/*
+ * Copyright (C) 2010 The Android Open Source Project
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/* ChangeLog for this library:
+ *
+ * NDK r8d: Add android_setCpu().
+ *
+ * NDK r8c: Add new ARM CPU features: VFPv2, VFP_D32, VFP_FP16,
+ * VFP_FMA, NEON_FMA, IDIV_ARM, IDIV_THUMB2 and iWMMXt.
+ *
+ * Rewrite the code to parse /proc/self/auxv instead of
+ * the "Features" field in /proc/cpuinfo.
+ *
+ * Dynamically allocate the buffer that hold the content
+ * of /proc/cpuinfo to deal with newer hardware.
+ *
+ * NDK r7c: Fix CPU count computation. The old method only reported the
+ * number of _active_ CPUs when the library was initialized,
+ * which could be less than the real total.
+ *
+ * NDK r5: Handle buggy kernels which report a CPU Architecture number of 7
+ * for an ARMv6 CPU (see below).
+ *
+ * Handle kernels that only report 'neon', and not 'vfpv3'
+ *            (VFPv3 is mandated by the ARM architecture if Neon is implemented)
+ *
+ * Handle kernels that only report 'vfpv3d16', and not 'vfpv3'
+ *
+ * Fix x86 compilation. Report ANDROID_CPU_FAMILY_X86 in
+ * android_getCpuFamily().
+ *
+ * NDK r4: Initial release
+ */
+
+#if defined(__le32__)
+
+// When users enter this, we should only provide interface and
+// libportable will give the implementations.
+
+#else // !__le32__
+
+#include <sys/system_properties.h>
+#include <pthread.h>
+#include "cpu-features.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <fcntl.h>
+#include <errno.h>
+
+static pthread_once_t g_once;
+static int g_inited;
+static AndroidCpuFamily g_cpuFamily;
+static uint64_t g_cpuFeatures;
+static int g_cpuCount;
+
+#ifdef __arm__
+static uint32_t g_cpuIdArm;
+#endif
+
+static const int android_cpufeatures_debug = 0;
+
+#ifdef __arm__
+# define DEFAULT_CPU_FAMILY ANDROID_CPU_FAMILY_ARM
+#elif defined __i386__
+# define DEFAULT_CPU_FAMILY ANDROID_CPU_FAMILY_X86
+#else
+# define DEFAULT_CPU_FAMILY ANDROID_CPU_FAMILY_UNKNOWN
+#endif
+
+#define D(...) \
+ do { \
+ if (android_cpufeatures_debug) { \
+ printf(__VA_ARGS__); fflush(stdout); \
+ } \
+ } while (0)
+
+#ifdef __i386__
+static __inline__ void x86_cpuid(int func, int values[4])
+{
+ int a, b, c, d;
+ /* We need to preserve ebx since we're compiling PIC code */
+ /* this means we can't use "=b" for the second output register */
+ __asm__ __volatile__ ( \
+ "push %%ebx\n"
+ "cpuid\n" \
+ "mov %%ebx, %1\n"
+ "pop %%ebx\n"
+ : "=a" (a), "=r" (b), "=c" (c), "=d" (d) \
+ : "a" (func) \
+ );
+ values[0] = a;
+ values[1] = b;
+ values[2] = c;
+ values[3] = d;
+}
+#endif
+
+/* Get the size of a file by reading it until the end. This is needed
+ * because files under /proc do not always return a valid size when
+ * using fseek(0, SEEK_END) + ftell(). Nor can they be mmap()-ed.
+ */
+static int
+get_file_size(const char* pathname)
+{
+ int fd, ret, result = 0;
+ char buffer[256];
+
+ fd = open(pathname, O_RDONLY);
+ if (fd < 0) {
+ D("Can't open %s: %s\n", pathname, strerror(errno));
+ return -1;
+ }
+
+ for (;;) {
+ int ret = read(fd, buffer, sizeof buffer);
+ if (ret < 0) {
+ if (errno == EINTR)
+ continue;
+ D("Error while reading %s: %s\n", pathname, strerror(errno));
+ break;
+ }
+ if (ret == 0)
+ break;
+
+ result += ret;
+ }
+ close(fd);
+ return result;
+}
+
+/* Read the content of /proc/cpuinfo into a user-provided buffer.
+ * Return the length of the data, or -1 on error. Does *not*
+ * zero-terminate the content. Will not read more
+ * than 'buffsize' bytes.
+ */
+static int
+read_file(const char* pathname, char* buffer, size_t buffsize)
+{
+ int fd, count;
+
+ fd = open(pathname, O_RDONLY);
+ if (fd < 0) {
+ D("Could not open %s: %s\n", pathname, strerror(errno));
+ return -1;
+ }
+ count = 0;
+ while (count < (int)buffsize) {
+ int ret = read(fd, buffer + count, buffsize - count);
+ if (ret < 0) {
+ if (errno == EINTR)
+ continue;
+ D("Error while reading from %s: %s\n", pathname, strerror(errno));
+ if (count == 0)
+ count = -1;
+ break;
+ }
+ if (ret == 0)
+ break;
+ count += ret;
+ }
+ close(fd);
+ return count;
+}
+
+/* Extract the content of the first occurrence of a given field in
+ * the content of /proc/cpuinfo and return it as a heap-allocated
+ * string that must be freed by the caller.
+ *
+ * Return NULL if not found
+ */
+static char*
+extract_cpuinfo_field(const char* buffer, int buflen, const char* field)
+{
+ int fieldlen = strlen(field);
+ const char* bufend = buffer + buflen;
+ char* result = NULL;
+ int len, ignore;
+ const char *p, *q;
+
+    /* Look for the first field occurrence, and ensure it starts the line. */
+ p = buffer;
+ for (;;) {
+ p = memmem(p, bufend-p, field, fieldlen);
+ if (p == NULL)
+ goto EXIT;
+
+ if (p == buffer || p[-1] == '\n')
+ break;
+
+ p += fieldlen;
+ }
+
+ /* Skip to the first column followed by a space */
+ p += fieldlen;
+ p = memchr(p, ':', bufend-p);
+ if (p == NULL || p[1] != ' ')
+ goto EXIT;
+
+ /* Find the end of the line */
+ p += 2;
+ q = memchr(p, '\n', bufend-p);
+ if (q == NULL)
+ q = bufend;
+
+ /* Copy the line into a heap-allocated buffer */
+ len = q-p;
+ result = malloc(len+1);
+ if (result == NULL)
+ goto EXIT;
+
+ memcpy(result, p, len);
+ result[len] = '\0';
+
+EXIT:
+ return result;
+}
+
+/* Checks that a space-separated list of items contains one given 'item'.
+ * Returns 1 if found, 0 otherwise.
+ */
+static int
+has_list_item(const char* list, const char* item)
+{
+ const char* p = list;
+ int itemlen = strlen(item);
+
+ if (list == NULL)
+ return 0;
+
+ while (*p) {
+ const char* q;
+
+ /* skip spaces */
+ while (*p == ' ' || *p == '\t')
+ p++;
+
+ /* find end of current list item */
+ q = p;
+ while (*q && *q != ' ' && *q != '\t')
+ q++;
+
+ if (itemlen == q-p && !memcmp(p, item, itemlen))
+ return 1;
+
+ /* skip to next item */
+ p = q;
+ }
+ return 0;
+}
+
+/* Parse a number starting from 'input', but not going further
+ * than 'limit'. Return the value into '*result'.
+ *
+ * NOTE: Does not skip over leading spaces, or deal with sign characters.
+ * NOTE: Ignores overflows.
+ *
+ * The function returns NULL in case of error (bad format), or the new
+ * position after the decimal number in case of success (which will always
+ * be <= 'limit').
+ */
+static const char*
+parse_number(const char* input, const char* limit, int base, int* result)
+{
+ const char* p = input;
+ int val = 0;
+ while (p < limit) {
+ int d = (*p - '0');
+ if ((unsigned)d >= 10U) {
+ d = (*p - 'a');
+ if ((unsigned)d >= 6U)
+ d = (*p - 'A');
+ if ((unsigned)d >= 6U)
+ break;
+ d += 10;
+ }
+ if (d >= base)
+ break;
+ val = val*base + d;
+ p++;
+ }
+ if (p == input)
+ return NULL;
+
+ *result = val;
+ return p;
+}
+
+static const char*
+parse_decimal(const char* input, const char* limit, int* result)
+{
+ return parse_number(input, limit, 10, result);
+}
+
+static const char*
+parse_hexadecimal(const char* input, const char* limit, int* result)
+{
+ return parse_number(input, limit, 16, result);
+}
+
+/* This small data type is used to represent a CPU list / mask, as read
+ * from sysfs on Linux. See http://www.kernel.org/doc/Documentation/cputopology.txt
+ *
+ * For now, we don't expect more than 32 cores on mobile devices, so keep
+ * everything simple.
+ */
+typedef struct {
+ uint32_t mask;
+} CpuList;
+
+static __inline__ void
+cpulist_init(CpuList* list) {
+ list->mask = 0;
+}
+
+static __inline__ void
+cpulist_and(CpuList* list1, CpuList* list2) {
+ list1->mask &= list2->mask;
+}
+
+static __inline__ void
+cpulist_set(CpuList* list, int index) {
+ if ((unsigned)index < 32) {
+ list->mask |= (uint32_t)(1U << index);
+ }
+}
+
+static __inline__ int
+cpulist_count(CpuList* list) {
+ return __builtin_popcount(list->mask);
+}
+
+/* Parse a textual list of cpus and store the result inside a CpuList object.
+ * Input format is the following:
+ * - comma-separated list of items (no spaces)
+ * - each item is either a single decimal number (cpu index), or a range made
+ * of two numbers separated by a single dash (-). Ranges are inclusive.
+ *
+ * Examples: 0
+ * 2,4-127,128-143
+ * 0-1
+ */
+static void
+cpulist_parse(CpuList* list, const char* line, int line_len)
+{
+ const char* p = line;
+ const char* end = p + line_len;
+ const char* q;
+
+ /* NOTE: the input line coming from sysfs typically contains a
+ * trailing newline, so take care of it in the code below
+ */
+ while (p < end && *p != '\n')
+ {
+ int val, start_value, end_value;
+
+ /* Find the end of current item, and put it into 'q' */
+ q = memchr(p, ',', end-p);
+ if (q == NULL) {
+ q = end;
+ }
+
+ /* Get first value */
+ p = parse_decimal(p, q, &start_value);
+ if (p == NULL)
+ goto BAD_FORMAT;
+
+ end_value = start_value;
+
+ /* If we're not at the end of the item, expect a dash and
+         * an integer; extract end value.
+ */
+ if (p < q && *p == '-') {
+ p = parse_decimal(p+1, q, &end_value);
+ if (p == NULL)
+ goto BAD_FORMAT;
+ }
+
+ /* Set bits CPU list bits */
+ for (val = start_value; val <= end_value; val++) {
+ cpulist_set(list, val);
+ }
+
+ /* Jump to next item */
+ p = q;
+ if (p < end)
+ p++;
+ }
+
+BAD_FORMAT:
+ ;
+}
+
+/* Read a CPU list from one sysfs file */
+static void
+cpulist_read_from(CpuList* list, const char* filename)
+{
+ char file[64];
+ int filelen;
+
+ cpulist_init(list);
+
+ filelen = read_file(filename, file, sizeof file);
+ if (filelen < 0) {
+ D("Could not read %s: %s\n", filename, strerror(errno));
+ return;
+ }
+
+ cpulist_parse(list, file, filelen);
+}
+
+// See <asm/hwcap.h> kernel header.
+#define HWCAP_VFP (1 << 6)
+#define HWCAP_IWMMXT (1 << 9)
+#define HWCAP_NEON (1 << 12)
+#define HWCAP_VFPv3 (1 << 13)
+#define HWCAP_VFPv3D16 (1 << 14)
+#define HWCAP_VFPv4 (1 << 16)
+#define HWCAP_IDIVA (1 << 17)
+#define HWCAP_IDIVT (1 << 18)
+
+#define AT_HWCAP 16
+
+#if defined(__arm__)
+/* Compute the ELF HWCAP flags.
+ */
+static uint32_t
+get_elf_hwcap(const char* cpuinfo, int cpuinfo_len)
+{
+ /* IMPORTANT:
+ * Accessing /proc/self/auxv doesn't work anymore on all
+ * platform versions. More specifically, when running inside
+ * a regular application process, most of /proc/self/ will be
+ * non-readable, including /proc/self/auxv. This doesn't
+ * happen however if the application is debuggable, or when
+ * running under the "shell" UID, which is why this was not
+ * detected appropriately.
+ */
+#if 0
+ uint32_t result = 0;
+ const char filepath[] = "/proc/self/auxv";
+ int fd = open(filepath, O_RDONLY);
+ if (fd < 0) {
+ D("Could not open %s: %s\n", filepath, strerror(errno));
+ return 0;
+ }
+
+ struct { uint32_t tag; uint32_t value; } entry;
+
+ for (;;) {
+ int ret = read(fd, (char*)&entry, sizeof entry);
+ if (ret < 0) {
+ if (errno == EINTR)
+ continue;
+ D("Error while reading %s: %s\n", filepath, strerror(errno));
+ break;
+ }
+ // Detect end of list.
+ if (ret == 0 || (entry.tag == 0 && entry.value == 0))
+ break;
+ if (entry.tag == AT_HWCAP) {
+ result = entry.value;
+ break;
+ }
+ }
+ close(fd);
+ return result;
+#else
+ // Recreate ELF hwcaps by parsing /proc/cpuinfo Features tag.
+ uint32_t hwcaps = 0;
+
+ char* cpuFeatures = extract_cpuinfo_field(cpuinfo, cpuinfo_len, "Features");
+
+ if (cpuFeatures != NULL) {
+ D("Found cpuFeatures = '%s'\n", cpuFeatures);
+
+ if (has_list_item(cpuFeatures, "vfp"))
+ hwcaps |= HWCAP_VFP;
+ if (has_list_item(cpuFeatures, "vfpv3"))
+ hwcaps |= HWCAP_VFPv3;
+ if (has_list_item(cpuFeatures, "vfpv3d16"))
+ hwcaps |= HWCAP_VFPv3D16;
+ if (has_list_item(cpuFeatures, "vfpv4"))
+ hwcaps |= HWCAP_VFPv4;
+ if (has_list_item(cpuFeatures, "neon"))
+ hwcaps |= HWCAP_NEON;
+ if (has_list_item(cpuFeatures, "idiva"))
+ hwcaps |= HWCAP_IDIVA;
+ if (has_list_item(cpuFeatures, "idivt"))
+ hwcaps |= HWCAP_IDIVT;
+ if (has_list_item(cpuFeatures, "idiv"))
+ hwcaps |= HWCAP_IDIVA | HWCAP_IDIVT;
+ if (has_list_item(cpuFeatures, "iwmmxt"))
+ hwcaps |= HWCAP_IWMMXT;
+
+ free(cpuFeatures);
+ }
+ return hwcaps;
+#endif
+}
+#endif /* __arm__ */
+
+/* Return the number of cpus present on a given device.
+ *
+ * To handle all weird kernel configurations, we need to compute the
+ * intersection of the 'present' and 'possible' CPU lists and count
+ * the result.
+ */
+static int
+get_cpu_count(void)
+{
+ CpuList cpus_present[1];
+ CpuList cpus_possible[1];
+
+ cpulist_read_from(cpus_present, "/sys/devices/system/cpu/present");
+ cpulist_read_from(cpus_possible, "/sys/devices/system/cpu/possible");
+
+ /* Compute the intersection of both sets to get the actual number of
+ * CPU cores that can be used on this device by the kernel.
+ */
+ cpulist_and(cpus_present, cpus_possible);
+
+ return cpulist_count(cpus_present);
+}
+
+static void
+android_cpuInitFamily(void)
+{
+#if defined(__arm__)
+ g_cpuFamily = ANDROID_CPU_FAMILY_ARM;
+#elif defined(__i386__)
+ g_cpuFamily = ANDROID_CPU_FAMILY_X86;
+#elif defined(__mips__)
+ g_cpuFamily = ANDROID_CPU_FAMILY_MIPS;
+#else
+ g_cpuFamily = ANDROID_CPU_FAMILY_UNKNOWN;
+#endif
+}
+
+static void
+android_cpuInit(void)
+{
+ char* cpuinfo = NULL;
+ int cpuinfo_len;
+
+ android_cpuInitFamily();
+
+ g_cpuFeatures = 0;
+ g_cpuCount = 1;
+ g_inited = 1;
+
+ cpuinfo_len = get_file_size("/proc/cpuinfo");
+ if (cpuinfo_len < 0) {
+ D("cpuinfo_len cannot be computed!");
+ return;
+ }
+ cpuinfo = malloc(cpuinfo_len);
+ if (cpuinfo == NULL) {
+ D("cpuinfo buffer could not be allocated");
+ return;
+ }
+ cpuinfo_len = read_file("/proc/cpuinfo", cpuinfo, cpuinfo_len);
+ D("cpuinfo_len is (%d):\n%.*s\n", cpuinfo_len,
+ cpuinfo_len >= 0 ? cpuinfo_len : 0, cpuinfo);
+
+ if (cpuinfo_len < 0) /* should not happen */ {
+ free(cpuinfo);
+ return;
+ }
+
+ /* Count the CPU cores, the value may be 0 for single-core CPUs */
+ g_cpuCount = get_cpu_count();
+ if (g_cpuCount == 0) {
+ g_cpuCount = 1;
+ }
+
+ D("found cpuCount = %d\n", g_cpuCount);
+
+#ifdef __arm__
+ {
+ char* features = NULL;
+ char* architecture = NULL;
+
+ /* Extract architecture from the "CPU Architecture" field.
+         * The list is well-known, unlike the output of
+ * the 'Processor' field which can vary greatly.
+ *
+ * See the definition of the 'proc_arch' array in
+ * $KERNEL/arch/arm/kernel/setup.c and the 'c_show' function in
+ * same file.
+ */
+ char* cpuArch = extract_cpuinfo_field(cpuinfo, cpuinfo_len, "CPU architecture");
+
+ if (cpuArch != NULL) {
+ char* end;
+ long archNumber;
+ int hasARMv7 = 0;
+
+ D("found cpuArch = '%s'\n", cpuArch);
+
+ /* read the initial decimal number, ignore the rest */
+ archNumber = strtol(cpuArch, &end, 10);
+
+ /* Here we assume that ARMv8 will be upwards compatible with v7
+ * in the future. Unfortunately, there is no 'Features' field to
+ * indicate that Thumb-2 is supported.
+ */
+ if (end > cpuArch && archNumber >= 7) {
+ hasARMv7 = 1;
+ }
+
+ /* Unfortunately, it seems that certain ARMv6-based CPUs
+ * report an incorrect architecture number of 7!
+ *
+ * See http://code.google.com/p/android/issues/detail?id=10812
+ *
+ * We try to correct this by looking at the 'elf_format'
+ * field reported by the 'Processor' field, which is of the
+ * form of "(v7l)" for an ARMv7-based CPU, and "(v6l)" for
+ * an ARMv6-one.
+ */
+ if (hasARMv7) {
+ char* cpuProc = extract_cpuinfo_field(cpuinfo, cpuinfo_len,
+ "Processor");
+ if (cpuProc != NULL) {
+ D("found cpuProc = '%s'\n", cpuProc);
+ if (has_list_item(cpuProc, "(v6l)")) {
+ D("CPU processor and architecture mismatch!!\n");
+ hasARMv7 = 0;
+ }
+ free(cpuProc);
+ }
+ }
+
+ if (hasARMv7) {
+ g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_ARMv7;
+ }
+
+ /* The LDREX / STREX instructions are available from ARMv6 */
+ if (archNumber >= 6) {
+ g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_LDREX_STREX;
+ }
+
+ free(cpuArch);
+ }
+
+ /* Extract the list of CPU features from ELF hwcaps */
+ uint32_t hwcaps = get_elf_hwcap(cpuinfo, cpuinfo_len);
+
+ if (hwcaps != 0) {
+ int has_vfp = (hwcaps & HWCAP_VFP);
+ int has_vfpv3 = (hwcaps & HWCAP_VFPv3);
+ int has_vfpv3d16 = (hwcaps & HWCAP_VFPv3D16);
+ int has_vfpv4 = (hwcaps & HWCAP_VFPv4);
+ int has_neon = (hwcaps & HWCAP_NEON);
+ int has_idiva = (hwcaps & HWCAP_IDIVA);
+ int has_idivt = (hwcaps & HWCAP_IDIVT);
+ int has_iwmmxt = (hwcaps & HWCAP_IWMMXT);
+
+ // The kernel does a poor job at ensuring consistency when
+ // describing CPU features. So lots of guessing is needed.
+
+ // 'vfpv4' implies VFPv3|VFP_FMA|FP16
+ if (has_vfpv4)
+ g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_VFPv3 |
+ ANDROID_CPU_ARM_FEATURE_VFP_FP16 |
+ ANDROID_CPU_ARM_FEATURE_VFP_FMA;
+
+ // 'vfpv3' or 'vfpv3d16' imply VFPv3. Note that unlike GCC,
+ // a value of 'vfpv3' doesn't necessarily mean that the D32
+ // feature is present, so be conservative. All CPUs in the
+ // field that support D32 also support NEON, so this should
+ // not be a problem in practice.
+ if (has_vfpv3 || has_vfpv3d16)
+ g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_VFPv3;
+
+ // 'vfp' is super ambiguous. Depending on the kernel, it can
+ // either mean VFPv2 or VFPv3. Make it depend on ARMv7.
+ if (has_vfp) {
+ if (g_cpuFeatures & ANDROID_CPU_ARM_FEATURE_ARMv7)
+ g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_VFPv3;
+ else
+ g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_VFPv2;
+ }
+
+ // Neon implies VFPv3|D32, and if vfpv4 is detected, NEON_FMA
+ if (has_neon) {
+ g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_VFPv3 |
+ ANDROID_CPU_ARM_FEATURE_NEON |
+ ANDROID_CPU_ARM_FEATURE_VFP_D32;
+ if (has_vfpv4)
+ g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_NEON_FMA;
+ }
+
+ // VFPv3 implies VFPv2 and ARMv7
+ if (g_cpuFeatures & ANDROID_CPU_ARM_FEATURE_VFPv3)
+ g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_VFPv2 |
+ ANDROID_CPU_ARM_FEATURE_ARMv7;
+
+ if (has_idiva)
+ g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_IDIV_ARM;
+ if (has_idivt)
+ g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_IDIV_THUMB2;
+
+ if (has_iwmmxt)
+ g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_iWMMXt;
+ }
+
+ /* Extract the cpuid value from various fields */
+ // The CPUID value is broken up in several entries in /proc/cpuinfo.
+ // This table is used to rebuild it from the entries.
+ static const struct CpuIdEntry {
+ const char* field;
+ char format;
+ char bit_lshift;
+ char bit_length;
+ } cpu_id_entries[] = {
+ { "CPU implementer", 'x', 24, 8 },
+ { "CPU variant", 'x', 20, 4 },
+ { "CPU part", 'x', 4, 12 },
+ { "CPU revision", 'd', 0, 4 },
+ };
+ size_t i;
+ D("Parsing /proc/cpuinfo to recover CPUID\n");
+ for (i = 0;
+ i < sizeof(cpu_id_entries)/sizeof(cpu_id_entries[0]);
+ ++i) {
+ const struct CpuIdEntry* entry = &cpu_id_entries[i];
+ char* value = extract_cpuinfo_field(cpuinfo,
+ cpuinfo_len,
+ entry->field);
+ if (value == NULL)
+ continue;
+
+ D("field=%s value='%s'\n", entry->field, value);
+ char* value_end = value + strlen(value);
+ int val = 0;
+ const char* start = value;
+ const char* p;
+ if (value[0] == '0' && (value[1] == 'x' || value[1] == 'X')) {
+ start += 2;
+ p = parse_hexadecimal(start, value_end, &val);
+ } else if (entry->format == 'x')
+ p = parse_hexadecimal(value, value_end, &val);
+ else
+ p = parse_decimal(value, value_end, &val);
+
+ if (p > (const char*)start) {
+ val &= ((1 << entry->bit_length)-1);
+ val <<= entry->bit_lshift;
+ g_cpuIdArm |= (uint32_t) val;
+ }
+
+ free(value);
+ }
+
+ // Handle kernel configuration bugs that prevent the correct
+ // reporting of CPU features.
+ static const struct CpuFix {
+ uint32_t cpuid;
+ uint64_t or_flags;
+ } cpu_fixes[] = {
+ /* The Nexus 4 (Qualcomm Krait) kernel configuration
+ * forgets to report IDIV support. */
+ { 0x510006f2, ANDROID_CPU_ARM_FEATURE_IDIV_ARM |
+ ANDROID_CPU_ARM_FEATURE_IDIV_THUMB2 },
+ { 0x510006f3, ANDROID_CPU_ARM_FEATURE_IDIV_ARM |
+ ANDROID_CPU_ARM_FEATURE_IDIV_THUMB2 },
+ };
+ size_t n;
+ for (n = 0; n < sizeof(cpu_fixes)/sizeof(cpu_fixes[0]); ++n) {
+ const struct CpuFix* entry = &cpu_fixes[n];
+
+ if (g_cpuIdArm == entry->cpuid)
+ g_cpuFeatures |= entry->or_flags;
+ }
+
+ }
+#endif /* __arm__ */
+
+#ifdef __i386__
+ int regs[4];
+
+/* According to http://en.wikipedia.org/wiki/CPUID */
+#define VENDOR_INTEL_b 0x756e6547
+#define VENDOR_INTEL_c 0x6c65746e
+#define VENDOR_INTEL_d 0x49656e69
+
+ x86_cpuid(0, regs);
+ int vendorIsIntel = (regs[1] == VENDOR_INTEL_b &&
+ regs[2] == VENDOR_INTEL_c &&
+ regs[3] == VENDOR_INTEL_d);
+
+ x86_cpuid(1, regs);
+ if ((regs[2] & (1 << 9)) != 0) {
+ g_cpuFeatures |= ANDROID_CPU_X86_FEATURE_SSSE3;
+ }
+ if ((regs[2] & (1 << 23)) != 0) {
+ g_cpuFeatures |= ANDROID_CPU_X86_FEATURE_POPCNT;
+ }
+ if (vendorIsIntel && (regs[2] & (1 << 22)) != 0) {
+ g_cpuFeatures |= ANDROID_CPU_X86_FEATURE_MOVBE;
+ }
+#endif
+
+ free(cpuinfo);
+}
+
+
+AndroidCpuFamily
+android_getCpuFamily(void)
+{
+ pthread_once(&g_once, android_cpuInit);
+ return g_cpuFamily;
+}
+
+
+uint64_t
+android_getCpuFeatures(void)
+{
+ pthread_once(&g_once, android_cpuInit);
+ return g_cpuFeatures;
+}
+
+
+int
+android_getCpuCount(void)
+{
+ pthread_once(&g_once, android_cpuInit);
+ return g_cpuCount;
+}
+
+static void
+android_cpuInitDummy(void)
+{
+ g_inited = 1;
+}
+
+int
+android_setCpu(int cpu_count, uint64_t cpu_features)
+{
+ /* Fail if the library was already initialized. */
+ if (g_inited)
+ return 0;
+
+ android_cpuInitFamily();
+ g_cpuCount = (cpu_count <= 0 ? 1 : cpu_count);
+ g_cpuFeatures = cpu_features;
+ pthread_once(&g_once, android_cpuInitDummy);
+
+ return 1;
+}
+
+#ifdef __arm__
+uint32_t
+android_getCpuIdArm(void)
+{
+ pthread_once(&g_once, android_cpuInit);
+ return g_cpuIdArm;
+}
+
+int
+android_setCpuArm(int cpu_count, uint64_t cpu_features, uint32_t cpu_id)
+{
+ if (!android_setCpu(cpu_count, cpu_features))
+ return 0;
+
+ g_cpuIdArm = cpu_id;
+ return 1;
+}
+#endif /* __arm__ */
+
+/*
+ * Technical note: Making sense of ARM's FPU architecture versions.
+ *
+ * FPA was ARM's first attempt at an FPU architecture. There is no Android
+ * device that actually uses it since this technology was already obsolete
+ * when the project started. If you see references to FPA instructions
+ * somewhere, you can be sure that this doesn't apply to Android at all.
+ *
+ * FPA was followed by "VFP", soon renamed "VFPv1" due to the emergence of
+ * new versions / additions to it. ARM considers this obsolete right now,
+ * and no known Android device implements it either.
+ *
+ * VFPv2 added a few instructions to VFPv1, and is an *optional* extension
+ * supported by some ARMv5TE, ARMv6 and ARMv6T2 CPUs. Note that a device
+ * supporting the 'armeabi' ABI doesn't necessarily support these.
+ *
+ * VFPv3-D16 adds a few instructions on top of VFPv2 and is typically used
+ * on ARMv7-A CPUs which implement a FPU. Note that it is also mandated
+ * by the Android 'armeabi-v7a' ABI. The -D16 suffix in its name means
+ * that it provides 16 double-precision FPU registers (d0-d15) and 32
+ * single-precision ones (s0-s31) which happen to be mapped to the same
+ * register banks.
+ *
+ * VFPv3-D32 is the name of an extension to VFPv3-D16 that provides 16
+ * additional double precision registers (d16-d31). Note that there are
+ * still only 32 single precision registers.
+ *
+ * VFPv3xD is a *subset* of VFPv3-D16 that only provides single-precision
+ * registers. It is only used on ARMv7-M (i.e. on micro-controllers) which
+ * are not supported by Android. Note that it is not compatible with VFPv2.
+ *
+ * NOTE: The term 'VFPv3' usually designate either VFPv3-D16 or VFPv3-D32
+ * depending on context. For example GCC uses it for VFPv3-D32, but
+ * the Linux kernel code uses it for VFPv3-D16 (especially in
+ * /proc/cpuinfo). Always try to use the full designation when
+ * possible.
+ *
+ * NEON, a.k.a. "ARM Advanced SIMD" is an extension that provides
+ * instructions to perform parallel computations on vectors of 8, 16,
+ * 32, 64 and 128 bit quantities. NEON requires VFPv3-D32 since all
+ * NEON registers are also mapped to the same register banks.
+ *
+ * VFPv4-D16, adds a few instructions on top of VFPv3-D16 in order to
+ * perform fused multiply-accumulate on VFP registers, as well as
+ * half-precision (16-bit) conversion operations.
+ *
+ * VFPv4-D32 is VFPv4-D16 with 32, instead of 16, FPU double precision
+ * registers.
+ *
+ * VFPv4-NEON is VFPv4-D32 with NEON instructions. It also adds fused
+ * multiply-accumulate instructions that work on the NEON registers.
+ *
+ * NOTE: Similarly, "VFPv4" might either reference VFPv4-D16 or VFPv4-D32
+ * depending on context.
+ *
+ * The following information was determined by scanning the binutils-2.22
+ * sources:
+ *
+ * Basic VFP instruction subsets:
+ *
+ * #define FPU_VFP_EXT_V1xD 0x08000000 // Base VFP instruction set.
+ * #define FPU_VFP_EXT_V1 0x04000000 // Double-precision insns.
+ * #define FPU_VFP_EXT_V2 0x02000000 // ARM10E VFPr1.
+ * #define FPU_VFP_EXT_V3xD 0x01000000 // VFPv3 single-precision.
+ * #define FPU_VFP_EXT_V3 0x00800000 // VFPv3 double-precision.
+ * #define FPU_NEON_EXT_V1 0x00400000 // Neon (SIMD) insns.
+ * #define FPU_VFP_EXT_D32 0x00200000 // Registers D16-D31.
+ * #define FPU_VFP_EXT_FP16 0x00100000 // Half-precision extensions.
+ * #define FPU_NEON_EXT_FMA 0x00080000 // Neon fused multiply-add
+ * #define FPU_VFP_EXT_FMA 0x00040000 // VFP fused multiply-add
+ *
+ * FPU types (excluding NEON)
+ *
+ * FPU_VFP_V1xD (EXT_V1xD)
+ * |
+ * +--------------------------+
+ * | |
+ * FPU_VFP_V1 (+EXT_V1) FPU_VFP_V3xD (+EXT_V2+EXT_V3xD)
+ * | |
+ * | |
+ * FPU_VFP_V2 (+EXT_V2) FPU_VFP_V4_SP_D16 (+EXT_FP16+EXT_FMA)
+ * |
+ *    FPU_VFP_V3D16 (+EXT_V3xD+EXT_V3)
+ * |
+ * +--------------------------+
+ * | |
+ * FPU_VFP_V3 (+EXT_D32) FPU_VFP_V4D16 (+EXT_FP16+EXT_FMA)
+ * | |
+ * | FPU_VFP_V4 (+EXT_D32)
+ * |
+ * FPU_VFP_HARD (+EXT_FMA+NEON_EXT_FMA)
+ *
+ * VFP architectures:
+ *
+ * ARCH_VFP_V1xD (EXT_V1xD)
+ * |
+ * +------------------+
+ * | |
+ * | ARCH_VFP_V3xD (+EXT_V2+EXT_V3xD)
+ * | |
+ * | ARCH_VFP_V3xD_FP16 (+EXT_FP16)
+ * | |
+ * | ARCH_VFP_V4_SP_D16 (+EXT_FMA)
+ * |
+ * ARCH_VFP_V1 (+EXT_V1)
+ * |
+ * ARCH_VFP_V2 (+EXT_V2)
+ * |
+ * ARCH_VFP_V3D16 (+EXT_V3xD+EXT_V3)
+ * |
+ * +-------------------+
+ * | |
+ * | ARCH_VFP_V3D16_FP16 (+EXT_FP16)
+ * |
+ * +-------------------+
+ * | |
+ * | ARCH_VFP_V4_D16 (+EXT_FP16+EXT_FMA)
+ * | |
+ * | ARCH_VFP_V4 (+EXT_D32)
+ * | |
+ * | ARCH_NEON_VFP_V4 (+EXT_NEON+EXT_NEON_FMA)
+ * |
+ * ARCH_VFP_V3 (+EXT_D32)
+ * |
+ * +-------------------+
+ * | |
+ * | ARCH_VFP_V3_FP16 (+EXT_FP16)
+ * |
+ * ARCH_VFP_V3_PLUS_NEON_V1 (+EXT_NEON)
+ * |
+ * ARCH_NEON_FP16 (+EXT_FP16)
+ *
+ * -mfpu=<name> values and their correspondence with FPU architectures above:
+ *
+ * {"vfp", FPU_ARCH_VFP_V2},
+ * {"vfp9", FPU_ARCH_VFP_V2},
+ *   {"vfp3", FPU_ARCH_VFP_V3}, // For backwards compatibility.
+ * {"vfp10", FPU_ARCH_VFP_V2},
+ * {"vfp10-r0", FPU_ARCH_VFP_V1},
+ * {"vfpxd", FPU_ARCH_VFP_V1xD},
+ * {"vfpv2", FPU_ARCH_VFP_V2},
+ * {"vfpv3", FPU_ARCH_VFP_V3},
+ * {"vfpv3-fp16", FPU_ARCH_VFP_V3_FP16},
+ * {"vfpv3-d16", FPU_ARCH_VFP_V3D16},
+ * {"vfpv3-d16-fp16", FPU_ARCH_VFP_V3D16_FP16},
+ * {"vfpv3xd", FPU_ARCH_VFP_V3xD},
+ * {"vfpv3xd-fp16", FPU_ARCH_VFP_V3xD_FP16},
+ * {"neon", FPU_ARCH_VFP_V3_PLUS_NEON_V1},
+ * {"neon-fp16", FPU_ARCH_NEON_FP16},
+ * {"vfpv4", FPU_ARCH_VFP_V4},
+ * {"vfpv4-d16", FPU_ARCH_VFP_V4D16},
+ * {"fpv4-sp-d16", FPU_ARCH_VFP_V4_SP_D16},
+ * {"neon-vfpv4", FPU_ARCH_NEON_VFP_V4},
+ *
+ *
+ * Simplified diagram that only includes FPUs supported by Android:
+ * Only ARCH_VFP_V3D16 is actually mandated by the armeabi-v7a ABI,
+ * all others are optional and must be probed at runtime.
+ *
+ * ARCH_VFP_V3D16 (EXT_V1xD+EXT_V1+EXT_V2+EXT_V3xD+EXT_V3)
+ * |
+ * +-------------------+
+ * | |
+ * | ARCH_VFP_V3D16_FP16 (+EXT_FP16)
+ * |
+ * +-------------------+
+ * | |
+ * | ARCH_VFP_V4_D16 (+EXT_FP16+EXT_FMA)
+ * | |
+ * | ARCH_VFP_V4 (+EXT_D32)
+ * | |
+ * | ARCH_NEON_VFP_V4 (+EXT_NEON+EXT_NEON_FMA)
+ * |
+ * ARCH_VFP_V3 (+EXT_D32)
+ * |
+ * +-------------------+
+ * | |
+ * | ARCH_VFP_V3_FP16 (+EXT_FP16)
+ * |
+ * ARCH_VFP_V3_PLUS_NEON_V1 (+EXT_NEON)
+ * |
+ * ARCH_NEON_FP16 (+EXT_FP16)
+ *
+ */
+
+#endif // defined(__le32__)
diff --git a/NvCloth/src/ps/android/cpu-features.h b/NvCloth/src/ps/android/cpu-features.h
new file mode 100644
index 0000000..89f7666
--- /dev/null
+++ b/NvCloth/src/ps/android/cpu-features.h
@@ -0,0 +1,208 @@
+/*
+ * Copyright (C) 2010 The Android Open Source Project
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+#ifndef CPU_FEATURES_H
+#define CPU_FEATURES_H
+
+#include <sys/cdefs.h>
+#include <stdint.h>
+
+__BEGIN_DECLS
+
+typedef enum
+{
+ ANDROID_CPU_FAMILY_UNKNOWN = 0,
+ ANDROID_CPU_FAMILY_ARM,
+ ANDROID_CPU_FAMILY_X86,
+ ANDROID_CPU_FAMILY_MIPS,
+ ANDROID_CPU_FAMILY_MAX /* do not remove */
+} AndroidCpuFamily;
+
+/* Return family of the device's CPU */
+extern AndroidCpuFamily android_getCpuFamily(void);
+
+/* The list of feature flags for ARM CPUs that can be recognized by the
+ * library. Value details are:
+ *
+ * VFPv2:
+ * CPU supports the VFPv2 instruction set. Many, but not all, ARMv6 CPUs
+ * support these instructions. VFPv2 is a subset of VFPv3 so this will
+ * be set whenever VFPv3 is set too.
+ *
+ * ARMv7:
+ * CPU supports the ARMv7-A basic instruction set.
+ * This feature is mandated by the 'armeabi-v7a' ABI.
+ *
+ * VFPv3:
+ * CPU supports the VFPv3-D16 instruction set, providing hardware FPU
+ * support for single and double precision floating point registers.
+ * Note that only 16 FPU registers are available by default, unless
+ * the D32 bit is set too. This feature is also mandated by the
+ * 'armeabi-v7a' ABI.
+ *
+ * VFP_D32:
+ * CPU VFP optional extension that provides 32 FPU registers,
+ *     instead of 16. Note that ARM mandates this feature if the 'NEON'
+ *     feature is implemented by the CPU.
+ *
+ * NEON:
+ * CPU FPU supports "ARM Advanced SIMD" instructions, also known as
+ * NEON. Note that this mandates the VFP_D32 feature as well, per the
+ * ARM Architecture specification.
+ *
+ * VFP_FP16:
+ * Half-width floating precision VFP extension. If set, the CPU
+ * supports instructions to perform floating-point operations on
+ * 16-bit registers. This is part of the VFPv4 specification, but
+ * not mandated by any Android ABI.
+ *
+ * VFP_FMA:
+ * Fused multiply-accumulate VFP instructions extension. Also part of
+ * the VFPv4 specification, but not mandated by any Android ABI.
+ *
+ * NEON_FMA:
+ * Fused multiply-accumulate NEON instructions extension. Optional
+ * extension from the VFPv4 specification, but not mandated by any
+ * Android ABI.
+ *
+ * IDIV_ARM:
+ * Integer division available in ARM mode. Only available
+ * on recent CPUs (e.g. Cortex-A15).
+ *
+ * IDIV_THUMB2:
+ * Integer division available in Thumb-2 mode. Only available
+ * on recent CPUs (e.g. Cortex-A15).
+ *
+ * iWMMXt:
+ * Optional extension that adds MMX registers and operations to an
+ * ARM CPU. This is only available on a few XScale-based CPU designs
+ * sold by Marvell. Pretty rare in practice.
+ *
+ * If you want to tell the compiler to generate code that targets one of
+ * the feature set above, you should probably use one of the following
+ * flags (for more details, see technical note at the end of this file):
+ *
+ * -mfpu=vfp
+ * -mfpu=vfpv2
+ * These are equivalent and tell GCC to use VFPv2 instructions for
+ * floating-point operations. Use this if you want your code to
+ * run on *some* ARMv6 devices, and any ARMv7-A device supported
+ * by Android.
+ *
+ * Generated code requires VFPv2 feature.
+ *
+ * -mfpu=vfpv3-d16
+ * Tell GCC to use VFPv3 instructions (using only 16 FPU registers).
+ * This should be generic code that runs on any CPU that supports the
+ * 'armeabi-v7a' Android ABI. Note that no ARMv6 CPU supports this.
+ *
+ * Generated code requires VFPv3 feature.
+ *
+ * -mfpu=vfpv3
+ * Tell GCC to use VFPv3 instructions with 32 FPU registers.
+ * Generated code requires VFPv3|VFP_D32 features.
+ *
+ * -mfpu=neon
+ * Tell GCC to use VFPv3 instructions with 32 FPU registers, and
+ * also support NEON intrinsics (see <arm_neon.h>).
+ * Generated code requires VFPv3|VFP_D32|NEON features.
+ *
+ * -mfpu=vfpv4-d16
+ * Generated code requires VFPv3|VFP_FP16|VFP_FMA features.
+ *
+ * -mfpu=vfpv4
+ * Generated code requires VFPv3|VFP_FP16|VFP_FMA|VFP_D32 features.
+ *
+ * -mfpu=neon-vfpv4
+ * Generated code requires VFPv3|VFP_FP16|VFP_FMA|VFP_D32|NEON|NEON_FMA
+ * features.
+ *
+ * -mcpu=cortex-a7
+ * -mcpu=cortex-a15
+ * Generated code requires VFPv3|VFP_FP16|VFP_FMA|VFP_D32|
+ * NEON|NEON_FMA|IDIV_ARM|IDIV_THUMB2
+ * This flag implies -mfpu=neon-vfpv4.
+ *
+ * -mcpu=iwmmxt
+ *     Allows the use of iWMMXt intrinsics with GCC.
+ */
+enum
+{
+ ANDROID_CPU_ARM_FEATURE_ARMv7 = (1 << 0),
+ ANDROID_CPU_ARM_FEATURE_VFPv3 = (1 << 1),
+ ANDROID_CPU_ARM_FEATURE_NEON = (1 << 2),
+ ANDROID_CPU_ARM_FEATURE_LDREX_STREX = (1 << 3),
+ ANDROID_CPU_ARM_FEATURE_VFPv2 = (1 << 4),
+ ANDROID_CPU_ARM_FEATURE_VFP_D32 = (1 << 5),
+ ANDROID_CPU_ARM_FEATURE_VFP_FP16 = (1 << 6),
+ ANDROID_CPU_ARM_FEATURE_VFP_FMA = (1 << 7),
+ ANDROID_CPU_ARM_FEATURE_NEON_FMA = (1 << 8),
+ ANDROID_CPU_ARM_FEATURE_IDIV_ARM = (1 << 9),
+ ANDROID_CPU_ARM_FEATURE_IDIV_THUMB2 = (1 << 10),
+ ANDROID_CPU_ARM_FEATURE_iWMMXt = (1 << 11),
+};
+
+enum
+{
+ ANDROID_CPU_X86_FEATURE_SSSE3 = (1 << 0),
+ ANDROID_CPU_X86_FEATURE_POPCNT = (1 << 1),
+ ANDROID_CPU_X86_FEATURE_MOVBE = (1 << 2),
+};
+
+extern uint64_t android_getCpuFeatures(void);
+
+/* Return the number of CPU cores detected on this device. */
+extern int android_getCpuCount(void);
+
+/* The following is used to force the CPU count and features
+ * mask in sandboxed processes. Under 4.1 and higher, these processes
+ * cannot access /proc, which is the only way to get information from
+ * the kernel about the current hardware (at least on ARM).
+ *
+ * It _must_ be called only once, and before any android_getCpuXXX
+ * function, any other case will fail.
+ *
+ * This function return 1 on success, and 0 on failure.
+ */
+extern int android_setCpu(int cpu_count, uint64_t cpu_features);
+
+#ifdef __arm__
+/* Retrieve the ARM 32-bit CPUID value from the kernel.
+ * Note that this cannot work on sandboxed processes under 4.1 and
+ * higher, unless you called android_setCpuArm() before.
+ */
+extern uint32_t android_getCpuIdArm(void);
+
+/* An ARM-specific variant of android_setCpu() that also allows you
+ * to set the ARM CPUID field.
+ */
+extern int android_setCpuArm(int cpu_count, uint64_t cpu_features, uint32_t cpu_id);
+#endif
+
+__END_DECLS
+
+#endif /* CPU_FEATURES_H */
diff --git a/NvCloth/src/ps/unix/PsUnixAtomic.cpp b/NvCloth/src/ps/unix/PsUnixAtomic.cpp
new file mode 100644
index 0000000..376f795
--- /dev/null
+++ b/NvCloth/src/ps/unix/PsUnixAtomic.cpp
@@ -0,0 +1,102 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+#include "NvCloth/ps/Ps.h"
+#include "NvCloth/ps/PsAtomic.h"
+
+#if ! PX_EMSCRIPTEN
+#define PAUSE() asm("nop")
+#else
+#define PAUSE()
+#endif
+
+namespace physx
+{
+namespace shdfnd
+{
+
+void* atomicCompareExchangePointer(volatile void** dest, void* exch, void* comp)
+{
+ return __sync_val_compare_and_swap(const_cast<void**>(dest), comp, exch);
+}
+
+int32_t atomicCompareExchange(volatile int32_t* dest, int32_t exch, int32_t comp)
+{
+ return __sync_val_compare_and_swap(dest, comp, exch);
+}
+
+int32_t atomicIncrement(volatile int32_t* val)
+{
+ return __sync_add_and_fetch(val, 1);
+}
+
+int32_t atomicDecrement(volatile int32_t* val)
+{
+ return __sync_sub_and_fetch(val, 1);
+}
+
+int32_t atomicAdd(volatile int32_t* val, int32_t delta)
+{
+ return __sync_add_and_fetch(val, delta);
+}
+
+int32_t atomicMax(volatile int32_t* val, int32_t val2)
+{
+ int32_t oldVal, newVal;
+
+ do
+ {
+ PAUSE();
+ oldVal = *val;
+
+ if(val2 > oldVal)
+ newVal = val2;
+ else
+ newVal = oldVal;
+
+ } while(atomicCompareExchange(val, newVal, oldVal) != oldVal);
+
+ return *val;
+}
+
+int32_t atomicExchange(volatile int32_t* val, int32_t val2)
+{
+ int32_t newVal, oldVal;
+
+ do
+ {
+ PAUSE();
+ oldVal = *val;
+ newVal = val2;
+ } while(atomicCompareExchange(val, newVal, oldVal) != oldVal);
+
+ return oldVal;
+}
+
+} // namespace shdfnd
+} // namespace physx
diff --git a/NvCloth/src/ps/unix/PsUnixFPU.h b/NvCloth/src/ps/unix/PsUnixFPU.h
new file mode 100644
index 0000000..edd5522
--- /dev/null
+++ b/NvCloth/src/ps/unix/PsUnixFPU.h
@@ -0,0 +1,69 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef PSFOUNDATION_PSUNIXFPU_H
+#define PSFOUNDATION_PSUNIXFPU_H
+
+#include "foundation/PxPreprocessor.h"
+
+#if PX_LINUX || PX_PS4 || PX_OSX
+
+#if PX_X86 || PX_X64
+#if PX_EMSCRIPTEN
+#include <emmintrin.h>
+#endif
+#include <xmmintrin.h>
+#elif PX_NEON
+#include <arm_neon.h>
+#endif
+
+
+PX_INLINE physx::shdfnd::SIMDGuard::SIMDGuard()
+{
+#if !PX_EMSCRIPTEN && (PX_X86 || PX_X64)
+ mControlWord = _mm_getcsr();
+ // set default (disable exceptions: _MM_MASK_MASK) and FTZ (_MM_FLUSH_ZERO_ON), DAZ (_MM_DENORMALS_ZERO_ON: (1<<6))
+ _mm_setcsr(_MM_MASK_MASK | _MM_FLUSH_ZERO_ON | (1 << 6));
+#endif
+}
+
+PX_INLINE physx::shdfnd::SIMDGuard::~SIMDGuard()
+{
+#if !PX_EMSCRIPTEN && (PX_X86 || PX_X64)
+ // restore control word and clear exception flags
+ // (setting exception state flags cause exceptions on the first following fp operation)
+ _mm_setcsr(mControlWord & ~_MM_EXCEPT_MASK);
+#endif
+}
+
+#else
+#error No SIMD implementation for this unix platform.
+#endif // PX_LINUX || PX_PS4 || PX_OSX
+
+#endif // #ifndef PSFOUNDATION_PSUNIXFPU_H
diff --git a/NvCloth/src/ps/unix/PsUnixMutex.cpp b/NvCloth/src/ps/unix/PsUnixMutex.cpp
new file mode 100644
index 0000000..23b6549
--- /dev/null
+++ b/NvCloth/src/ps/unix/PsUnixMutex.cpp
@@ -0,0 +1,170 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#include "foundation/PxAssert.h"
+#include "foundation/PxErrorCallback.h"
+
+#include "Ps.h"
+#include "PsFoundation.h"
+#include "PsUserAllocated.h"
+#include "PsMutex.h"
+#include "PsAtomic.h"
+#include "PsThread.h"
+
+#include <pthread.h>
+
+namespace physx
+{
+namespace shdfnd
+{
+
+namespace
+{
+struct MutexUnixImpl
+{
+ pthread_mutex_t lock;
+ Thread::Id owner;
+};
+
+MutexUnixImpl* getMutex(MutexImpl* impl)
+{
+ return reinterpret_cast<MutexUnixImpl*>(impl);
+}
+}
+
+MutexImpl::MutexImpl()
+{
+ pthread_mutexattr_t attr;
+ pthread_mutexattr_init(&attr);
+ pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE);
+#if !PX_ANDROID
+ // mimic default windows behavior where applicable
+ pthread_mutexattr_setprotocol(&attr, PTHREAD_PRIO_INHERIT);
+#endif
+ pthread_mutex_init(&getMutex(this)->lock, &attr);
+ pthread_mutexattr_destroy(&attr);
+}
+
+MutexImpl::~MutexImpl()
+{
+ pthread_mutex_destroy(&getMutex(this)->lock);
+}
+
+void MutexImpl::lock()
+{
+ int err = pthread_mutex_lock(&getMutex(this)->lock);
+ PX_ASSERT(!err);
+ PX_UNUSED(err);
+
+#if PX_DEBUG
+ getMutex(this)->owner = Thread::getId();
+#endif
+}
+
+bool MutexImpl::trylock()
+{
+ bool success = !pthread_mutex_trylock(&getMutex(this)->lock);
+#if PX_DEBUG
+ if(success)
+ getMutex(this)->owner = Thread::getId();
+#endif
+ return success;
+}
+
+void MutexImpl::unlock()
+{
+#if PX_DEBUG
+ if(getMutex(this)->owner != Thread::getId())
+ {
+ shdfnd::getFoundation().error(PxErrorCode::eINVALID_OPERATION, __FILE__, __LINE__,
+ "Mutex must be unlocked only by thread that has already acquired lock");
+ return;
+ }
+#endif
+
+ int err = pthread_mutex_unlock(&getMutex(this)->lock);
+ PX_ASSERT(!err);
+ PX_UNUSED(err);
+}
+
+uint32_t MutexImpl::getSize()
+{
+ return sizeof(MutexUnixImpl);
+}
+
+class ReadWriteLockImpl
+{
+ public:
+ Mutex mutex;
+ volatile int readerCounter;
+};
+
+ReadWriteLock::ReadWriteLock()
+{
+ mImpl = reinterpret_cast<ReadWriteLockImpl*>(PX_ALLOC(sizeof(ReadWriteLockImpl), "ReadWriteLockImpl"));
+ PX_PLACEMENT_NEW(mImpl, ReadWriteLockImpl);
+
+ mImpl->readerCounter = 0;
+}
+
+ReadWriteLock::~ReadWriteLock()
+{
+ mImpl->~ReadWriteLockImpl();
+ PX_FREE(mImpl);
+}
+
+void ReadWriteLock::lockReader()
+{
+ mImpl->mutex.lock();
+
+ atomicIncrement(&mImpl->readerCounter);
+
+ mImpl->mutex.unlock();
+}
+
+void ReadWriteLock::lockWriter()
+{
+ mImpl->mutex.lock();
+
+ while(mImpl->readerCounter != 0)
+ ;
+}
+
+void ReadWriteLock::unlockReader()
+{
+ atomicDecrement(&mImpl->readerCounter);
+}
+
+void ReadWriteLock::unlockWriter()
+{
+ mImpl->mutex.unlock();
+}
+
+} // namespace shdfnd
+} // namespace physx
diff --git a/NvCloth/src/ps/windows/PsWindowsAtomic.cpp b/NvCloth/src/ps/windows/PsWindowsAtomic.cpp
new file mode 100644
index 0000000..c78728f
--- /dev/null
+++ b/NvCloth/src/ps/windows/PsWindowsAtomic.cpp
@@ -0,0 +1,96 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#include "PsWindowsInclude.h"
+#include "NvCloth/ps/PsAtomic.h"
+
+namespace physx
+{
+namespace shdfnd
+{
+
+int32_t atomicExchange(volatile int32_t* val, int32_t val2)
+{
+ return (int32_t)InterlockedExchange((volatile LONG*)val, (LONG)val2);
+}
+
+int32_t atomicCompareExchange(volatile int32_t* dest, int32_t exch, int32_t comp)
+{
+ return (int32_t)InterlockedCompareExchange((volatile LONG*)dest, exch, comp);
+}
+
+void* atomicCompareExchangePointer(volatile void** dest, void* exch, void* comp)
+{
+ return InterlockedCompareExchangePointer((volatile PVOID*)dest, exch, comp);
+}
+
+int32_t atomicIncrement(volatile int32_t* val)
+{
+ return (int32_t)InterlockedIncrement((volatile LONG*)val);
+}
+
+int32_t atomicDecrement(volatile int32_t* val)
+{
+ return (int32_t)InterlockedDecrement((volatile LONG*)val);
+}
+
+int32_t atomicAdd(volatile int32_t* val, int32_t delta)
+{
+ LONG newValue, oldValue;
+ do
+ {
+ oldValue = *val;
+ newValue = oldValue + delta;
+ } while(InterlockedCompareExchange((volatile LONG*)val, newValue, oldValue) != oldValue);
+
+ return newValue;
+}
+
+int32_t atomicMax(volatile int32_t* val, int32_t val2)
+{
+ // Could do this more efficiently in asm...
+
+ LONG newValue, oldValue;
+
+ do
+ {
+ oldValue = *val;
+
+ if(val2 > oldValue)
+ newValue = val2;
+ else
+ newValue = oldValue;
+
+ } while(InterlockedCompareExchange((volatile LONG*)val, newValue, oldValue) != oldValue);
+
+ return newValue;
+}
+
+} // namespace shdfnd
+} // namespace physx
diff --git a/NvCloth/src/ps/windows/PsWindowsFPU.h b/NvCloth/src/ps/windows/PsWindowsFPU.h
new file mode 100644
index 0000000..d85e531
--- /dev/null
+++ b/NvCloth/src/ps/windows/PsWindowsFPU.h
@@ -0,0 +1,51 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef PSFOUNDATION_PSWINDOWSFPU_H
+#define PSFOUNDATION_PSWINDOWSFPU_H
+
+// RAII guard: saves the current MXCSR control/status word, then switches SSE
+// into the state the solver expects — all SSE exceptions masked, flush-to-zero
+// enabled, and denormals-are-zero enabled (bit 6 is the DAZ flag,
+// _MM_DENORMALS_ZERO_ON, written as (1 << 6) here — presumably to avoid
+// pulling in the header that defines the macro; confirm).
+// No-op on ARM builds, where MXCSR does not exist.
+PX_INLINE physx::shdfnd::SIMDGuard::SIMDGuard()
+{
+#if !PX_ARM
+	mControlWord = _mm_getcsr();
+	// set default (disable exceptions: _MM_MASK_MASK) and FTZ (_MM_FLUSH_ZERO_ON), DAZ (_MM_DENORMALS_ZERO_ON: (1<<6))
+	_mm_setcsr(_MM_MASK_MASK | _MM_FLUSH_ZERO_ON | (1 << 6));
+#endif
+}
+
+// Restores the MXCSR word captured by the constructor, masking out the sticky
+// exception-status bits (_MM_EXCEPT_MASK) so that exceptions raised inside the
+// guarded region are not re-triggered on the next FP instruction after restore.
+PX_INLINE physx::shdfnd::SIMDGuard::~SIMDGuard()
+{
+#if !PX_ARM
+	// restore control word and clear any exception flags
+	// (setting exception state flags cause exceptions on the first following fp operation)
+	_mm_setcsr(mControlWord & ~_MM_EXCEPT_MASK);
+#endif
+}
+
+#endif // #ifndef PSFOUNDATION_PSWINDOWSFPU_H
diff --git a/NvCloth/src/ps/windows/PsWindowsInclude.h b/NvCloth/src/ps/windows/PsWindowsInclude.h
new file mode 100644
index 0000000..066abe7
--- /dev/null
+++ b/NvCloth/src/ps/windows/PsWindowsInclude.h
@@ -0,0 +1,96 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef PSFOUNDATION_PSWINDOWSINCLUDE_H
+#define PSFOUNDATION_PSWINDOWSINCLUDE_H
+
+#include "NvCloth/ps/Ps.h"
+
+#ifndef _WIN32
+#error "This file should only be included by Windows builds!!"
+#endif
+
+#ifdef _WINDOWS_ // windows already included
+#error "Only include windows.h through this file!!"
+#endif
+
+// We only support >= Windows XP; we need this for critical sections and the Interlocked* APIs.
+#define _WIN32_WINNT 0x0501
+
+// turn off as much as we can for windows. All we really need is the thread functions(critical sections/Interlocked*
+// etc)
+#define NOGDICAPMASKS
+#define NOVIRTUALKEYCODES
+#define NOWINMESSAGES
+#define NOWINSTYLES
+#define NOSYSMETRICS
+#define NOMENUS
+#define NOICONS
+#define NOKEYSTATES
+#define NOSYSCOMMANDS
+#define NORASTEROPS
+#define NOSHOWWINDOW
+#define NOATOM
+#define NOCLIPBOARD
+#define NOCOLOR
+#define NOCTLMGR
+#define NODRAWTEXT
+#define NOGDI
+#define NOMB
+#define NOMEMMGR
+#define NOMETAFILE
+#define NOMINMAX
+#define NOOPENFILE
+#define NOSCROLL
+#define NOSERVICE
+#define NOSOUND
+#define NOTEXTMETRIC
+#define NOWH
+#define NOWINOFFSETS
+#define NOCOMM
+#define NOKANJI
+#define NOHELP
+#define NOPROFILER
+#define NODEFERWINDOWPOS
+#define NOMCX
+#define WIN32_LEAN_AND_MEAN
+#define NOUSER
+#define NONLS
+#define NOMSG
+
+#pragma warning(push)
+#pragma warning(disable : 4668) //'symbol' is not defined as a preprocessor macro, replacing with '0' for 'directives'
+#include <windows.h>
+#pragma warning(pop)
+
+#if PX_SSE2
+#include <xmmintrin.h>
+#endif
+
+#endif // #ifndef PSFOUNDATION_PSWINDOWSINCLUDE_H
diff --git a/NvCloth/src/scalar/SwCollisionHelpers.h b/NvCloth/src/scalar/SwCollisionHelpers.h
index c86d939..cad06e3 100644
--- a/NvCloth/src/scalar/SwCollisionHelpers.h
+++ b/NvCloth/src/scalar/SwCollisionHelpers.h
@@ -29,7 +29,7 @@
#pragma once
-#include "PsMathUtils.h"
+#include "NvCloth/ps/PsMathUtils.h"
namespace nv
{