| author | mtamis <[email protected]> | 2017-02-15 16:06:25 +0100 |
|---|---|---|
| committer | mtamis <[email protected]> | 2017-02-15 16:06:25 +0100 |
| commit | 85305930aeeb1d513e23522bd91f29ba81aa6d14 (patch) | |
| tree | 45f1bb20a45a300d1fef107e436cac95602a0e57 /NvCloth/src/cuda/CuSolverKernel.cu | |
| download | nvcloth-85305930aeeb1d513e23522bd91f29ba81aa6d14.tar.xz nvcloth-85305930aeeb1d513e23522bd91f29ba81aa6d14.zip | |
NvCloth library v1.0.0
Diffstat (limited to 'NvCloth/src/cuda/CuSolverKernel.cu')
| -rw-r--r-- | NvCloth/src/cuda/CuSolverKernel.cu | 1443 |
1 file changed, 1443 insertions, 0 deletions
diff --git a/NvCloth/src/cuda/CuSolverKernel.cu b/NvCloth/src/cuda/CuSolverKernel.cu
new file mode 100644
index 0000000..3517193
--- /dev/null
+++ b/NvCloth/src/cuda/CuSolverKernel.cu
@@ -0,0 +1,1443 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#include "CuSolverKernel.h"
+#include "CuClothData.h"
+#include "CuPhaseConfig.h"
+
+#include <new> // placement new
+
+/*
+   For detailed comments about the algorithm, check SwSolverKernel.cpp (or the documentation).
+   The CPU implementation is generally easier to read, and comments are not duplicated in other implementations.
+   Only CUDA implementation specific comments are left in this implementation.
+*/
+
+#ifndef FLT_EPSILON
+#define FLT_EPSILON 1.192092896e-07F
+#endif
+#ifndef FLT_MAX
+#define FLT_MAX 3.402823466e+38F
+#endif
+
+// Converting pointers to shared/global addresses is faster than doing generic loads on SM50
+#define CONVERT_ADDRESSES (__CUDA_ARCH__ >= 500)
+
+#if !defined(_WIN64) && !defined(__x86_64__)
+#define POINTER_CONSTRAINT "r"
+#define POINTER_TYPE "u32"
+#else
+#define POINTER_CONSTRAINT "l"
+#define POINTER_TYPE "u64"
+#endif
+
+#ifndef __CUDA_ARCH__
+#define assert(x)
+#endif
+
+extern "C" {
+extern _CRTIMP __host__ __device__ int __cdecl printf(const char*, ...);
+}
+
+using namespace nv;
+
+// shared memory copy (instead of relying on constant cache)
+__shared__ cloth::CuClothData gClothData;
+__shared__ cloth::CuFrameData gFrameData;
+__shared__ cloth::CuIterationData gIterData;
+
+// Our way to create stream local variables
+__shared__ void* gProfileBuffer;
+__shared__ uint32_t gProfileBaseId;
+
+static const uint32_t gCuClothDataSize = sizeof(cloth::CuClothData) / sizeof(float);
+static const uint32_t gCuFrameDataSize = sizeof(cloth::CuFrameData) / sizeof(float);
+static const uint32_t gCuIterationDataSize = sizeof(cloth::CuIterationData) / sizeof(float);
+static const uint32_t gCuPhaseConfigSize = sizeof(cloth::CuPhaseConfig) / sizeof(float);
+
+/*
+Memory block for all temporary data in shared memory (in 'allocation' order).
+The numbers indicate the allocation slot if using a stack allocator.
+0) simulate*()::configs (numPhases*sizeof(CuPhaseConfig))
+1) simulate*()::particles ({0,1,2}*4*numParticles floats)
+2) CuCollision::mCapsuleIndices, mCapsuleMasks, mConvexMasks (numCapsules*4+numConvexes ints)
+3) CuCollision::mPrevData (4*numSpheres+10*numCones floats)
+4) CuCollision::collideConvexes() (4*numPlanes floats)
+4) CuCollision::collideTriangles() (19*numTriangles floats)
+4) CuCollision::mCurData::Spheres (4*numSpheres floats)
+5) computeParticleBounds()::dst (192 floats written, 208 floats read)
+5) computeSphereBounds()::dst (192 floats written, 208 floats read)
+5) CuCollision::mCurData::Cones (10*numCones floats)
+6) CuCollision::mShapeGrid (2*6*sGridSize=96 floats)
+4) CuSelfCollision::buildAcceleration()::buffer (34*16=544 ints)
+*/
+extern __shared__ float gSharedMemory[];
+extern __shared__ int32_t gSharedSigned[];
+extern __shared__ uint32_t gSharedUnsigned[];
+
+/***************** Pointer Wrappers **********************/
+enum AddressSpace
+{
+	Shared,
+	Global
+};
+
+template <AddressSpace, typename T>
+__device__ T load(const T* ptr);
+template <AddressSpace, typename T>
+__device__ void store(T* ptr, const T& value);
+
+#if !CONVERT_ADDRESSES
+template <AddressSpace, typename T>
+__device__ T load(const T* ptr)
+{
+	return *ptr;
+}
+template <AddressSpace, typename T>
+__device__ void store(T* ptr, const T& value)
+{
+	*ptr = value;
+}
+#else
+template <>
+__device__ float load<Shared>(const float* ptr)
+{
+	float value;
+	asm("ld.shared.f32 %0, [%1];" : "=f"(value) : POINTER_CONSTRAINT(ptr));
+	return value;
+}
+template <>
+__device__ int32_t load<Shared>(const int32_t* ptr)
+{
+	int32_t value;
+	asm("ld.shared.s32 %0, [%1];" : "=r"(value) : POINTER_CONSTRAINT(ptr));
+	return value;
+}
+template <>
+__device__ uint32_t load<Shared>(const uint32_t* ptr)
+{
+	uint32_t value;
+	asm("ld.shared.u32 %0, [%1];" : "=r"(value) : POINTER_CONSTRAINT(ptr));
+	return value;
+}
+template <>
+__device__ void store<Shared>(int32_t* ptr, const int32_t& value)
+{
+	asm("st.shared.s32 [%0], %1;" : : POINTER_CONSTRAINT(ptr), "r"(value) : "memory");
+}
+template <>
+__device__ void store<Shared>(float* ptr, const float& value)
+{
+	asm("st.shared.f32 [%0], %1;" : : POINTER_CONSTRAINT(ptr), "f"(value) : "memory");
+}
+template <>
+__device__ void store<Shared>(uint32_t* ptr, const uint32_t& value)
+{
+	asm("st.shared.u32 [%0], %1;" : : POINTER_CONSTRAINT(ptr), "r"(value) : "memory");
+}
+template <>
+__device__ float load<Global>(const float* ptr)
+{
+	float value;
+	asm("ld.global.f32 %0, [%1];" : "=f"(value) : POINTER_CONSTRAINT(ptr));
+	return value;
+}
+template <>
+__device__ int32_t load<Global>(const int32_t* ptr)
+{
+	int32_t value;
+	asm("ld.global.s32 %0, [%1];" : "=r"(value) : POINTER_CONSTRAINT(ptr));
+	return value;
+}
+template <>
+__device__ uint32_t load<Global>(const uint32_t* ptr)
+{
+	uint32_t value;
+	asm("ld.global.u32 %0, [%1];" : "=r"(value) : POINTER_CONSTRAINT(ptr));
+	return value;
+}
+template <>
+__device__ void store<Global>(int32_t* ptr, const int32_t& value)
+{
+	asm("st.global.s32 [%0], %1;" : : POINTER_CONSTRAINT(ptr), "r"(value) : "memory");
+}
+template <>
+__device__ void store<Global>(float* ptr, const float& value)
+{
+	asm("st.global.f32 [%0], %1;" : : POINTER_CONSTRAINT(ptr), "f"(value) : "memory");
+}
+template <>
+__device__ void store<Global>(uint32_t* ptr, const uint32_t& value)
+{
+	asm("st.global.u32 [%0], %1;" : : POINTER_CONSTRAINT(ptr), "r"(value) : "memory");
+}
+#endif
+
+template <AddressSpace, typename>
+struct Pointer;
+
+template <AddressSpace S, typename T>
+struct Reference
+{
+	template <AddressSpace, typename>
+	friend struct Reference;
+	friend struct Pointer<S, T>;
+
+	__device__ Reference()
+	{
+	}
+	__device__ Reference(const Reference& other) : mPtr(other.mPtr)
+	{
+	}
+	template <typename U>
+	__device__ Reference(const Reference<S, U>& other)
+	    : mPtr(other.mPtr)
+	{
+	}
+
+	__device__ Reference& operator = (const Reference& other)
+	{
+		return *this = static_cast<T>(other);
+	}
+	template <typename U>
+	__device__ Reference& operator = (const Reference<S, U>& other)
+	{
+		return *this = static_cast<U>(other);
+	}
+
+	__device__ Reference& operator += (const T& value)
+	{
+		return *this = *this + value;
+	}
+	__device__ Reference& operator |= (const T& value)
+	{
+		return *this = *this | value;
+	}
+	__device__ Reference& operator &= (const T& value)
+	{
+		return *this = *this & value;
+	}
+	__device__ Reference& operator *= (const T& value)
+	{
+		return *this = *this * value;
+	}
+
+	__device__ operator T() const
+	{
+		return load<S>(mPtr);
+	}
+	__device__ Reference& operator = (const T& value)
+	{
+		store<S>(mPtr, value);
+		return *this;
+	}
+
+	//private:
+	T* mPtr;
+
+	__device__ explicit Reference(T& ref) : mPtr(&ref)
+	{
+	}
+
+	template <typename U>
+	friend __device__ void atomicAdd(Reference& ref, U value)
+	{
+		::atomicAdd(ref.mPtr, value);
+	}
+};
+
+template <AddressSpace S, typename T>
+struct Convert
+{
+	static __device__ T* from(T* ptr)
+	{
+		return ptr;
+	}
+	static __device__ T* to(T* ptr)
+	{
+		return ptr;
+	}
+};
+
+#if CONVERT_ADDRESSES
+template <typename T>
+struct Convert<Shared, T>
+{
+	static __device__ T* from(T* ptr)
+	{
+		asm("cvta.shared." POINTER_TYPE " %0, %0;" : "+" POINTER_CONSTRAINT(ptr));
+		return ptr;
+	}
+	static __device__ T* to(T* ptr)
+	{
+		asm("cvta.to.shared." POINTER_TYPE " %0, %0;" : "+" POINTER_CONSTRAINT(ptr));
+		return ptr;
+	}
+};
+template <typename T>
+struct Convert<Global, T>
+{
+	static __device__ T* from(T* ptr)
+	{
+		asm("cvta.global." POINTER_TYPE " %0, %0;" : "+" POINTER_CONSTRAINT(ptr));
+		return ptr;
+	}
+	static __device__ T* to(T* ptr)
+	{
+		asm("cvta.to.global." POINTER_TYPE " %0, %0;" : "+" POINTER_CONSTRAINT(ptr));
+		return ptr;
+	}
+};
+#endif
+
+template <AddressSpace S, typename T>
+__device__ T* generic(const Pointer<S, T>&);
+
+// pointer forced to point to shared memory (only works for sizeof(T) <= 4)
+template <AddressSpace S, typename T>
+struct Pointer
+{
+	template <AddressSpace, typename> friend struct Pointer;
+	friend __device__ T* generic<S, T>(const Pointer<S, T>&);
+	friend struct GlobalParticleData;
+
+	__device__ Pointer()
+	{
+	}
+	__device__ Pointer(const Pointer& other)
+	    : mPtr(other.mPtr)
+	{
+	}
+	template <typename U>
+	__device__ Pointer(const Pointer<S, U>& other)
+	    : mPtr(other.mPtr)
+	{
+	}
+
+	// construct from generic pointer
+	__device__ explicit Pointer(T* ptr)
+	    : mPtr(Convert<S, T>::to(ptr))
+	{
+	}
+
+	__device__ bool operator!=(const Pointer& other) const
+	{
+		return mPtr != other.mPtr;
+	}
+	__device__ bool operator<(const Pointer& other) const
+	{
+		return mPtr < other.mPtr;
+	}
+
+	__device__ Pointer operator + (ptrdiff_t i) const
+	{
+		return Pointer(*this) += i;
+	}
+	__device__ Pointer& operator += (ptrdiff_t i)
+	{
+		mPtr += i * stride();
+		return *this;
+	}
+	__device__ Pointer operator - (ptrdiff_t i) const
+	{
+		return Pointer(*this) -= i;
+	}
+	__device__ Pointer& operator -= (ptrdiff_t i)
+	{
+		mPtr -= i * stride();
+		return *this;
+	}
+
+	__device__ Pointer& operator ++ ()
+	{
+		mPtr += stride();
+		return *this;
+	}
+	__device__ Pointer& operator -- ()
+	{
+		mPtr -= stride();
+		return *this;
+	}
+
+	__device__ Reference<S, T> operator*() const
+	{
+		return Reference<S, T>(*mPtr);
+	}
+	__device__ Reference<S, T> operator[](int32_t i) const
+	{
+		return Reference<S, T>(mPtr[i * stride()]);
+	}
+
+  private:
+	// convert back to generic pointer, private for safety, use generic() instead
+	__device__ operator T*() const
+	{
+		return Convert<S, T>::from(mPtr);
+	}
+
+	__device__ static size_t stride() { return 1; }
+
+	template <typename U>
+	__device__ Pointer(const Pointer<S, U>& other, ptrdiff_t stridedOffset)
+	    : mPtr(other.mPtr + stridedOffset)
+	{
+	}
+
+	T* mPtr;
+};
+
+// pointers to global memory are all referring to particle data
+// stored as array of structs, so they have a stride of 4.
+template<> __device__ size_t Pointer<Global, float>::stride() { return 4; }
+template<> __device__ size_t Pointer<Global, const float>::stride() { return 4; }
+
+template <AddressSpace S, typename T>
+__device__ T* generic(const Pointer<S, T>& ptr)
+{
+	return ptr;
+}
+
+#if !CONVERT_ADDRESSES
+template <typename T>
+__device__ T* generic(T* ptr)
+{
+	return ptr;
+}
+#endif
+
+/***************** Particle Data **********************/
+
+template <typename T>
+struct SharedParticleReference
+{
+	__device__ operator float3() const
+	{
+		float3 result;
+		result.x = mReferences[0];
+		result.y = mReferences[1];
+		result.z = mReferences[2];
+		return result;
+	}
+
+	__device__ SharedParticleReference& operator = (const float3& vec)
+	{
+		mReferences[0] = vec.x;
+		mReferences[1] = vec.y;
+		mReferences[2] = vec.z;
+		return *this;
+	}
+
+	__device__ operator float4() const
+	{
+		float4 result;
+		result.x = mReferences[0];
+		result.y = mReferences[1];
+		result.z = mReferences[2];
+		result.w = mReferences[3];
+		return result;
+	}
+
+	__device__ SharedParticleReference& operator = (const float4& vec)
+	{
+		mReferences[0] = vec.x;
+		mReferences[1] = vec.y;
+		mReferences[2] = vec.z;
+		mReferences[3] = vec.w;
+		return *this;
+	}
+
+	Reference<Shared, T> mReferences[4];
+};
+
+struct SharedParticleData
+{
+	typedef float3 VectorType;
+
+	typedef Pointer<Shared, float> PointerType;
+	typedef Pointer<Shared, const float> ConstPointerType;
+	typedef Reference<Shared, float> ReferenceType;
+	typedef Reference<Shared, const float> ConstReferenceType;
+
+	typedef SharedParticleReference<float> ParticleReferenceType;
+	typedef SharedParticleReference<const float> ParticleConstReferenceType;
+
+	__device__ ReferenceType operator()(int32_t index, int32_t element)
+	{
+		return mPointers[element][index];
+	}
+	__device__ ConstReferenceType operator()(int32_t index, int32_t element) const
+	{
+		return mPointers[element][index];
+	}
+
+	__device__ ParticleReferenceType operator()(int32_t index)
+	{
+		ParticleReferenceType result = { mPointers[0][index], mPointers[1][index],
+			                             mPointers[2][index], mPointers[3][index] };
+		return result;
+	}
+	__device__ ParticleConstReferenceType operator()(int32_t index) const
+	{
+		ParticleConstReferenceType result = { mPointers[0][index], mPointers[1][index],
+			                                  mPointers[2][index], mPointers[3][index] };
+		return result;
+	}
+
+	__device__ const PointerType& operator[](int32_t element)
+	{
+		return mPointers[element];
+	}
+	__device__ ConstPointerType operator[](int32_t element) const
+	{
+		return mPointers[element];
+	}
+
+	PointerType mPointers[4];
+};
+
+template <typename T>
+struct GlobalParticleReference
+{
+	__device__ GlobalParticleReference(Pointer<Global, T> ref) : mPtr(reinterpret_cast<T* const&>(ref))
+	{
+	}
+
+#if CONVERT_ADDRESSES
+	__device__ operator float4() const
+	{
+		float4 vec;
+		asm("ld.global.v4.f32 {%0, %1, %2, %3}, [%4];"
+		    : "=f"(vec.x), "=f"(vec.y), "=f"(vec.z), "=f"(vec.w)
+		    : POINTER_CONSTRAINT(mPtr));
+		return vec;
+	}
+
+	__device__ GlobalParticleReference& operator = (const float4& vec)
+	{
+		asm("st.global.v4.f32 [%0], {%1, %2, %3, %4};" ::POINTER_CONSTRAINT(mPtr), "f"(vec.x), "f"(vec.y), "f"(vec.z),
+		    "f"(vec.w)
+		    : "memory");
+		return *this;
+	}
+
+	__device__ operator float3() const
+	{
+		float4 vec = *this;
+		return make_float3(vec.x, vec.y, vec.z);
+	}
+#else
+
+	__device__ operator float4() const
+	{
+		return *reinterpret_cast<const float4*>(mPtr);
+	}
+
+	__device__ GlobalParticleReference& operator = (const float4& vec)
+	{
+		*reinterpret_cast<float4*>(mPtr) = vec;
+		return *this;
+	}
+
+	__device__ operator float3() const
+	{
+		return *reinterpret_cast<const float3*>(mPtr);
+	}
+
+	__device__ GlobalParticleReference& operator = (const float3& vec)
+	{
+		*reinterpret_cast<float3*>(mPtr) = vec;
+		return *this;
+	}
+#endif
+
+	T* mPtr; // pointer to global address
+};
+
+struct GlobalParticleData
+{
+#if CONVERT_ADDRESSES
+	// ld.global.v4 saturates memory bandwidth better than 3x ld.global
+	typedef float4 VectorType;
+#else
+	// the same isn't true for ld without state space
+	typedef float3 VectorType;
+#endif
+
+	typedef Pointer<Global, float> PointerType;
+	typedef Pointer<Global, const float> ConstPointerType;
+	typedef Reference<Global, float> ReferenceType;
+	typedef Reference<Global, const float> ConstReferenceType;
+
+	typedef GlobalParticleReference<float> ParticleReferenceType;
+	typedef GlobalParticleReference<const float> ParticleConstReferenceType;
+
+	__device__ ReferenceType operator()(int32_t index, int32_t element)
+	{
+		return *PointerType(mPtr, index * 4 + element);
+	}
+	__device__ ConstReferenceType operator()(int32_t index, int32_t element) const
+	{
+		return *ConstPointerType(mPtr, index * 4 + element);
+	}
+
+	__device__ ParticleReferenceType operator()(int32_t index)
+	{
+		return PointerType(mPtr, index * 4);
+	}
+
+	__device__ ParticleConstReferenceType operator()(int32_t index) const
+	{
+		return ConstPointerType(mPtr, index * 4);
+	}
+
+	__device__ PointerType operator[](int32_t element)
+	{
+		return PointerType(mPtr, element);
+	}
+	__device__ ConstPointerType operator[](int32_t element) const
+	{
+		return ConstPointerType(mPtr, element);
+	}
+
+	PointerType mPtr;
+};
+
+/***************** Profiling **********************/
+struct ProfileDisabledZone
+{
+	__device__ ProfileDisabledZone(cloth::CuProfileZoneIds::Enum)
+	{
+	}
+};
+
+#if defined(__CUDA_ARCH__) && defined(PX_PROFILE) // profile zones enabled for profile build
+
+// code below is copied from GPUProfile.h and needs to be kept in sync.
+
+#define NUM_WARPS_PER_PROFILE_BUFFER (4 * 1024 * 1024)
+
+struct __align__(16) WarpProfileEvent
+{
+	__device__ WarpProfileEvent(uint16_t id)
+	    : block(blockIdx.x + gridDim.x * blockIdx.y), warp(threadIdx.x >> 5), userData(0), eventId(id)
+	{
+		uint32_t smid32, warpid32;
+		asm volatile("mov.u32 %0, %%smid;" : "=r"(smid32));
+		asm volatile("mov.u32 %0, %%warpid;" : "=r"(warpid32));
+		asm volatile("mov.u32 %0, %%clock;" : "=r"(startTime));
+		smid = smid32;
+		warpid = warpid32;
+		endTime = startTime;
+	}
+
+	uint16_t block;
+	uint8_t warp;
+	uint8_t smid;
+	uint8_t warpid;
+	uint8_t userData;
+	uint16_t eventId;
+	uint32_t startTime;
+	uint32_t endTime;
+};
+
+struct ProfileZone
+{
+	__device__ ProfileZone(cloth::CuProfileZoneIds::Enum id) : mEvent(0)
+	{
+		if (!gProfileBuffer || threadIdx.x & 0x1f)
+			return;
+
+		// +1: first entry reserved for counter
+		uint32_t index = atomicAdd(reinterpret_cast<uint32_t*>(gProfileBuffer), 1) + 1;
+
+		if (index >= NUM_WARPS_PER_PROFILE_BUFFER)
+			return;
+
+		mEvent = reinterpret_cast<WarpProfileEvent*>(gProfileBuffer) + index;
+
+		new (mEvent) WarpProfileEvent(gProfileBaseId + id);
+	}
+
+	__device__ ~ProfileZone()
+	{
+		if (mEvent)
+			mEvent->endTime = clock();
+	}
+
+	WarpProfileEvent* mEvent;
+};
+
+#else
+typedef ProfileDisabledZone ProfileZone;
+#endif
+
+#if 1 // set to 1 to enable detailed profile zones
+typedef ProfileZone ProfileDetailZone;
+#else
+typedef ProfileDisabledZone ProfileDetailZone;
+#endif
+
+namespace
+{
+// cut down version of thrust::uninitialized
+// avoids warning about non-empty c'tor
+template <typename T>
+struct uninitialized
+{
+	__device__ inline T& get()
+	{
+		return *reinterpret_cast<T*>(data);
+	}
+
+	// maximum alignment required by device code is 16
+	__align__(16) unsigned char data[sizeof(T)];
+};
+}
+
+#if __CUDA_ARCH__ < 320
+namespace
+{
+template <typename T>
+__device__ T __ldg(const T* __restrict ptr)
+{
+	return *ptr;
+}
+}
+#endif
+
+#define CU_SOLVER_KERNEL_CU
+#include "CuCollision.h"
+#include "CuSelfCollision.h"
+
+namespace
+{
+__device__ void loadIterData(const cloth::CuIterationData* __restrict iterData)
+{
+	if (threadIdx.x < gCuIterationDataSize)
+	{
+		gIterData.mIntegrationTrafo[threadIdx.x] = __ldg(iterData->mIntegrationTrafo + threadIdx.x);
+	}
+}
+
+// integrate particle positions and store transposed
+template <bool IsTurning, typename CurrentT, typename PreviousT>
+__device__ void integrateParticles(CurrentT& current, PreviousT& previous)
+{
+	ProfileDetailZone zone(cloth::CuProfileZoneIds::INTEGRATE);
+
+	const float* __restrict trafo = gIterData.mIntegrationTrafo;
+
+	for (int32_t i = threadIdx.x; i < gClothData.mNumParticles; i += blockDim.x)
+	{
+		float4 prev = previous(i);
+		float4 next = current(i);
+		float4 cur = { next.x, next.y, next.z, prev.w };
+
+		if (next.w == 0.0f)
+			next.w = prev.w;
+
+		if (next.w > 0.0f)
+		{
+			if (IsTurning)
+			{
+				next.x = next.x + trafo[3] + cur.x * trafo[15] + prev.x * trafo[6] + cur.y * trafo[16] +
+				         prev.y * trafo[7] + cur.z * trafo[17] + prev.z * trafo[8];
+
+				next.y = next.y + trafo[4] + cur.x * trafo[18] + prev.x * trafo[9] + cur.y * trafo[19] +
+				         prev.y * trafo[10] + cur.z * trafo[20] + prev.z * trafo[11];
+
+				next.z = next.z + trafo[5] + cur.x * trafo[21] + prev.x * trafo[12] + cur.y * trafo[22] +
+				         prev.y * trafo[13] + cur.z * trafo[23] + prev.z * trafo[14];
+			}
+			else
+			{
+				next.x += (cur.x - prev.x) * trafo[6] + trafo[3];
+				next.y += (cur.y - prev.y) * trafo[9] + trafo[4];
+				next.z += (cur.z - prev.z) * trafo[12] + trafo[5];
+			}
+
+			cur.x += trafo[0];
+			cur.y += trafo[1];
+			cur.z += trafo[2];
+		}
+
+		current(i) = next;
+		previous(i) = cur;
+	}
+}
+
+template <typename CurrentT, typename PreviousT>
+__device__ void integrateParticles(CurrentT& current, PreviousT& previous)
+{
+	if (gIterData.mIsTurning)
+		integrateParticles<true>(current, previous);
+	else
+		integrateParticles<false>(current, previous);
+}
+
+template <typename CurrentT>
+__device__ void accelerateParticles(CurrentT& current)
+{
+	// might be better to move this into integrate particles
+	const float* __restrict accelerations = gFrameData.mParticleAccelerations;
+
+	if (!accelerations)
+		return;
+
+	ProfileDetailZone zone(cloth::CuProfileZoneIds::ACCELERATE);
+
+	__syncthreads(); // looping with 4 instead of 1 thread per particle
+
+	float sqrIterDt = ~threadIdx.x & 0x3 ? gFrameData.mIterDt * gFrameData.mIterDt : 0.0f;
+	typename CurrentT::PointerType sharedCurPos = current[threadIdx.x % 4];
+
+	for (int32_t i = threadIdx.x; i < gClothData.mNumParticles * 4; i += blockDim.x)
+	{
+		// turning this into __ldg slows kernel down even without particle accelerations (!)
+		if (current(i / 4, 3) > 0.0f)
+			sharedCurPos[i / 4] += accelerations[i] * sqrIterDt;
+	}
+
+	__syncthreads();
+}
+
+__device__ float3 operator + (const float3& u, const float3& v)
+{
+	return make_float3(u.x + v.x, u.y + v.y, u.z + v.z);
+}
+__device__ float3 operator - (const float3& u, const float3& v)
+{
+	return make_float3(u.x - v.x, u.y - v.y, u.z - v.z);
+}
+__device__ float3 operator*(float s, const float3& v)
+{
+	return make_float3(v.x * s, v.y * s, v.z * s);
+}
+__device__ float dot3(const float3& u, const float3& v)
+{
+	return u.x * v.x + u.y * v.y + u.z * v.z;
+}
+__device__ float3 cross3(const float3& u, const float3& v)
+{
+	return make_float3(u.y * v.z - u.z * v.y, u.z * v.x - u.x * v.z, u.x * v.y - u.y * v.x);
+}
+__device__ void applyImpulse(SharedParticleData::ParticleReferenceType pos, const float3& impulse)
+{
+	float scale = -pos.mReferences[3];
+	// Use this instead of the atomicAdd function to work around a compiler issue
+	// treating the pointer as global memory instead of shared memory
+	asm("red.shared.add.f32 [%0], %1;" :: POINTER_CONSTRAINT(pos.mReferences[0].mPtr), "f"(impulse.x * scale));
+	asm("red.shared.add.f32 [%0], %1;" :: POINTER_CONSTRAINT(pos.mReferences[1].mPtr), "f"(impulse.y * scale));
+	asm("red.shared.add.f32 [%0], %1;" :: POINTER_CONSTRAINT(pos.mReferences[2].mPtr), "f"(impulse.z * scale));
+}
+__device__ void applyImpulse(GlobalParticleData::ParticleReferenceType pos, const float3& impulse)
+{
+	float scale = -pos.mPtr[3];
+	atomicAdd(pos.mPtr + 0, impulse.x * scale);
+	atomicAdd(pos.mPtr + 1, impulse.y * scale);
+	atomicAdd(pos.mPtr + 2, impulse.z * scale);
+}
+
+template <bool IsTurning, typename CurrentT, typename PreviousT>
+__device__ void applyWind(CurrentT& current, PreviousT& previous)
+{
+	const float dragCoefficient = gFrameData.mDragCoefficient;
+	const float liftCoefficient = gFrameData.mLiftCoefficient;
+
+	if (dragCoefficient == 0.0f && liftCoefficient == 0.0f)
+		return;
+
+	ProfileDetailZone zone(cloth::CuProfileZoneIds::WIND);
+
+	const float oneThird = 1 / 3.0f;
+	float3 wind = make_float3(gIterData.mWind[0], gIterData.mWind[1], gIterData.mWind[2]);
+
+	const uint16_t* tIt = gClothData.mTriangles;
+	for (int32_t i = threadIdx.x; i < gClothData.mNumTriangles; i += blockDim.x)
+	{
+		uint16_t i0 = tIt[i * 3 + 0];
+		uint16_t i1 = tIt[i * 3 + 1];
+		uint16_t i2 = tIt[i * 3 + 2];
+
+		float3 c0 = current(i0);
+		float3 c1 = current(i1);
+		float3 c2 = current(i2);
+
+		// float w1 = current(i0, 3);
+		// float w2 = current(i1, 3);
+		// float w3 = current(i2, 3);
+		//
+		// float wMult = w1 * w2 * w3;
+		// float invMass = wMult < FLT_EPSILON ? 0.f : w1 * w2 * w3 / (w1 * w2 + w1 * w3 + w2 * w3);
+
+		float3 p0 = previous(i0);
+		float3 p1 = previous(i1);
+		float3 p2 = previous(i2);
+
+		float3 cur = oneThird * (c0 + c1 + c2);
+		float3 prev = oneThird * (p0 + p1 + p2);
+
+		float3 delta = cur - prev + wind;
+
+		if (IsTurning)
+		{
+			const float3* rot = reinterpret_cast<const float3*>(gFrameData.mRotation);
+			float3 d = wind - prev;
+			delta = cur + d.x * rot[0] + d.y * rot[1] + d.z * rot[2];
+		}
+
+		float3 normal = cross3(c2 - c0, c1 - c0);
+
+		float doubleArea = sqrtf(dot3(normal, normal));
+
+		float invSqrScale = dot3(delta, delta);
+		float scale = rsqrtf(invSqrScale);
+
+		float cosTheta = dot3(normal, delta) * scale / doubleArea;
+		float sinTheta = sqrtf(max(0.0f, 1.0f - cosTheta * cosTheta));
+
+		float3 liftDir = cross3(cross3(delta, normal), scale * delta);
+
+		float3 lift = liftCoefficient * cosTheta * sinTheta * liftDir;
+		float3 drag = dragCoefficient * abs(cosTheta) * doubleArea * delta;
+
+		float3 impulse = invSqrScale < FLT_EPSILON ? make_float3(0.0f, 0.0f, 0.0f) : lift + drag;
+
+		applyImpulse(current(i0), impulse);
+		applyImpulse(current(i1), impulse);
+		applyImpulse(current(i2), impulse);
+	}
+
+	__syncthreads();
+}
+
+template <typename CurrentT, typename PreviousT>
+__device__ void applyWind(CurrentT& current, PreviousT& previous)
+{
+	if (gIterData.mIsTurning)
+		applyWind<true>(current, previous);
+	else
+		applyWind<false>(current, previous);
+}
+
+template <typename CurrentT>
+__device__ void constrainTether(CurrentT& current)
+{
+	if (0.0f == gFrameData.mTetherConstraintStiffness || !gClothData.mNumTethers)
+		return;
+
+	ProfileDetailZone zone(cloth::CuProfileZoneIds::TETHER);
+
+	int32_t numParticles = gClothData.mNumParticles;
+	int32_t numTethers = gClothData.mNumTethers;
+	assert(0 == numTethers % numParticles);
+
+	float stiffness = numParticles * __fdividef(gFrameData.mTetherConstraintStiffness, numTethers);
+	float scale = gClothData.mTetherConstraintScale;
+
+	const uint32_t* __restrict tIt = reinterpret_cast<const uint32_t*>(gClothData.mTethers);
+
+	for (int32_t i = threadIdx.x; i < numParticles; i += blockDim.x)
+	{
+		float posX = current(i, 0);
+		float posY = current(i, 1);
+		float posZ = current(i, 2);
+
+		float offsetX = 0.0f;
+		float offsetY = 0.0f;
+		float offsetZ = 0.0f;
+
+		for (int32_t j = i; j < numTethers; j += gClothData.mNumParticles)
+		{
+			uint32_t tether = __ldg(tIt + j);
+
+			int32_t anchor = tether & 0xffff;
+			float deltaX = current(anchor, 0) - posX;
+			float deltaY = current(anchor, 1) - posY;
+			float deltaZ = current(anchor, 2) - posZ;
+
+			float sqrLength = FLT_EPSILON + deltaX * deltaX + deltaY * deltaY + deltaZ * deltaZ;
+
+			float radius = (tether >> 16) * scale;
+			float slack = 1.0f - radius * rsqrtf(sqrLength);
+
+			if (slack > 0.0f)
+			{
+				offsetX += deltaX * slack;
+				offsetY += deltaY * slack;
+				offsetZ += deltaZ * slack;
+			}
+		}
+
+		current(i, 0) = posX + offsetX * stiffness;
+		current(i, 1) = posY + offsetY * stiffness;
+		current(i, 2) = posZ + offsetZ * stiffness;
+	}
+}
+
+template <typename CurrentT>
+__device__ void solveFabric(CurrentT& current)
+{
+	ProfileDetailZone zone(cloth::CuProfileZoneIds::FABRIC);
+
+	const cloth::CuPhaseConfig* __restrict cIt = (cloth::CuPhaseConfig*)gSharedMemory;
+	const cloth::CuPhaseConfig* cEnd = cIt + gClothData.mNumPhases;
+
+	for (; cIt != cEnd; ++cIt)
+	{
+		__syncthreads();
+
+		ProfileDetailZone zone(cloth::CuProfileZoneIds::CONSTRAINT_SET);
+
+		int32_t numConstraints = cIt->mNumConstraints;
+		if (threadIdx.x >= numConstraints)
+			continue;
+
+		const uint32_t* __restrict iIt = reinterpret_cast<const uint32_t*>(cIt->mIndices) + threadIdx.x;
+		const float* restvalues = cIt->mRestvalues;
+		const float* rIt = restvalues + threadIdx.x;
+		const float* rEnd = restvalues + numConstraints;
+
+		const float* stIt = cIt->mStiffnessValues + threadIdx.x;
+		bool useStiffnessPerConstraint = cIt->mStiffnessValues != nullptr;
+
+		uint32_t vpijPrefetch = __ldg(iIt);
+		float rijPrefetch = __ldg(rIt);
+		float stijPrefetch;
+		if (useStiffnessPerConstraint)
+			stijPrefetch = __ldg(stIt);
+
+		float stiffness = cIt->mStiffness;
+		float stiffnessMultiplier = cIt->mStiffnessMultiplier;
+		float compressionLimit = cIt->mCompressionLimit;
+		float stretchLimit = cIt->mStretchLimit;
+
+		do
+		{
+			rIt += blockDim.x;
+			iIt += blockDim.x;
+			stIt += blockDim.x;
+
+			int32_t vpi = USHRT_MAX & vpijPrefetch;
+			int32_t vpj = USHRT_MAX & vpijPrefetch >> 16;
+			float rij = rijPrefetch;
+			float stij = useStiffnessPerConstraint ? 1.0f - exp2f(stijPrefetch * gFrameData.mStiffnessExponent) : stiffness;
+
+			if (rIt < rEnd)
+			{
+				vpijPrefetch = __ldg(iIt);
+				rijPrefetch = __ldg(rIt);
+				if (useStiffnessPerConstraint)
+					stijPrefetch = __ldg(stIt);
+			}
+
+			float vxi = current(vpi, 0);
+			float vyi = current(vpi, 1);
+			float vzi = current(vpi, 2);
+			float vwi = current(vpi, 3);
+
+			float vxj = current(vpj, 0);
+			float vyj = current(vpj, 1);
+			float vzj = current(vpj, 2);
+			float vwj = current(vpj, 3);
+
+			float hxij = vxj - vxi;
+			float hyij = vyj - vyi;
+			float hzij = vzj - vzi;
+
+			float e2ij = FLT_EPSILON + hxij * hxij + hyij * hyij + hzij * hzij;
+			float negErij = rij > FLT_EPSILON ? -1.0f + rij * rsqrtf(e2ij) : 0.0f;
+
+			negErij = negErij + stiffnessMultiplier * max(compressionLimit, min(-negErij, stretchLimit));
+
+			float negExij = __fdividef(negErij * stij, FLT_EPSILON + vwi + vwj);
+
+			float vmi = -vwi * negExij;
+			current(vpi, 0) = vxi + vmi * hxij;
+			current(vpi, 1) = vyi + vmi * hyij;
+			current(vpi, 2) = vzi + vmi * hzij;
+
+			float vmj = +vwj * negExij;
+			current(vpj, 0) = vxj + vmj * hxij;
+			current(vpj, 1) = vyj + vmj * hyij;
+			current(vpj, 2) = vzj + vmj * hzij;
+
+		} while (rIt < rEnd);
+	}
+
+	__syncthreads();
+}
+
+template <typename CurrentT>
+__device__ void constrainMotion(CurrentT& current, float alpha)
+{
+	if (!gFrameData.mStartMotionConstraints)
+		return;
+
+	ProfileDetailZone zone(cloth::CuProfileZoneIds::MOTION);
+
+	// negative because of fused multiply-add optimization
+	float negativeScale = -gClothData.mMotionConstraintScale;
+	float negativeBias = -gClothData.mMotionConstraintBias;
+
+	const float4* startIt = reinterpret_cast<const float4*>(gFrameData.mStartMotionConstraints);
+	const float4* targetIt = reinterpret_cast<const float4*>(gFrameData.mTargetMotionConstraints);
+
+	for (int32_t i = threadIdx.x; i < gClothData.mNumParticles; i += blockDim.x)
+	{
+		float4 startPos = __ldg(startIt + i);
+		float4 targetPos = __ldg(targetIt + i);
+
+		float sphereX = startPos.x + (targetPos.x - startPos.x) * alpha;
+		float sphereY = startPos.y + (targetPos.y - startPos.y) * alpha;
+		float sphereZ = startPos.z + (targetPos.z - startPos.z) * alpha;
+		float sphereW = startPos.w + (targetPos.w - startPos.w) * alpha;
+
+		float dx = sphereX - current(i, 0);
+		float dy = sphereY - current(i, 1);
+		float dz = sphereZ - current(i, 2);
+
+		float sqrLength = FLT_EPSILON + dx * dx + dy * dy + dz * dz;
+		float negativeRadius = min(0.0f, sphereW * negativeScale + negativeBias);
+
+		float slack = max(negativeRadius * rsqrtf(sqrLength) + 1.0f, 0.0f) * gFrameData.mMotionConstraintStiffness;
+
+		current(i, 0) += slack * dx;
+		current(i, 1) += slack * dy;
+		current(i, 2) += slack * dz;
+
+		// set invMass to zero if radius is zero
+		if (negativeRadius >= 0.0f)
+			current(i, 3) = 0.0f;
+	}
+}
+
+template <typename T>
+__device__ void constrainSeparation(T& current, float alpha)
+{
+	if (!gFrameData.mStartSeparationConstraints)
+		return;
+
+	ProfileDetailZone zone(cloth::CuProfileZoneIds::SEPARATION);
+
+	const float4* startIt = reinterpret_cast<const float4*>(gFrameData.mStartSeparationConstraints);
+	const float4* targetIt = reinterpret_cast<const float4*>(gFrameData.mTargetSeparationConstraints);
+
+	for (int32_t i = threadIdx.x; i < gClothData.mNumParticles; i += blockDim.x)
+	{
+		float4 startPos = __ldg(startIt + i);
+		float4 targetPos = __ldg(targetIt + i);
+
+		float sphereX = startPos.x + (targetPos.x - startPos.x) * alpha;
+		float sphereY = startPos.y + (targetPos.y - startPos.y) * alpha;
+		float sphereZ = startPos.z + (targetPos.z - startPos.z) * alpha;
+		float sphereW = startPos.w + (targetPos.w - startPos.w) * alpha;
+
+		float dx = sphereX - current(i, 0);
+		float dy = sphereY - current(i, 1);
+		float dz = sphereZ - current(i, 2);
+
+		float sqrLength = FLT_EPSILON + dx * dx + dy * dy + dz * dz;
+
+		float slack = min(0.0f, 1.0f - sphereW * rsqrtf(sqrLength));
+
+		current(i, 0) += slack * dx;
+		current(i, 1) += slack * dy;
+		current(i, 2) += slack * dz;
+	}
+}
+
+template <typename CurrentT, typename PreviousT>
+__device__ void updateSleepState(const CurrentT& current, const PreviousT& previous)
+{
+	ProfileDetailZone zone(cloth::CuProfileZoneIds::SLEEP);
+
+	if (!threadIdx.x)
+		gFrameData.mSleepTestCounter += max(1, uint32_t(gFrameData.mIterDt * 1000));
+
+	__syncthreads();
+
+	if (gFrameData.mSleepTestCounter < gClothData.mSleepTestInterval)
+		return;
+
+	float maxDelta = 0.0f;
+	for (int32_t i = threadIdx.x; i < gClothData.mNumParticles; i += blockDim.x)
+	{
+		float4 prev = previous(i);
+		maxDelta = max(fabsf(current(i, 0) - prev.x), maxDelta);
+		maxDelta = max(fabsf(current(i, 1) - prev.y), maxDelta);
+		maxDelta = max(fabsf(current(i, 2) - prev.z), maxDelta);
+	}
+
+	if (!threadIdx.x)
+	{
+		++gFrameData.mSleepPassCounter;
+		gFrameData.mSleepTestCounter -= gClothData.mSleepTestInterval;
+	}
+
+	__syncthreads();
+
+	if (maxDelta > gClothData.mSleepThreshold * gFrameData.mIterDt)
+		gFrameData.mSleepPassCounter = 0;
+}
+
+template <typename CurrentT, typename PreviousT>
+__device__ void simulateCloth(CurrentT& current, PreviousT& previous)
+{
+	// apply exponent to phase configs
+	assert(blockDim.x >= gClothData.mNumPhases);
+	if (threadIdx.x < gClothData.mNumPhases)
+	{
+		float exponent = gFrameData.mStiffnessExponent;
+		float* ptr = gSharedMemory + threadIdx.x * gCuPhaseConfigSize;
+		ptr[0] = 1.0f - exp2f(ptr[0] * exponent);
+		ptr[1] = 1.0f - exp2f(ptr[1] * exponent);
+	}
+
+	uint32_t numIterations = gFrameData.mNumIterations;
+	float invNumIterations = __fdividef(1.0f, numIterations);
+
+	const cloth::CuIterationData* iterData = gFrameData.mIterationData;
+	const cloth::CuIterationData* iterEnd = iterData + numIterations;
+
+	loadIterData(iterData);
+
+	__syncthreads();
+
+	for (float alpha = invNumIterations; iterData++ != iterEnd; alpha += invNumIterations)
+	{
+		integrateParticles(current, previous);
+		accelerateParticles(current);
+		applyWind(current, previous);
+		constrainMotion(current, alpha);
+		constrainTether(current);
+		solveFabric(current);
+		loadIterData(iterData);
+		constrainSeparation(current, alpha);
+		gCollideParticles.get()(current, previous, alpha);
+		gSelfCollideParticles.get()(current);
+		updateSleepState(current, previous);
+	}
+
+	__syncthreads();
+}
+
+__device__ void simulateShared()
+{
+	ProfileZone zone(cloth::CuProfileZoneIds::SIMULATE_SHARED);
+
+	__shared__ uninitialized<SharedParticleData> current;
+	__shared__ uninitialized<SharedParticleData> previous;
+
+	int32_t configDataSize = gClothData.mNumPhases * gCuPhaseConfigSize;
+	int32_t particlesDataSize = 4 * gClothData.mNumParticles;
+
+	Pointer<Shared, float> sharedCurPos =
+	    Pointer<Shared, float>(gSharedMemory + configDataSize + threadIdx.x % 4 * gClothData.mNumParticles);
+	Pointer<Shared, float> sharedPrevPos = sharedCurPos + particlesDataSize;
+
+	if (threadIdx.x < 4)
+	{
+		current.get().mPointers[threadIdx.x] = sharedCurPos;
+		previous.get().mPointers[threadIdx.x] = sharedPrevPos;
+	}
+
+	float* globalCurPos = gClothData.mParticles;
+	float* globalPrevPos = gClothData.mParticles + particlesDataSize;
+
+	// copy particles from device memory to shared memory and transpose
+	for (int32_t i = threadIdx.x; i < particlesDataSize; i += blockDim.x)
+	{
+		sharedCurPos[i / 4] = globalCurPos[i];
+		sharedPrevPos[i / 4] = globalPrevPos[i];
+	}
+
+	simulateCloth(current.get(), previous.get());
+
+	// copy particles from shared memory to device memory and transpose
+	for (int32_t i = threadIdx.x; i < particlesDataSize; i += blockDim.x)
+	{
+		globalCurPos[i] = sharedCurPos[i / 4];
+		globalPrevPos[i] = sharedPrevPos[i / 4];
+	}
+
+	__syncthreads();
+}
+
+__device__ void simulateStreamed()
+{
+	ProfileZone zone(cloth::CuProfileZoneIds::SIMULATE_STREAMED);
+
+	__shared__ uninitialized<SharedParticleData> current;
+	__shared__ uninitialized<GlobalParticleData> previous;
+
+	int32_t configDataSize = gClothData.mNumPhases * gCuPhaseConfigSize;
+	int32_t particlesDataSize = 4 * gClothData.mNumParticles;
+
+	float* globalCurPos = gClothData.mParticles;
+	Pointer<Shared, float> sharedCurPos =
+	    Pointer<Shared, float>(gSharedMemory + configDataSize + threadIdx.x % 4 * gClothData.mNumParticles);
+
+	if (threadIdx.x < 4)
+		current.get().mPointers[threadIdx.x] = sharedCurPos;
+	if (!threadIdx.x)
+		previous.get().mPtr = GlobalParticleData::PointerType(globalCurPos + particlesDataSize);
+
+	// copy particles from device memory to shared memory and transpose
+	for (int32_t i = threadIdx.x; i < particlesDataSize; i += blockDim.x)
+		sharedCurPos[i / 4] = globalCurPos[i];
+
+	simulateCloth(current.get(), previous.get());
+
+	// copy particles from shared memory to device memory and transpose
+	for (int32_t i = threadIdx.x; i < particlesDataSize; i += blockDim.x)
+		globalCurPos[i] = sharedCurPos[i / 4];
+
+	__syncthreads();
+}
+
+__device__ void simulateGlobal()
+{
+	ProfileZone zone(cloth::CuProfileZoneIds::SIMULATE_GLOBAL);
+
+	__shared__ uninitialized<GlobalParticleData> current;
+	__shared__ uninitialized<GlobalParticleData> previous;
+
+	if (!threadIdx.x)
+	{
+		GlobalParticleData::PointerType globalCurPos(gClothData.mParticles);
+		current.get().mPtr = globalCurPos;
+		previous.get().mPtr = globalCurPos + gClothData.mNumParticles;
+	}
+
+	simulateCloth(current.get(), previous.get());
+}
+
+} // anonymous namespace
+
+extern "C" __global__ void
+#if __CUDA_ARCH__ >= 300
+__launch_bounds__(1024, 1)
+#else
+__launch_bounds__(512, 1)
+#endif
+simulateCloths(cloth::CuKernelData kernelData)
+{
+	gProfileBuffer = kernelData.mProfileBuffer;
+	gProfileBaseId = kernelData.mProfileBaseId;
+
+	ProfileZone zone(cloth::CuProfileZoneIds::SIMULATE);
+
+	// check that http://nvbugs/1038473 is fixed
+	assert(gSharedMemory > (float*)&gFrameData);
+	assert(gSharedMemory > (float*)&gClothData);
+
+	// fetch cloth index from queue
+	__shared__ uint32_t clothIdx;
+	if (!threadIdx.x)
+		clothIdx = atomicInc(kernelData.mClothIndex, gridDim.x - 1);
+	__syncthreads();
+	assert(clothIdx < gridDim.x);
+
+	// copy cloth data to shared memory
+	const uint32_t* clothData = reinterpret_cast<const uint32_t*>(kernelData.mClothData + clothIdx);
+	if (threadIdx.x < gCuClothDataSize)
+		reinterpret_cast<uint32_t*>(&gClothData)[threadIdx.x] = clothData[threadIdx.x];
+
+	// copy frame data to shared memory
+	uint32_t* frameData = reinterpret_cast<uint32_t*>(kernelData.mFrameData + clothIdx);
+	if (threadIdx.x < gCuFrameDataSize)
+		reinterpret_cast<uint32_t*>(&gFrameData)[threadIdx.x] = frameData[threadIdx.x];
+
+	__syncthreads();
+
+	if (gFrameData.mSleepPassCounter >= gClothData.mSleepAfterCount)
+		return; // cloth is sleeping, exit
+
+	// copy phase configs to shared memory
+	int32_t configDataSize = gClothData.mNumPhases * gCuPhaseConfigSize;
+	for (int32_t i = threadIdx.x; i < configDataSize; i += blockDim.x)
+		gSharedUnsigned[i] = reinterpret_cast<const uint32_t*>(gClothData.mPhaseConfigs)[i];
+
+	Pointer<Shared, uint32_t> scratchPtr = Pointer<Shared, uint32_t>(
+	    gSharedUnsigned + configDataSize + 4 * gFrameData.mNumSharedPositions * gClothData.mNumParticles);
+
+	// initialize with placement new
+	new (gCollideParticles.data) CuCollision(scratchPtr);
+	new (gSelfCollideParticles.data) CuSelfCollision();
+
+	// copy particles and constraints to device
+	if (gFrameData.mDeviceParticlesDirty)
+	{
+		for (int32_t i = threadIdx.x; i < gClothData.mNumParticles * 8; i += blockDim.x)
+			gClothData.mParticles[i] = gClothData.mParticlesHostCopy[i];
+	}
+	if (gFrameData.mHostMotionConstraints)
+	{
+		for (int32_t i = threadIdx.x; i < gClothData.mNumParticles * 4; i += blockDim.x)
+			gFrameData.mTargetMotionConstraints[i] = gFrameData.mHostMotionConstraints[i];
+	}
+	if (gFrameData.mHostSeparationConstraints)
+	{
+		for (int32_t i = threadIdx.x; i < gClothData.mNumParticles * 4; i += blockDim.x)
+			gFrameData.mTargetSeparationConstraints[i] = gFrameData.mHostSeparationConstraints[i];
+	}
+	if (gFrameData.mHostParticleAccelerations)
+	{
+		for (int32_t i = threadIdx.x; i < gClothData.mNumParticles * 4; i += blockDim.x)
+			gFrameData.mParticleAccelerations[i] = gFrameData.mHostParticleAccelerations[i];
+	}
+
+	// necessary to ensure phase configs are fully loaded before setup in simulateCloth()
+	__syncthreads();
+
+	switch (gFrameData.mNumSharedPositions)
+	{
+	case 0:
+		simulateGlobal();
+		break;
+	case 1:
+		simulateStreamed();
+		break;
+	case 2:
+		simulateShared();
+		break;
+	}
+
+	// write back frame data
+	if (threadIdx.x < gCuFrameDataSize)
+		frameData[threadIdx.x] = reinterpret_cast<const uint32_t*>(&gFrameData)[threadIdx.x];
+
+	// copy particles to host
+	for (int32_t i = threadIdx.x; i < gClothData.mNumParticles * 8; i += blockDim.x)
+		gClothData.mParticlesHostCopy[i] = gClothData.mParticles[i];
+}
+
+const char* cloth::getKernelFunctionName()
+{
+	return "simulateCloths";
+}
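
The kernel is exported as a plain `extern "C"` symbol, and `cloth::getKernelFunctionName()` returns its name, which suggests the host side resolves it by string. Below is a hedged host-side sketch of how a launch could look with the CUDA driver API; the module/stream handling, grid/block configuration, and shared-memory sizing are assumptions for illustration — only the entry-point name comes from the file above.

```cuda
#include <cuda.h>

// 'kernelData' is assumed to point at a cloth::CuKernelData instance that the
// caller has already populated; cuLaunchKernel passes the struct by value.
void launchSolver(CUmodule module, CUstream stream, void* kernelData,
                  int numCloths, int blockSize, size_t sharedBytes)
{
	CUfunction kernel;
	cuModuleGetFunction(&kernel, module, "simulateCloths"); // cloth::getKernelFunctionName()

	// one block per cloth; blocks pull cloth indices from a queue (see the
	// atomicInc at the top of simulateCloths), so scheduling order is free
	void* args[] = { kernelData };
	cuLaunchKernel(kernel, numCloths, 1, 1,      // grid
	               blockSize, 1, 1,              // block (solver asserts blockDim.x >= numPhases)
	               (unsigned int)sharedBytes,    // dynamic shared memory backing gSharedMemory[]
	               stream, args, nullptr);
}
```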
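The `atomicInc` at the top of `simulateCloths` implements a small block-level work queue: each block takes the next cloth index from a global counter rather than using `blockIdx`, and `atomicInc`'s wrap-around semantics reset the counter to zero once every block has drawn an index. A minimal standalone sketch of the same pattern (names are illustrative):

```cuda
__global__ void workQueueDemo(unsigned int* counter, const float* jobs, float* results)
{
	__shared__ unsigned int jobIdx;
	if (threadIdx.x == 0)
		jobIdx = atomicInc(counter, gridDim.x - 1); // wraps back to 0 after gridDim.x - 1
	__syncthreads();

	// all threads of the block now cooperate on job 'jobIdx'
	if (threadIdx.x == 0)
		results[jobIdx] = jobs[jobIdx] * 2.0f;
}
```

Because `atomicInc(p, gridDim.x - 1)` returns 0 again after the last block has fetched, the counter is automatically ready for the next launch, which is presumably why no separate reset pass is needed.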
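The `Convert`/`load`/`store` wrappers exist because, per the comment in the file, state-space-qualified loads are faster than generic ones on SM50: a generic pointer is converted once with `cvta.to.shared`, then all accesses use `ld.shared`/`st.shared`. A stripped-down standalone demo of the same trick, assuming a 64-bit build (hence the `"l"` constraint and `.u64`, matching the `POINTER_CONSTRAINT`/`POINTER_TYPE` macros):

```cuda
__global__ void cvtaDemo(float* out)
{
	__shared__ float buf[32];
	float* p = &buf[threadIdx.x & 31];

	// generic -> shared-window address, as in Convert<Shared>::to()
	asm("cvta.to.shared.u64 %0, %0;" : "+l"(p));

	// state-space-qualified store and load, as in store<Shared>/load<Shared>;
	// 'p' must only be used through shared-space instructions from here on
	asm("st.shared.f32 [%0], %1;" :: "l"(p), "f"(float(threadIdx.x)) : "memory");
	__syncthreads();

	float v;
	asm("ld.shared.f32 %0, [%1];" : "=f"(v) : "l"(p));
	out[threadIdx.x] = v;
}
```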
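`simulateShared()` and `simulateStreamed()` copy particles into shared memory "and transpose": global memory holds xyzw-interleaved `float4` particles, while shared memory holds four separate component arrays, each accessed through a base pointer offset by `threadIdx.x % 4`. A standalone sketch of that copy, under the assumption (implicit in the original) that `blockDim.x` is a multiple of 4:

```cuda
__global__ void transposeDemo(const float* particles /* xyzw xyzw ... */, int n)
{
	extern __shared__ float s[];             // 4*n floats: x[n], y[n], z[n], w[n]
	float* lane = s + (threadIdx.x % 4) * n; // this thread's component array

	// thread t visits i = t, t + blockDim.x, ...; since blockDim.x % 4 == 0,
	// i % 4 == t % 4, so each thread only ever writes its own component array
	for (int i = threadIdx.x; i < 4 * n; i += blockDim.x)
		lane[i / 4] = particles[i];
	__syncthreads();
	// component-wise access afterwards: x = s[i], y = s[n+i], z = s[2n+i], w = s[3n+i]
}
```

The payoff is that the solver can then read a single component of many particles with stride-1 shared-memory accesses, which is what the `Pointer<Shared, float>` stride-1 / `Pointer<Global, float>` stride-4 split encodes.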
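The inner loop of `solveFabric` is a Gauss-Seidel-style projection of distance constraints: each edge (vpi, vpj) with rest value rij is pulled toward its rest length, with the correction split between the endpoints in proportion to their inverse masses (stored in the particle's w component). The following is a simplified scalar re-derivation as a plain CUDA kernel, not NvCloth code; the edge-list layout and names are assumptions, and it omits the stiffness multiplier and compression/stretch limits:

```cuda
__global__ void projectEdges(float4* pos, const int2* edges, const float* rest,
                             float stiffness, int numEdges)
{
	int e = blockIdx.x * blockDim.x + threadIdx.x;
	if (e >= numEdges)
		return;

	float4 pi = pos[edges[e].x];
	float4 pj = pos[edges[e].y];

	float hx = pj.x - pi.x, hy = pj.y - pi.y, hz = pj.z - pi.z;
	float e2 = 1e-7f + hx * hx + hy * hy + hz * hz;

	// negEr < 0 for a stretched edge, > 0 for a compressed one
	float negEr = -1.0f + rest[e] * rsqrtf(e2);
	float negEx = negEr * stiffness / (1e-7f + pi.w + pj.w);

	// each endpoint moves proportionally to its inverse mass (the w component)
	pos[edges[e].x] = make_float4(pi.x - pi.w * negEx * hx, pi.y - pi.w * negEx * hy,
	                              pi.z - pi.w * negEx * hz, pi.w);
	pos[edges[e].y] = make_float4(pj.x + pj.w * negEx * hx, pj.y + pj.w * negEx * hy,
	                              pj.z + pj.w * negEx * hz, pj.w);
}
```

Note that this naive version races when two edges share a particle; that is exactly why the real solver batches constraints into phases (`CuPhaseConfig`) and separates them with `__syncthreads()` so no two constraints in flight touch the same particle.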
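Both `simulateCloth` and the per-constraint path in `solveFabric` turn a stored stiffness value into a per-iteration one via `1.0f - exp2f(value * gFrameData.mStiffnessExponent)`. A plausible reading (an assumption, not confirmed by this file alone) is that the stored value is log2(1 - kTotal) for a per-frame stiffness kTotal and the exponent is 1/n for n iterations, so that applying the per-iteration stiffness n times reproduces kTotal: kIter = 1 - exp2(log2(1 - kTotal) / n) = 1 - (1 - kTotal)^(1/n). A tiny host-side check of that identity:

```cuda
#include <cmath>
#include <cstdio>

int main()
{
	const float kTotal = 0.8f;                       // per-frame stiffness (assumed semantics)
	const int n = 4;                                 // solver iterations per frame
	const float stored = std::log2(1.0f - kTotal);   // what a phase config would store
	const float exponent = 1.0f / n;                 // gFrameData.mStiffnessExponent analogue
	const float kIter = 1.0f - std::exp2(stored * exponent);

	float residual = 1.0f;                           // error remaining after n iterations
	for (int i = 0; i < n; ++i)
		residual *= 1.0f - kIter;
	std::printf("kIter = %f, total = %f (expect %f)\n", kIter, 1.0f - residual, kTotal);
	return 0;
}
```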