// This code contains NVIDIA Confidential Information and is disclosed to you // under a form of NVIDIA software license agreement provided separately to you. // // Notice // NVIDIA Corporation and its licensors retain all intellectual property and // proprietary rights in and to this software and related documentation and // any modifications thereto. Any use, reproduction, disclosure, or // distribution of this software and related documentation without an express // license agreement from NVIDIA Corporation is strictly prohibited. // // ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES // NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO // THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, // MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. // // Information and code furnished is believed to be accurate and reliable. // However, NVIDIA Corporation assumes no responsibility for the consequences of use of such // information or for any infringement of patents or other rights of third parties that may // result from its use. No license is granted by implication or otherwise under any patent // or patent rights of NVIDIA Corporation. Details are subject to change without notice. // This code supersedes and replaces all information previously supplied. // NVIDIA Corporation products are not authorized for use as critical // components in life support devices or systems without express written approval of // NVIDIA Corporation. // // Copyright (c) 2008-2020 NVIDIA Corporation. All rights reserved. // Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. // Copyright (c) 2001-2004 NovodeX AG. All rights reserved. #include "CuSolverKernel.h" #include "CuClothData.h" #include "CuPhaseConfig.h" #include // placement new /* For detailed comments about the algorithm check SwSolverKernel.cpp (or the documentation) The CPU implementation is generally easier to read, and comments are not duplicated in other implementations. Only CUDA implementation specific comments are left in this implementation. */ #ifndef FLT_EPSILON #define FLT_EPSILON 1.192092896e-07F #endif #ifndef FLT_MAX #define FLT_MAX 3.402823466e+38F #endif // Converting pointers to shared/global addresses is faster than doing generic loads on SM50 #define CONVERT_ADDRESSES (__CUDA_ARCH__ >= 500) #if !defined(_WIN64) && !defined(__x86_64__) #define POINTER_CONSTRAINT "r" #define POINTER_TYPE "u32" #else #define POINTER_CONSTRAINT "l" #define POINTER_TYPE "u64" #endif #ifndef __CUDA_ARCH__ #define assert(x) #endif extern "C" { extern _CRTIMP __host__ __device__ int __cdecl printf(const char*, ...); } using namespace nv; // shared memory copy (instead of relying on constant cache) __shared__ cloth::CuClothData gClothData; __shared__ cloth::CuFrameData gFrameData; __shared__ cloth::CuIterationData gIterData; // Our way to create stream local variables __shared__ void* gProfileBuffer; __shared__ uint32_t gProfileBaseId; static const uint32_t gCuClothDataSize = sizeof(cloth::CuClothData) / sizeof(float); static const uint32_t gCuFrameDataSize = sizeof(cloth::CuFrameData) / sizeof(float); static const uint32_t gCuIterationDataSize = sizeof(cloth::CuIterationData) / sizeof(float); static const uint32_t gCuPhaseConfigSize = sizeof(cloth::CuPhaseConfig) / sizeof(float); /* Memory block for all temporary data in shared memory (in 'allocation' order). The numbers indicate the allocation slot if used a stack allocator. 0) simulate*()::configs (numPhases*sizeof(CuPhaseConfig)) 1) simulate*()::particles ({0,1,2}*4*numParticles floats) 2) CuCollision::mCapsuleIndices, mCapsuleMasks, mConvexMasks (numCapsules*4+numConvexes ints) 3) CuCollision::mPrevData (4*numSpheres+10*numCones floats) 4) CuCollision::collideConvexes() (4*numPlanes floats) 4) CuCollision::collideTriangles() (19*numTriangles floats) 4) CuCollision::mCurData::Spheres (4*numSpheres floats) 5) computeParticleBounds()::dst (192 floats written, 208 float read) 5) computeSphereBounds()::dst (192 floats written, 208 floats read) 5) CuCollision::mCurData::Cones (10*numCones floats) 6) CuCollision::mShapeGrid (2*6*sGridSize=96 floats) 4) CuSelfCollision::buildAcceleration()::buffer (34*16=544 ints) */ extern __shared__ float gSharedMemory[]; extern __shared__ int32_t gSharedSigned[]; extern __shared__ uint32_t gSharedUnsigned[]; /***************** Pointer Wrappers **********************/ enum AddressSpace { Shared, Global }; template __device__ T load(const T* ptr); template __device__ void store(T* ptr, const T& value); #if !CONVERT_ADDRESSES template __device__ T load(const T* ptr) { return *ptr; } template __device__ void store(T* ptr, const T& value) { *ptr = value; } #else template <> __device__ float load(const float* ptr) { float value; asm("ld.shared.f32 %0, [%1];" : "=f"(value) : POINTER_CONSTRAINT(ptr)); return value; } template <> __device__ int32_t load(const int32_t* ptr) { int32_t value; asm("ld.shared.s32 %0, [%1];" : "=r"(value) : POINTER_CONSTRAINT(ptr)); return value; } template <> __device__ uint32_t load(const uint32_t* ptr) { uint32_t value; asm("ld.shared.u32 %0, [%1];" : "=r"(value) : POINTER_CONSTRAINT(ptr)); return value; } template <> __device__ void store(int32_t* ptr, const int32_t& value) { asm("st.shared.s32 [%0], %1;" : : POINTER_CONSTRAINT(ptr), "r"(value) : "memory"); } template <> __device__ void store(float* ptr, const float& value) { asm("st.shared.f32 [%0], %1;" : : POINTER_CONSTRAINT(ptr), "f"(value) : "memory"); } template <> __device__ void store(uint32_t* ptr, const uint32_t& value) { asm("st.shared.u32 [%0], %1;" : : POINTER_CONSTRAINT(ptr), "r"(value) : "memory"); } template <> __device__ float load(const float* ptr) { float value; asm("ld.global.f32 %0, [%1];" : "=f"(value) : POINTER_CONSTRAINT(ptr)); return value; } template <> __device__ int32_t load(const int32_t* ptr) { int32_t value; asm("ld.global.s32 %0, [%1];" : "=r"(value) : POINTER_CONSTRAINT(ptr)); return value; } template <> __device__ uint32_t load(const uint32_t* ptr) { uint32_t value; asm("ld.global.u32 %0, [%1];" : "=r"(value) : POINTER_CONSTRAINT(ptr)); return value; } template <> __device__ void store(int32_t* ptr, const int32_t& value) { asm("st.global.s32 [%0], %1;" : : POINTER_CONSTRAINT(ptr), "r"(value) : "memory"); } template <> __device__ void store(float* ptr, const float& value) { asm("st.global.f32 [%0], %1;" : : POINTER_CONSTRAINT(ptr), "f"(value) : "memory"); } template <> __device__ void store(uint32_t* ptr, const uint32_t& value) { asm("st.global.u32 [%0], %1;" : : POINTER_CONSTRAINT(ptr), "r"(value) : "memory"); } #endif template struct Pointer; template struct Reference { template friend struct Reference; friend struct Pointer; __device__ Reference() { } __device__ Reference(const Reference& other) : mPtr(other.mPtr) { } template __device__ Reference(const Reference& other) : mPtr(other.mPtr) { } __device__ Reference& operator = (const Reference& other) { return *this = static_cast(other); } template __device__ Reference& operator = (const Reference& other) { return *this = static_cast(other); } __device__ Reference& operator += (const T& value) { return *this = *this + value; } __device__ Reference& operator |= (const T& value) { return *this = *this | value; } __device__ Reference& operator &= (const T& value) { return *this = *this & value; } __device__ Reference& operator *= (const T& value) { return *this = *this * value; } __device__ operator T() const { return load(mPtr); } __device__ Reference& operator = (const T& value) { store(mPtr, value); return *this; } //private: T* mPtr; __device__ explicit Reference(T& ref) : mPtr(&ref) { } template friend __device__ void atomicAdd(Reference& ref, U value) { ::atomicAdd(ref.mPtr, value); } }; template struct Convert { static __device__ T* from(T* ptr) { return ptr; } static __device__ T* to(T* ptr) { return ptr; } }; #if CONVERT_ADDRESSES template struct Convert { static __device__ T* from(T* ptr) { asm("cvta.shared." POINTER_TYPE " %0, %0;" : "+" POINTER_CONSTRAINT(ptr)); return ptr; } static __device__ T* to(T* ptr) { asm("cvta.to.shared." POINTER_TYPE " %0, %0;" : "+" POINTER_CONSTRAINT(ptr)); return ptr; } }; template struct Convert { static __device__ T* from(T* ptr) { asm("cvta.global." POINTER_TYPE " %0, %0;" : "+" POINTER_CONSTRAINT(ptr)); return ptr; } static __device__ T* to(T* ptr) { asm("cvta.to.global." POINTER_TYPE " %0, %0;" : "+" POINTER_CONSTRAINT(ptr)); return ptr; } }; #endif template __device__ T* generic(const Pointer&); // pointer forced to point to shared memory (only works for sizeof(T) <= 4) template struct Pointer { template friend struct Pointer; friend __device__ T* generic(const Pointer&); friend struct GlobalParticleData; __device__ Pointer() { } __device__ Pointer(const Pointer& other) : mPtr(other.mPtr) { } template __device__ Pointer(const Pointer& other) : mPtr(other.mPtr) { } // construct from generic pointer __device__ explicit Pointer(T* ptr) : mPtr(Convert::to(ptr)) { } __device__ bool operator!=(const Pointer& other) const { return mPtr != other.mPtr; } __device__ bool operator<(const Pointer& other) const { return mPtr < other.mPtr; } __device__ Pointer operator + (ptrdiff_t i) const { return Pointer(*this) += i; } __device__ Pointer& operator += (ptrdiff_t i) { mPtr += i * stride(); return *this; } __device__ Pointer operator - (ptrdiff_t i) const { return Pointer(*this) -= i; } __device__ Pointer& operator -= (ptrdiff_t i) { mPtr -= i * stride(); return *this; } __device__ Pointer& operator ++ () { mPtr += stride(); return *this; } __device__ Pointer& operator -- () { mPtr -= stride(); return *this; } __device__ Reference operator*() const { return Reference(*mPtr); } __device__ Reference operator[](int32_t i) const { return Reference(mPtr[i * stride()]); } private: // convert back to generic pointer, private for safety, use generic() instead __device__ operator T*() const { return Convert::from(mPtr); } __device__ static size_t stride() { return 1; } template __device__ Pointer(const Pointer& other, ptrdiff_t stridedOffset) : mPtr(other.mPtr + stridedOffset) { } T* mPtr; }; // pointers to global memory are all referring to particle data // stored as array of structs, so they have a stride of 4. template<> __device__ size_t Pointer::stride() { return 4; } template<> __device__ size_t Pointer::stride() { return 4; } template __device__ T* generic(const Pointer& ptr) { return ptr; } #if !CONVERT_ADDRESSES template __device__ T* generic(T* ptr) { return ptr; } #endif /***************** Particle Data **********************/ template struct SharedParticleReference { __device__ operator float3() const { float3 result; result.x = mReferences[0]; result.y = mReferences[1]; result.z = mReferences[2]; return result; } __device__ SharedParticleReference& operator = (const float3& vec) { mReferences[0] = vec.x; mReferences[1] = vec.y; mReferences[2] = vec.z; return *this; } __device__ operator float4() const { float4 result; result.x = mReferences[0]; result.y = mReferences[1]; result.z = mReferences[2]; result.w = mReferences[3]; return result; } __device__ SharedParticleReference& operator = (const float4& vec) { mReferences[0] = vec.x; mReferences[1] = vec.y; mReferences[2] = vec.z; mReferences[3] = vec.w; return *this; } Reference mReferences[4]; }; struct SharedParticleData { typedef float3 VectorType; typedef Pointer PointerType; typedef Pointer ConstPointerType; typedef Reference ReferenceType; typedef Reference ConstReferenceType; typedef SharedParticleReference ParticleReferenceType; typedef SharedParticleReference ParticleConstReferenceType; __device__ ReferenceType operator()(int32_t index, int32_t element) { return mPointers[element][index]; } __device__ ConstReferenceType operator()(int32_t index, int32_t element) const { return mPointers[element][index]; } __device__ ParticleReferenceType operator()(int32_t index) { ParticleReferenceType result = { mPointers[0][index], mPointers[1][index], mPointers[2][index], mPointers[3][index] }; return result; } __device__ ParticleConstReferenceType operator()(int32_t index) const { ParticleConstReferenceType result = { mPointers[0][index], mPointers[1][index], mPointers[2][index], mPointers[3][index] }; return result; } __device__ const PointerType& operator[](int32_t element) { return mPointers[element]; } __device__ ConstPointerType operator[](int32_t element) const { return mPointers[element]; } PointerType mPointers[4]; }; template struct GlobalParticleReference { __device__ GlobalParticleReference(Pointer ref) : mPtr(reinterpret_cast(ref)) { } #if CONVERT_ADDRESSES __device__ operator float4() const { float4 vec; asm("ld.global.v4.f32 {%0, %1, %2, %3}, [%4];" : "=f"(vec.x), "=f"(vec.y), "=f"(vec.z), "=f"(vec.w) : POINTER_CONSTRAINT(mPtr)); return vec; } __device__ GlobalParticleReference& operator = (const float4& vec) { asm("st.global.v4.f32 [%0], {%1, %2, %3, %4};" ::POINTER_CONSTRAINT(mPtr), "f"(vec.x), "f"(vec.y), "f"(vec.z), "f"(vec.w) : "memory"); return *this; } __device__ operator float3() const { float4 vec = *this; return make_float3(vec.x, vec.y, vec.z); } #else __device__ operator float4() const { return *reinterpret_cast(mPtr); } __device__ GlobalParticleReference& operator = (const float4& vec) { *reinterpret_cast(mPtr) = vec; return *this; } __device__ operator float3() const { return *reinterpret_cast(mPtr); } __device__ GlobalParticleReference& operator = (const float3& vec) { *reinterpret_cast(mPtr) = vec; return *this; } #endif T* mPtr; // pointer to global address }; struct GlobalParticleData { #if CONVERT_ADDRESSES // ld.global.v4 saturates memory bandwidth better than 3x ld.global typedef float4 VectorType; #else // the same isn't true for ld without state space typedef float3 VectorType; #endif typedef Pointer PointerType; typedef Pointer ConstPointerType; typedef Reference ReferenceType; typedef Reference ConstReferenceType; typedef GlobalParticleReference ParticleReferenceType; typedef GlobalParticleReference ParticleConstReferenceType; __device__ ReferenceType operator()(int32_t index, int32_t element) { return *PointerType(mPtr, index * 4 + element); } __device__ ConstReferenceType operator()(int32_t index, int32_t element) const { return *ConstPointerType(mPtr, index * 4 + element); } __device__ ParticleReferenceType operator()(int32_t index) { return PointerType(mPtr, index * 4); } __device__ ParticleConstReferenceType operator()(int32_t index) const { return ConstPointerType(mPtr, index * 4); } __device__ PointerType operator[](int32_t element) { return PointerType(mPtr, element); } __device__ ConstPointerType operator[](int32_t element) const { return ConstPointerType(mPtr, element); } PointerType mPtr; }; /***************** Profiling **********************/ struct ProfileDisabledZone { __device__ ProfileDisabledZone(cloth::CuProfileZoneIds::Enum) { } }; #if defined(__CUDA_ARCH__) && defined(PX_PROFILE) // profile zones enabled for profile build // code below is copied from GPUProfile.h and needs to be kept in sync. #define NUM_WARPS_PER_PROFILE_BUFFER (4 * 1024 * 1024) struct __align__(16) WarpProfileEvent { __device__ WarpProfileEvent(uint16_t id) : block(blockIdx.x + gridDim.x * blockIdx.y), warp(threadIdx.x >> 5), userData(0), eventId(id) { uint32_t smid32, warpid32; asm volatile("mov.u32 %0, %smid;" : "=r"(smid32)); asm volatile("mov.u32 %0, %warpid;" : "=r"(warpid32)); asm volatile("mov.u32 %0, %clock;" : "=r"(startTime)); smid = smid32; warpid = warpid32; endTime = startTime; } uint16_t block; uint8_t warp; uint8_t smid; uint8_t warpid; uint8_t userData; uint16_t eventId; uint32_t startTime; uint32_t endTime; }; struct ProfileZone { __device__ ProfileZone(cloth::CuProfileZoneIds::Enum id) : mEvent(0) { if (!gProfileBuffer || threadIdx.x & 0x1f) return; // +1: first entry reserved for counter uint32_t index = atomicAdd(reinterpret_cast(gProfileBuffer), 1) + 1; if (index >= NUM_WARPS_PER_PROFILE_BUFFER) return; mEvent = reinterpret_cast(gProfileBuffer) + index; new (mEvent) WarpProfileEvent(gProfileBaseId + id); } __device__ ~ProfileZone() { if (mEvent) mEvent->endTime = clock(); } WarpProfileEvent* mEvent; }; #else typedef ProfileDisabledZone ProfileZone; #endif #if 1 // set to 1 to enable detailed profile zones typedef ProfileZone ProfileDetailZone; #else typedef ProfileDisabledZone ProfileDetailZone; #endif namespace { // cut down version of thrust::uninitialized // avoids warning about non-empty c'tor template struct uninitialized { __device__ inline T& get() { return *reinterpret_cast(data); } // maximum alignment required by device code is 16 __align__(16) unsigned char data[sizeof(T)]; }; } #if __CUDA_ARCH__ < 320 namespace { template __device__ T __ldg(const T* __restrict ptr) { return *ptr; } } #endif #define CU_SOLVER_KERNEL_CU #include "CuCollision.h" #include "CuSelfCollision.h" namespace { __device__ void loadIterData(const cloth::CuIterationData* __restrict iterData) { if (threadIdx.x < gCuIterationDataSize) { gIterData.mIntegrationTrafo[threadIdx.x] = __ldg(iterData->mIntegrationTrafo + threadIdx.x); } } // integrate particle positions and store transposed template __device__ void integrateParticles(CurrentT& current, PreviousT& previous) { ProfileDetailZone zone(cloth::CuProfileZoneIds::INTEGRATE); const float* __restrict trafo = gIterData.mIntegrationTrafo; for (int32_t i = threadIdx.x; i < gClothData.mNumParticles; i += blockDim.x) { float4 prev = previous(i); float4 next = current(i); float4 cur = { next.x, next.y, next.z, prev.w }; if (next.w == 0.0f) next.w = prev.w; if (next.w > 0.0f) { if (IsTurning) { next.x = next.x + trafo[3] + cur.x * trafo[15] + prev.x * trafo[6] + cur.y * trafo[16] + prev.y * trafo[7] + cur.z * trafo[17] + prev.z * trafo[8]; next.y = next.y + trafo[4] + cur.x * trafo[18] + prev.x * trafo[9] + cur.y * trafo[19] + prev.y * trafo[10] + cur.z * trafo[20] + prev.z * trafo[11]; next.z = next.z + trafo[5] + cur.x * trafo[21] + prev.x * trafo[12] + cur.y * trafo[22] + prev.y * trafo[13] + cur.z * trafo[23] + prev.z * trafo[14]; } else { next.x += (cur.x - prev.x) * trafo[6] + trafo[3]; next.y += (cur.y - prev.y) * trafo[9] + trafo[4]; next.z += (cur.z - prev.z) * trafo[12] + trafo[5]; } cur.x += trafo[0]; cur.y += trafo[1]; cur.z += trafo[2]; } current(i) = next; previous(i) = cur; } } template __device__ void integrateParticles(CurrentT& current, PreviousT& previous) { if (gIterData.mIsTurning) integrateParticles(current, previous); else integrateParticles(current, previous); } template __device__ void accelerateParticles(CurrentT& current) { // might be better to move this into integrate particles const float* __restrict accelerations = gFrameData.mParticleAccelerations; if (!accelerations) return; ProfileDetailZone zone(cloth::CuProfileZoneIds::ACCELERATE); __syncthreads(); // looping with 4 instead of 1 thread per particle float sqrIterDt = ~threadIdx.x & 0x3 ? gFrameData.mIterDt * gFrameData.mIterDt : 0.0f; typename CurrentT::PointerType sharedCurPos = current[threadIdx.x % 4]; for (int32_t i = threadIdx.x; i < gClothData.mNumParticles * 4; i += blockDim.x) { // turning this into __ldg slows kernel down even without particle accelerations (!) if (current(i / 4, 3) > 0.0f) sharedCurPos[i / 4] += accelerations[i] * sqrIterDt; } __syncthreads(); } __device__ float3 operator + (const float3& u, const float3& v) { return make_float3(u.x + v.x, u.y + v.y, u.z + v.z); } __device__ float3 operator - (const float3& u, const float3& v) { return make_float3(u.x - v.x, u.y - v.y, u.z - v.z); } __device__ float3 operator*(float s, const float3& v) { return make_float3(v.x * s, v.y * s, v.z * s); } __device__ float dot3(const float3& u, const float3& v) { return u.x * v.x + u.y * v.y + u.z * v.z; } __device__ float3 cross3(const float3& u, const float3& v) { return make_float3(u.y * v.z - u.z * v.y, u.z * v.x - u.x * v.z, u.x * v.y - u.y * v.x); } __device__ void applyImpulse(SharedParticleData::ParticleReferenceType pos, const float3& impulse) { float scale = -pos.mReferences[3]; #if CONVERT_ADDRESSES // Use this instead of atomicAdd function to work around compiler issue treating the pointer as global memory instead of shared memory asm("red.shared.add.f32 [%0], %1;" ::POINTER_CONSTRAINT(pos.mReferences[0].mPtr), "f"(impulse.x * scale)); asm("red.shared.add.f32 [%0], %1;" ::POINTER_CONSTRAINT(pos.mReferences[1].mPtr), "f"(impulse.y * scale)); asm("red.shared.add.f32 [%0], %1;" ::POINTER_CONSTRAINT(pos.mReferences[2].mPtr), "f"(impulse.z * scale)); #else atomicAdd(pos.mReferences[0].mPtr, impulse.x * scale); atomicAdd(pos.mReferences[1].mPtr, impulse.y * scale); atomicAdd(pos.mReferences[2].mPtr, impulse.z * scale); #endif } __device__ void applyImpulse(GlobalParticleData::ParticleReferenceType pos, const float3& impulse) { float scale = -pos.mPtr[3]; atomicAdd(pos.mPtr + 0, impulse.x * scale); atomicAdd(pos.mPtr + 1, impulse.y * scale); atomicAdd(pos.mPtr + 2, impulse.z * scale); } template __device__ void applyWind(CurrentT& current, PreviousT& previous) { const float dragCoefficient = gFrameData.mDragCoefficient; const float liftCoefficient = gFrameData.mLiftCoefficient; const float fluidDensity = gFrameData.mFluidDensity; const float itrDt = gFrameData.mIterDt; if (dragCoefficient == 0.0f && liftCoefficient == 0.0f) return; ProfileDetailZone zone(cloth::CuProfileZoneIds::WIND); const float oneThird = 1 / 3.0f; float3 wind = make_float3(gIterData.mWind[0], gIterData.mWind[1], gIterData.mWind[2]); const uint16_t* tIt = gClothData.mTriangles; for (int32_t i = threadIdx.x; i < gClothData.mNumTriangles; i += blockDim.x) { uint16_t i0 = tIt[i * 3 + 0]; uint16_t i1 = tIt[i * 3 + 1]; uint16_t i2 = tIt[i * 3 + 2]; float3 c0 = current(i0); float3 c1 = current(i1); float3 c2 = current(i2); // float w1 = current(i0, 3); // float w2 = current(i1, 3); // float w2 = current(i2, 3); // // float wMult = w1 * w2 * w3; // float invMass = wMult < FLT_EPSILON ? 0.f : w1 * w2 * w3 / (w1 * w2 + w1 * w3 + w2 * w3); float3 p0 = previous(i0); float3 p1 = previous(i1); float3 p2 = previous(i2); float3 cur = oneThird * (c0 + c1 + c2); float3 prev = oneThird * (p0 + p1 + p2); float3 delta = cur - prev + wind; if (IsTurning) { const float3* rot = reinterpret_cast(gFrameData.mRotation); float3 d = wind - prev; delta = cur + d.x * rot[0] + d.y * rot[1] + d.z * rot[2]; } float3 normal = cross3(c2 - c0, c1 - c0); const float doubleArea = sqrtf(dot3(normal, normal)); normal = (1.0f / doubleArea) * normal; float invSqrScale = dot3(delta, delta); float scale = rsqrtf(invSqrScale); float deltaLength = sqrtf(invSqrScale); float cosTheta = dot3(normal, delta) * scale; float sinTheta = sqrtf(max(0.0f, 1.0f - cosTheta * cosTheta)); float3 liftDir = cross3(cross3(delta, normal), scale * delta); float3 lift = liftCoefficient * cosTheta * sinTheta * ((deltaLength / itrDt) * liftDir); float3 drag = dragCoefficient * abs(cosTheta) * ((deltaLength / itrDt) * delta); float3 impulse = invSqrScale < FLT_EPSILON ? make_float3(0.0f, 0.0f, 0.0f) : fluidDensity * doubleArea * (lift + drag); applyImpulse(current(i0), impulse); applyImpulse(current(i1), impulse); applyImpulse(current(i2), impulse); } __syncthreads(); } template __device__ void applyWind(CurrentT& current, PreviousT& previous) { if (gIterData.mIsTurning) applyWind(current, previous); else applyWind(current, previous); } template __device__ void constrainTether(CurrentT& current) { if (0.0f == gFrameData.mTetherConstraintStiffness || !gClothData.mNumTethers) return; ProfileDetailZone zone(cloth::CuProfileZoneIds::TETHER); int32_t numParticles = gClothData.mNumParticles; int32_t numTethers = gClothData.mNumTethers; assert(0 == numTethers % numParticles); float stiffness = numParticles * __fdividef(gFrameData.mTetherConstraintStiffness, numTethers); float scale = gClothData.mTetherConstraintScale; const uint32_t* __restrict tIt = reinterpret_cast(gClothData.mTethers); for (int32_t i = threadIdx.x; i < numParticles; i += blockDim.x) { float posX = current(i, 0); float posY = current(i, 1); float posZ = current(i, 2); float offsetX = 0.0f; float offsetY = 0.0f; float offsetZ = 0.0f; for (int32_t j = i; j < numTethers; j += gClothData.mNumParticles) { uint32_t tether = __ldg(tIt + j); int32_t anchor = tether & 0xffff; float deltaX = current(anchor, 0) - posX; float deltaY = current(anchor, 1) - posY; float deltaZ = current(anchor, 2) - posZ; float sqrLength = FLT_EPSILON + deltaX * deltaX + deltaY * deltaY + deltaZ * deltaZ; float radius = (tether >> 16) * scale; float slack = 1.0f - radius * rsqrtf(sqrLength); if (slack > 0.0f) { offsetX += deltaX * slack; offsetY += deltaY * slack; offsetZ += deltaZ * slack; } } current(i, 0) = posX + offsetX * stiffness; current(i, 1) = posY + offsetY * stiffness; current(i, 2) = posZ + offsetZ * stiffness; } } template __device__ void solveFabric(CurrentT& current) { ProfileDetailZone zone(cloth::CuProfileZoneIds::FABRIC); const cloth::CuPhaseConfig* __restrict cIt = (cloth::CuPhaseConfig*)gSharedMemory; const cloth::CuPhaseConfig* cEnd = cIt + gClothData.mNumPhases; for (; cIt != cEnd; ++cIt) { __syncthreads(); ProfileDetailZone zone(cloth::CuProfileZoneIds::CONSTRAINT_SET); int32_t numConstraints = cIt->mNumConstraints; if (threadIdx.x >= numConstraints) continue; const uint32_t* __restrict iIt = reinterpret_cast(cIt->mIndices) + threadIdx.x; const float* restvalues = cIt->mRestvalues; const float* rIt = restvalues + threadIdx.x; const float* rEnd = restvalues + numConstraints; const float* stIt = cIt->mStiffnessValues + threadIdx.x; bool useStiffnessPerConstraint = cIt->mStiffnessValues!=nullptr; uint32_t vpijPrefetch = __ldg(iIt); float rijPrefetch = __ldg(rIt); float stijPrefetch; if (useStiffnessPerConstraint) stijPrefetch = __ldg(stIt); float stiffness = cIt->mStiffness; float stiffnessMultiplier = cIt->mStiffnessMultiplier; float compressionLimit = cIt->mCompressionLimit; float stretchLimit = cIt->mStretchLimit; do { rIt += blockDim.x; iIt += blockDim.x; stIt += blockDim.x; int32_t vpi = USHRT_MAX & vpijPrefetch; int32_t vpj = USHRT_MAX & vpijPrefetch >> 16; float rij = rijPrefetch; float stij = useStiffnessPerConstraint?1.0f - exp2f(stijPrefetch * gFrameData.mStiffnessExponent):stiffness; if (rIt < rEnd) { vpijPrefetch = __ldg(iIt); rijPrefetch = __ldg(rIt); if (useStiffnessPerConstraint) stijPrefetch = __ldg(stIt); } float vxi = current(vpi, 0); float vyi = current(vpi, 1); float vzi = current(vpi, 2); float vwi = current(vpi, 3); float vxj = current(vpj, 0); float vyj = current(vpj, 1); float vzj = current(vpj, 2); float vwj = current(vpj, 3); float hxij = vxj - vxi; float hyij = vyj - vyi; float hzij = vzj - vzi; float e2ij = FLT_EPSILON + hxij * hxij + hyij * hyij + hzij * hzij; float negErij = rij > FLT_EPSILON ? -1.0f + rij * rsqrtf(e2ij) : 0.0f; negErij = negErij + stiffnessMultiplier * max(compressionLimit, min(-negErij, stretchLimit)); float negExij = __fdividef(negErij * stij, FLT_EPSILON + vwi + vwj); float vmi = -vwi * negExij; current(vpi, 0) = vxi + vmi * hxij; current(vpi, 1) = vyi + vmi * hyij; current(vpi, 2) = vzi + vmi * hzij; float vmj = +vwj * negExij; current(vpj, 0) = vxj + vmj * hxij; current(vpj, 1) = vyj + vmj * hyij; current(vpj, 2) = vzj + vmj * hzij; } while (rIt < rEnd); } __syncthreads(); } template __device__ void constrainMotion(CurrentT& current, float alpha) { if (!gFrameData.mStartMotionConstraints) return; ProfileDetailZone zone(cloth::CuProfileZoneIds::MOTION); // negative because of fused multiply-add optimization float negativeScale = -gClothData.mMotionConstraintScale; float negativeBias = -gClothData.mMotionConstraintBias; const float4* startIt = reinterpret_cast(gFrameData.mStartMotionConstraints); const float4* targetIt = reinterpret_cast(gFrameData.mTargetMotionConstraints); for (int32_t i = threadIdx.x; i < gClothData.mNumParticles; i += blockDim.x) { float4 startPos = __ldg(startIt + i); float4 targetPos = __ldg(targetIt + i); float sphereX = startPos.x + (targetPos.x - startPos.x) * alpha; float sphereY = startPos.y + (targetPos.y - startPos.y) * alpha; float sphereZ = startPos.z + (targetPos.z - startPos.z) * alpha; float sphereW = startPos.w + (targetPos.w - startPos.w) * alpha; float dx = sphereX - current(i, 0); float dy = sphereY - current(i, 1); float dz = sphereZ - current(i, 2); float sqrLength = FLT_EPSILON + dx * dx + dy * dy + dz * dz; float negativeRadius = min(0.0f, sphereW * negativeScale + negativeBias); float slack = max(negativeRadius * rsqrtf(sqrLength) + 1.0f, 0.0f) * gFrameData.mMotionConstraintStiffness; current(i, 0) += slack * dx; current(i, 1) += slack * dy; current(i, 2) += slack * dz; // set invMass to zero if radius is zero if (negativeRadius >= 0.0f) current(i, 3) = 0.0f; } } template __device__ void constrainSeparation(T& current, float alpha) { if (!gFrameData.mStartSeparationConstraints) return; ProfileDetailZone zone(cloth::CuProfileZoneIds::SEPARATION); const float4* startIt = reinterpret_cast(gFrameData.mStartSeparationConstraints); const float4* targetIt = reinterpret_cast(gFrameData.mTargetSeparationConstraints); for (int32_t i = threadIdx.x; i < gClothData.mNumParticles; i += blockDim.x) { float4 startPos = __ldg(startIt + i); float4 targetPos = __ldg(targetIt + i); float sphereX = startPos.x + (targetPos.x - startPos.x) * alpha; float sphereY = startPos.y + (targetPos.y - startPos.y) * alpha; float sphereZ = startPos.z + (targetPos.z - startPos.z) * alpha; float sphereW = startPos.w + (targetPos.w - startPos.w) * alpha; float dx = sphereX - current(i, 0); float dy = sphereY - current(i, 1); float dz = sphereZ - current(i, 2); float sqrLength = FLT_EPSILON + dx * dx + dy * dy + dz * dz; float slack = min(0.0f, 1.0f - sphereW * rsqrtf(sqrLength)); current(i, 0) += slack * dx; current(i, 1) += slack * dy; current(i, 2) += slack * dz; } } template __device__ void updateSleepState(const CurrentT& current, const PreviousT& previous) { ProfileDetailZone zone(cloth::CuProfileZoneIds::SLEEP); if (!threadIdx.x) gFrameData.mSleepTestCounter += max(1, uint32_t(gFrameData.mIterDt * 1000)); __syncthreads(); if (gFrameData.mSleepTestCounter < gClothData.mSleepTestInterval) return; float maxDelta = 0.0f; for (int32_t i = threadIdx.x; i < gClothData.mNumParticles; i += blockDim.x) { float4 prev = previous(i); maxDelta = max(fabsf(current(i, 0) - prev.x), maxDelta); maxDelta = max(fabsf(current(i, 1) - prev.y), maxDelta); maxDelta = max(fabsf(current(i, 2) - prev.z), maxDelta); } if (!threadIdx.x) { ++gFrameData.mSleepPassCounter; gFrameData.mSleepTestCounter -= gClothData.mSleepTestInterval; } __syncthreads(); if (maxDelta > gClothData.mSleepThreshold * gFrameData.mIterDt) gFrameData.mSleepPassCounter = 0; } template __device__ void simulateCloth(CurrentT& current, PreviousT& previous) { // apply exponent to phase configs assert(blockDim.x >= gClothData.mNumPhases); if (threadIdx.x < gClothData.mNumPhases) { float exponent = gFrameData.mStiffnessExponent; float* ptr = gSharedMemory + threadIdx.x * gCuPhaseConfigSize; ptr[0] = 1.0f - exp2f(ptr[0] * exponent); ptr[1] = 1.0f - exp2f(ptr[1] * exponent); } uint32_t numIterations = gFrameData.mNumIterations; float invNumIterations = __fdividef(1.0f, numIterations); const cloth::CuIterationData* iterData = gFrameData.mIterationData; const cloth::CuIterationData* iterEnd = iterData + numIterations; loadIterData(iterData); __syncthreads(); for (float alpha = invNumIterations; iterData++ != iterEnd; alpha += invNumIterations) { integrateParticles(current, previous); accelerateParticles(current); applyWind(current, previous); constrainMotion(current, alpha); constrainTether(current); solveFabric(current); loadIterData(iterData); constrainSeparation(current, alpha); gCollideParticles.get()(current, previous, alpha); gSelfCollideParticles.get()(current); updateSleepState(current, previous); } __syncthreads(); } __device__ void simulateShared() { ProfileZone zone(cloth::CuProfileZoneIds::SIMULATE_SHARED); __shared__ uninitialized current; __shared__ uninitialized previous; int32_t configDataSize = gClothData.mNumPhases * gCuPhaseConfigSize; int32_t particlesDataSize = 4 * gClothData.mNumParticles; Pointer sharedCurPos = Pointer(gSharedMemory + configDataSize + threadIdx.x % 4 * gClothData.mNumParticles); Pointer sharedPrevPos = sharedCurPos + particlesDataSize; if (threadIdx.x < 4) { current.get().mPointers[threadIdx.x] = sharedCurPos; previous.get().mPointers[threadIdx.x] = sharedPrevPos; } float* globalCurPos = gClothData.mParticles; float* globalPrevPos = gClothData.mParticles + particlesDataSize; // copy particles from device memory to shared memory and transpose for (int32_t i = threadIdx.x; i < particlesDataSize; i += blockDim.x) { sharedCurPos[i / 4] = globalCurPos[i]; sharedPrevPos[i / 4] = globalPrevPos[i]; } simulateCloth(current.get(), previous.get()); // copy particles from shared memory to device memory and transpose for (int32_t i = threadIdx.x; i < particlesDataSize; i += blockDim.x) { globalCurPos[i] = sharedCurPos[i / 4]; globalPrevPos[i] = sharedPrevPos[i / 4]; } __syncthreads(); } __device__ void simulateStreamed() { ProfileZone zone(cloth::CuProfileZoneIds::SIMULATE_STREAMED); __shared__ uninitialized current; __shared__ uninitialized previous; int32_t configDataSize = gClothData.mNumPhases * gCuPhaseConfigSize; int32_t particlesDataSize = 4 * gClothData.mNumParticles; float* globalCurPos = gClothData.mParticles; Pointer sharedCurPos = Pointer(gSharedMemory + configDataSize + threadIdx.x % 4 * gClothData.mNumParticles); if (threadIdx.x < 4) current.get().mPointers[threadIdx.x] = sharedCurPos; if (!threadIdx.x) previous.get().mPtr = GlobalParticleData::PointerType(globalCurPos + particlesDataSize); // copy particles from device memory to shared memory and transpose for (int32_t i = threadIdx.x; i < particlesDataSize; i += blockDim.x) sharedCurPos[i / 4] = globalCurPos[i]; simulateCloth(current.get(), previous.get()); // copy particles from shared memory to device memory and transpose for (int32_t i = threadIdx.x; i < particlesDataSize; i += blockDim.x) globalCurPos[i] = sharedCurPos[i / 4]; __syncthreads(); } __device__ void simulateGlobal() { ProfileZone zone(cloth::CuProfileZoneIds::SIMULATE_GLOBAL); __shared__ uninitialized current; __shared__ uninitialized previous; if (!threadIdx.x) { GlobalParticleData::PointerType globalCurPos(gClothData.mParticles); current.get().mPtr = globalCurPos; previous.get().mPtr = globalCurPos + gClothData.mNumParticles; } simulateCloth(current.get(), previous.get()); } } // anonymous namespace extern "C" __global__ void #if __CUDA_ARCH__ >= 300 __launch_bounds__(1024, 1) #else __launch_bounds__(512, 1) #endif simulateCloths(cloth::CuKernelData kernelData) { gProfileBuffer = kernelData.mProfileBuffer; gProfileBaseId = kernelData.mProfileBaseId; ProfileZone zone(cloth::CuProfileZoneIds::SIMULATE); // check that http://nvbugs/1038473 is fixed assert(gSharedMemory > (float*)&gFrameData); assert(gSharedMemory > (float*)&gClothData); // fetch cloth index from queue __shared__ uint32_t clothIdx; if (!threadIdx.x) clothIdx = atomicInc(kernelData.mClothIndex, gridDim.x - 1); __syncthreads(); assert(clothIdx < gridDim.x); // copy cloth data to shared memory const uint32_t* clothData = reinterpret_cast(kernelData.mClothData + clothIdx); if (threadIdx.x < gCuClothDataSize) reinterpret_cast(&gClothData)[threadIdx.x] = clothData[threadIdx.x]; // copy frame data to shared memory uint32_t* frameData = reinterpret_cast(kernelData.mFrameData + clothIdx); if (threadIdx.x < gCuFrameDataSize) reinterpret_cast(&gFrameData)[threadIdx.x] = frameData[threadIdx.x]; __syncthreads(); if (gFrameData.mSleepPassCounter >= gClothData.mSleepAfterCount) return; // cloth is sleeping, exit // copy phase configs to shared memory int32_t configDataSize = gClothData.mNumPhases * gCuPhaseConfigSize; for (int32_t i = threadIdx.x; i < configDataSize; i += blockDim.x) gSharedUnsigned[i] = reinterpret_cast(gClothData.mPhaseConfigs)[i]; Pointer scratchPtr = Pointer( gSharedUnsigned + configDataSize + 4 * gFrameData.mNumSharedPositions * gClothData.mNumParticles); // initialize with placement new new (gCollideParticles.data) CuCollision(scratchPtr); new (gSelfCollideParticles.data) CuSelfCollision(); // copy particles and constraints to device if (gFrameData.mDeviceParticlesDirty) { for (int32_t i = threadIdx.x; i < gClothData.mNumParticles * 8; i += blockDim.x) gClothData.mParticles[i] = gClothData.mParticlesHostCopy[i]; } if (gFrameData.mHostMotionConstraints) { for (int32_t i = threadIdx.x; i < gClothData.mNumParticles * 4; i += blockDim.x) gFrameData.mTargetMotionConstraints[i] = gFrameData.mHostMotionConstraints[i]; } if (gFrameData.mHostSeparationConstraints) { for (int32_t i = threadIdx.x; i < gClothData.mNumParticles * 4; i += blockDim.x) gFrameData.mTargetSeparationConstraints[i] = gFrameData.mHostSeparationConstraints[i]; } if (gFrameData.mHostParticleAccelerations) { for (int32_t i = threadIdx.x; i < gClothData.mNumParticles * 4; i += blockDim.x) gFrameData.mParticleAccelerations[i] = gFrameData.mHostParticleAccelerations[i]; } // necessary to ensure phase configs are fully loaded before setup in simulateCloth() __syncthreads(); switch(gFrameData.mNumSharedPositions) { case 0: simulateGlobal(); break; case 1: simulateStreamed(); break; case 2: simulateShared(); break; } // write back frame data if (threadIdx.x < gCuFrameDataSize) frameData[threadIdx.x] = reinterpret_cast(&gFrameData)[threadIdx.x]; // copy particles to host for (int32_t i = threadIdx.x; i < gClothData.mNumParticles * 8; i += blockDim.x) gClothData.mParticlesHostCopy[i] = gClothData.mParticles[i]; } const char* cloth::getKernelFunctionName() { return "simulateCloths"; }