diff options
Diffstat (limited to 'PhysX_3.4/Source/LowLevelCloth/src/windows')
22 files changed, 5851 insertions, 0 deletions
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/windows/ClothClone.h b/PhysX_3.4/Source/LowLevelCloth/src/windows/ClothClone.h new file mode 100644 index 00000000..4f02de76 --- /dev/null +++ b/PhysX_3.4/Source/LowLevelCloth/src/windows/ClothClone.h @@ -0,0 +1,225 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. 
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#pragma once + +#include "foundation/PxMemory.h" + +#include "SwFactory.h" +#include "SwFabric.h" +#include "SwCloth.h" + +#include "ClothImpl.h" +#include "ClothBase.h" + +namespace physx +{ +namespace cloth +{ +class DxFactory; +class CuFactory; + +// make range from vector +template <typename T, typename A> +Range<T> makeRange(shdfnd::Array<T, A>& vec) +{ + T* ptr = vec.empty() ? 0 : vec.begin(); + return Range<T>(ptr, ptr + vec.size()); +} + +template <typename T, typename A> +Range<const T> makeRange(const shdfnd::Array<T, A>& vec) +{ + const T* ptr = vec.empty() ? 0 : vec.begin(); + return Range<const T>(ptr, ptr + vec.size()); +} + +// fabric conversion +template <typename SrcClothType, typename DstFactoryType> +typename DstFactoryType::FabricType* convertFabric(const SrcClothType& srcFabric, DstFactoryType& dstFactory) +{ + typedef typename DstFactoryType::FabricType DstFabricType; + + // see if dstFactory already has a Fabric with this id + DstFabricType* const* fIt = dstFactory.mFabrics.begin(); + DstFabricType* const* fEnd = dstFactory.mFabrics.end(); + for(; fIt != fEnd; ++fIt) + if((*fIt)->mId == srcFabric.mId) + return *fIt; // found id, return existing fabric + + // fabric does not exist so create a new one + Vector<uint32_t>::Type phases(srcFabric.getNumPhases()); + Vector<uint32_t>::Type sets(srcFabric.getNumSets()); + Vector<float>::Type restvalues(srcFabric.getNumRestvalues()); + Vector<uint32_t>::Type indices(srcFabric.getNumIndices()); + Vector<uint32_t>::Type anchors(srcFabric.getNumTethers()); + Vector<float>::Type tetherLengths(srcFabric.getNumTethers()); + Vector<uint32_t>::Type triangles(srcFabric.getNumTriangles() * 3); + + Range<uint32_t> phaseRange = makeRange(phases); + Range<float> restvalueRange = makeRange(restvalues); + Range<uint32_t> setRange = makeRange(sets); + Range<uint32_t> indexRange = makeRange(indices); + Range<uint32_t> anchorRange = makeRange(anchors); + 
Range<float> lengthRange = makeRange(tetherLengths); + Range<uint32_t> triangleRange = makeRange(triangles); + + srcFabric.mFactory.extractFabricData(srcFabric, phaseRange, setRange, restvalueRange, indexRange, anchorRange, + lengthRange, triangleRange); + + DstFabricType* dstFabric = + static_cast<DstFabricType*>(dstFactory.createFabric(srcFabric.mNumParticles, phaseRange, setRange, restvalueRange, + indexRange, anchorRange, lengthRange, triangleRange)); + + // give new fabric the same id as the source so it can be matched + dstFabric->mId = srcFabric.mId; + + return dstFabric; +} + +inline Range<const PhaseConfig> getPhaseConfigs(const SwCloth& cloth) +{ + return makeRange(cloth.mPhaseConfigs); +} +inline void setPhaseConfigs(SwCloth& cloth, Range<const PhaseConfig> phaseConfigs) +{ + cloth.mPhaseConfigs.assign(phaseConfigs.begin(), phaseConfigs.end()); +} +inline Range<const PxVec4> getParticleAccelerations(const SwCloth& cloth) +{ + return makeRange(cloth.mParticleAccelerations); +} +inline Range<const uint32_t> getSelfCollisionIndices(const SwCloth& cloth) +{ + return makeRange(cloth.mSelfCollisionIndices); +} + +// cloth conversion +template <typename DstFactoryType, typename SrcImplType> +typename DstFactoryType::ImplType* convertCloth(DstFactoryType& dstFactory, const SrcImplType& srcImpl) +{ + typedef typename DstFactoryType::FabricType DstFabricType; + typedef typename DstFactoryType::ImplType DstImplType; + typedef typename DstImplType::ClothType DstClothType; + typedef typename SrcImplType::ClothType SrcClothType; + + const SrcClothType& srcCloth = srcImpl.mCloth; + const Factory& srcFactory = srcCloth.mFactory; + + typename DstClothType::ContextLockType dstLock(dstFactory); + typename SrcClothType::ContextLockType srcLock(srcCloth.mFactory); + + // particles + MappedRange<const PxVec4> curParticles = srcImpl.getCurrentParticles(); + + // fabric + DstFabricType& dstFabric = *convertFabric(srcCloth.mFabric, dstFactory); + + // create new cloth + 
DstImplType* dstImpl = static_cast<DstImplType*>(dstFactory.createCloth(curParticles, dstFabric)); + DstClothType& dstCloth = dstImpl->mCloth; + + // copy across common parameters + copy(dstCloth, srcCloth); + + // copy across previous particles + MappedRange<const PxVec4> prevParticles = srcImpl.getPreviousParticles(); + PxMemCopy(dstImpl->getPreviousParticles().begin(), prevParticles.begin(), prevParticles.size() * sizeof(PxVec4)); + + // copy across transformed phase configs + setPhaseConfigs(dstCloth, getPhaseConfigs(srcCloth)); + + // collision data + Vector<PxVec4>::Type spheres(srcImpl.getNumSpheres(), PxVec4(0.0f)); + PxVec4* spherePtr = spheres.empty() ? 0 : &spheres.front(); + Range<PxVec4> sphereRange(spherePtr, spherePtr + spheres.size()); + Vector<uint32_t>::Type capsules(srcImpl.getNumCapsules() * 2); + Range<uint32_t> capsuleRange = makeRange(capsules); + Vector<PxVec4>::Type planes(srcImpl.getNumPlanes(), PxVec4(0.0f)); + PxVec4* planePtr = planes.empty() ? 0 : &planes.front(); + Range<PxVec4> planeRange(planePtr, planePtr + planes.size()); + Vector<uint32_t>::Type convexes(srcImpl.getNumConvexes()); + Range<uint32_t> convexRange = makeRange(convexes); + Vector<PxVec3>::Type triangles(srcImpl.getNumTriangles() * 3, PxVec3(0.0f)); + PxVec3* trianglePtr = triangles.empty() ? 
0 : &triangles.front(); + Range<PxVec3> triangleRange(trianglePtr, trianglePtr + triangles.size()); + + srcFactory.extractCollisionData(srcImpl, sphereRange, capsuleRange, planeRange, convexRange, triangleRange); + dstImpl->setSpheres(sphereRange, 0, 0); + dstImpl->setCapsules(capsuleRange, 0, 0); + dstImpl->setPlanes(planeRange, 0, 0); + dstImpl->setConvexes(convexRange, 0, 0); + dstImpl->setTriangles(triangleRange, 0, 0); + + // motion constraints, copy directly into new cloth buffer + if(srcImpl.getNumMotionConstraints()) + srcFactory.extractMotionConstraints(srcImpl, dstImpl->getMotionConstraints()); + + // separation constraints, copy directly into new cloth buffer + if(srcImpl.getNumSeparationConstraints()) + srcFactory.extractSeparationConstraints(srcImpl, dstImpl->getSeparationConstraints()); + + // particle accelerations + if(srcImpl.getNumParticleAccelerations()) + { + Range<const PxVec4> accelerations = getParticleAccelerations(srcCloth); + PxMemCopy(dstImpl->getParticleAccelerations().begin(), accelerations.begin(), + accelerations.size() * sizeof(PxVec4)); + } + + // self-collision indices + dstImpl->setSelfCollisionIndices(getSelfCollisionIndices(srcCloth)); + + // rest positions + Vector<PxVec4>::Type restPositions(srcImpl.getNumRestPositions()); + srcFactory.extractRestPositions(srcImpl, makeRange(restPositions)); + dstImpl->setRestPositions(makeRange(restPositions)); + + // virtual particles + if(srcImpl.getNumVirtualParticles()) + { + Vector<Vec4u>::Type indices(srcImpl.getNumVirtualParticles()); + Vector<PxVec3>::Type weights(srcImpl.getNumVirtualParticleWeights(), PxVec3(0.0f)); + + uint32_t(*indicesPtr)[4] = indices.empty() ? 0 : &array(indices.front()); + Range<uint32_t[4]> indicesRange(indicesPtr, indicesPtr + indices.size()); + + PxVec3* weightsPtr = weights.empty() ? 
0 : &weights.front(); + Range<PxVec3> weightsRange(weightsPtr, weightsPtr + weights.size()); + + srcFactory.extractVirtualParticles(srcImpl, indicesRange, weightsRange); + + dstImpl->setVirtualParticles(indicesRange, weightsRange); + } + + return dstImpl; +} + +} // namespace cloth +} // namespace physx diff --git a/PhysX_3.4/Source/LowLevelCloth/src/windows/CuCheckSuccess.h b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuCheckSuccess.h new file mode 100644 index 00000000..b9ae0a53 --- /dev/null +++ b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuCheckSuccess.h @@ -0,0 +1,45 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. 
+// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#pragma once + +#include <cuda.h> +#include <driver_types.h> + +namespace physx +{ +namespace cloth +{ +// implemented in CuFactory.cpp +void checkSuccessImpl(CUresult, const char*, const int); +} + +// safe cuda calls +#define checkSuccess(err) cloth::checkSuccessImpl(err, __FILE__, __LINE__) +} diff --git a/PhysX_3.4/Source/LowLevelCloth/src/windows/CuCloth.cpp b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuCloth.cpp new file mode 100644 index 00000000..6ecd1aeb --- /dev/null +++ b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuCloth.cpp @@ -0,0 +1,511 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. 
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#include "CuCloth.h" +#include "CuFabric.h" +#include "CuFactory.h" +#include "CuContextLock.h" +#include "CuCheckSuccess.h" +#include "CuClothData.h" +#include "CuSolver.h" +#include "TripletScheduler.h" +#include "ClothBase.h" +#include "Array.h" +#include "PsFoundation.h" + +#if PX_VC +#pragma warning(disable : 4365) // 'action' : conversion from 'type_1' to 'type_2', signed/unsigned mismatch +#endif + +namespace physx +{ +namespace cloth +{ +PhaseConfig transform(const PhaseConfig&); // from PhaseConfig.cpp +} +} + +using namespace physx; + +namespace +{ +bool isSelfCollisionEnabled(const cloth::CuCloth& cloth) +{ + return PxMin(cloth.mSelfCollisionDistance, -cloth.mSelfCollisionLogStiffness) > 0.0f; +} +} + +cloth::CuCloth::CuCloth(CuFactory& factory, CuFabric& fabric, Range<const PxVec4> particles) +: CuContextLock(factory) +, mFactory(factory) +, mFabric(fabric) +, mClothDataDirty(false) +, mNumParticles(uint32_t(particles.size())) +, mParticles(mFactory.mContextManager) +, mParticlesHostCopy(CuHostAllocator(mFactory.mContextManager, cudaHostAllocMapped)) +, mDeviceParticlesDirty(false) +, mHostParticlesDirty(true) +, 
mPhaseConfigs(mFactory.mContextManager) +, mMotionConstraints(mFactory.mContextManager) +, mSeparationConstraints(mFactory.mContextManager) +, mParticleAccelerations(mFactory.mContextManager) +, mParticleAccelerationsHostCopy(CuHostAllocator(mFactory.mContextManager, cudaHostAllocMapped)) +, mCapsuleIndices(getMappedAllocator<IndexPair>(mFactory.mContextManager)) +, mStartCollisionSpheres(getMappedAllocator<PxVec4>(mFactory.mContextManager)) +, mTargetCollisionSpheres(getMappedAllocator<PxVec4>(mFactory.mContextManager)) +, mConvexMasks(getMappedAllocator<uint32_t>(mFactory.mContextManager)) +, mStartCollisionPlanes(getMappedAllocator<PxVec4>(mFactory.mContextManager)) +, mTargetCollisionPlanes(getMappedAllocator<PxVec4>(mFactory.mContextManager)) +, mStartCollisionTriangles(getMappedAllocator<PxVec3>(mFactory.mContextManager)) +, mTargetCollisionTriangles(getMappedAllocator<PxVec3>(mFactory.mContextManager)) +, mVirtualParticleSetSizes(mFactory.mContextManager) +, mVirtualParticleIndices(mFactory.mContextManager) +, mVirtualParticleWeights(mFactory.mContextManager) +, mRestPositions(mFactory.mContextManager) +, mSelfCollisionIndices(mFactory.mContextManager) +, mSelfCollisionData(mFactory.mContextManager) +, mSharedMemorySize(0) +, mUserData(0) +{ + PX_ASSERT(!particles.empty()); + + initialize(*this, particles.begin(), particles.end()); + + mParticles.reserve(2 * mNumParticles); + mParticles.push_back(particles.begin(), particles.end()); + mParticles.push_back(particles.begin(), particles.end()); + mParticlesHostCopy.resizeUninitialized(2 * mNumParticles); + + mFabric.incRefCount(); + + CuContextLock::release(); +} + +cloth::CuCloth::CuCloth(CuFactory& factory, const CuCloth& cloth) +: CuContextLock(factory) +, mFactory(factory) +, mFabric(cloth.mFabric) +, mNumParticles(cloth.mNumParticles) +, mParticles(cloth.mParticles) +, mParticlesHostCopy(cloth.mParticlesHostCopy) +, mDeviceParticlesDirty(cloth.mDeviceParticlesDirty) +, 
mHostParticlesDirty(cloth.mHostParticlesDirty) +, mPhaseConfigs(cloth.mPhaseConfigs) +, mHostPhaseConfigs(cloth.mHostPhaseConfigs) +, mMotionConstraints(cloth.mMotionConstraints) +, mSeparationConstraints(cloth.mSeparationConstraints) +, mParticleAccelerations(cloth.mParticleAccelerations) +, mParticleAccelerationsHostCopy(cloth.mParticleAccelerationsHostCopy) +, mCapsuleIndices(cloth.mCapsuleIndices) +, mStartCollisionSpheres(cloth.mStartCollisionSpheres) +, mTargetCollisionSpheres(cloth.mTargetCollisionSpheres) +, mStartCollisionPlanes(cloth.mStartCollisionPlanes) +, mTargetCollisionPlanes(cloth.mTargetCollisionPlanes) +, mStartCollisionTriangles(cloth.mStartCollisionTriangles) +, mTargetCollisionTriangles(cloth.mTargetCollisionTriangles) +, mVirtualParticleSetSizes(cloth.mVirtualParticleSetSizes) +, mVirtualParticleIndices(cloth.mVirtualParticleIndices) +, mVirtualParticleWeights(cloth.mVirtualParticleWeights) +, mRestPositions(cloth.mRestPositions) +, mSelfCollisionIndices(cloth.mSelfCollisionIndices) +, mSelfCollisionData(mFactory.mContextManager) +, mSharedMemorySize(cloth.mSharedMemorySize) +, mUserData(cloth.mUserData) +{ + copy(*this, cloth); + + mFabric.incRefCount(); + + CuContextLock::release(); +} + +cloth::CuCloth::~CuCloth() +{ + CuContextLock::acquire(); + + mFabric.decRefCount(); +} + +void cloth::CuCloth::notifyChanged() +{ + mClothDataDirty = true; +} + +bool cloth::CuCloth::updateClothData(CuClothData& clothData) +{ + // test particle pointer to detect when cloth data array has been reordered + if(!mClothDataDirty && clothData.mParticles == array(*mParticles.begin().get())) + { + PX_ASSERT(mSharedMemorySize == getSharedMemorySize()); + return false; + } + + mSharedMemorySize = getSharedMemorySize(); + + if(mSelfCollisionData.empty() && isSelfCollisionEnabled(*this)) + { + uint32_t numSelfCollisionIndices = + mSelfCollisionIndices.empty() ? 
mNumParticles : uint32_t(mSelfCollisionIndices.size()); + + uint32_t particleSize = 4 * mNumParticles; + uint32_t keySize = 2 * numSelfCollisionIndices; // 2x for radix buffer + uint32_t cellStartSize = (129 + 128 * 128 + 130) / 2 + 1; // half because type is int16_t + + // use 16bit indices for cellStart array (128x128 grid) + mSelfCollisionData.resize(particleSize + keySize + cellStartSize); + checkSuccess(cuMemsetD32((mSelfCollisionData.begin() + particleSize + keySize).dev(), 0xffffffff, cellStartSize)); + } + + clothData = CuClothData(*this); + mClothDataDirty = false; + + return true; +} + +uint32_t cloth::CuCloth::getSharedMemorySize() const +{ + uint32_t numPhases = uint32_t(mPhaseConfigs.size()); + uint32_t numSpheres = uint32_t(mStartCollisionSpheres.size()); + uint32_t numCones = uint32_t(mCapsuleIndices.size()); + uint32_t numPlanes = uint32_t(mStartCollisionPlanes.size()); + uint32_t numConvexes = uint32_t(mConvexMasks.size()); + uint32_t numTriangles = uint32_t(mStartCollisionTriangles.size() / 3); + + uint32_t phaseConfigSize = numPhases * sizeof(CuPhaseConfig); + + bool storePrevCollisionData = mEnableContinuousCollision || mFriction > 0.0f; + uint32_t continuousCollisionSize = storePrevCollisionData ? 4 * numSpheres + 10 * numCones : 0; + continuousCollisionSize += 4 * numCones + numConvexes; // capsule and convex masks + uint32_t discreteCollisionSize = 4 * numSpheres + PxMax(10 * numCones + 96, 208u); + discreteCollisionSize = PxMax(discreteCollisionSize, PxMax(4 * numPlanes, 19 * numTriangles)); + + // scratch memory for prefix sum and histogram + uint32_t selfCollisionSize = isSelfCollisionEnabled(*this) ? 
544 : 0; + + // see CuSolverKenel.cu::gSharedMemory comment for details + return phaseConfigSize + sizeof(float) * (continuousCollisionSize + PxMax(selfCollisionSize, discreteCollisionSize)); +} + +void cloth::CuCloth::setPhaseConfig(Range<const PhaseConfig> configs) +{ + mHostPhaseConfigs.assign(configs.begin(), configs.end()); + + Vector<CuPhaseConfig>::Type deviceConfigs; + deviceConfigs.reserve(configs.size()); + const PhaseConfig* cEnd = configs.end(); + for(const PhaseConfig* cIt = configs.begin(); cIt != cEnd; ++cIt) + { + CuPhaseConfig config; + + config.mStiffness = cIt->mStiffness; + config.mStiffnessMultiplier = cIt->mStiffnessMultiplier; + config.mCompressionLimit = cIt->mCompressionLimit; + config.mStretchLimit = cIt->mStretchLimit; + + uint16_t phaseIndex = cIt->mPhaseIndex; + config.mNumConstraints = mFabric.mNumConstraintsInPhase[phaseIndex]; + config.mRestvalues = mFabric.mRestvaluesInPhase[phaseIndex].get(); + config.mIndices = mFabric.mIndicesInPhase[phaseIndex].get(); + + deviceConfigs.pushBack(config); + } + + CuContextLock contextLock(mFactory); + mPhaseConfigs.assign(deviceConfigs.begin(), deviceConfigs.end()); +} + +cloth::Range<PxVec4> cloth::CuCloth::push(cloth::CuConstraints& constraints) +{ + if(!constraints.mTarget.capacity()) + { + CuContextLock contextLock(mFactory); + constraints.mTarget.reserve(mNumParticles); + } + if(constraints.mHostCopy.empty()) + constraints.mTarget.resize(mNumParticles); + + if(constraints.mStart.empty()) // initialize start first + constraints.mStart.swap(constraints.mTarget); + + if(!constraints.mHostCopy.capacity()) + { + CuContextLock contextLock(mFactory); + constraints.mHostCopy.reserve(mNumParticles); + } + constraints.mHostCopy.resizeUninitialized(mNumParticles); + + PxVec4* data = &constraints.mHostCopy.front(); + return Range<PxVec4>(data, data + constraints.mHostCopy.size()); +} + +void cloth::CuCloth::clear(cloth::CuConstraints& constraints) +{ + CuContextLock contextLock(mFactory); + 
CuDeviceVector<PxVec4>(mFactory.mContextManager).swap(constraints.mStart); + CuDeviceVector<PxVec4>(mFactory.mContextManager).swap(constraints.mTarget); +} + +void cloth::CuCloth::syncDeviceParticles() +{ + if(mDeviceParticlesDirty) + { + CuContextLock contextLock(mFactory); + checkSuccess( + cuMemcpyHtoD(mParticles.begin().dev(), mParticlesHostCopy.begin(), 2 * mNumParticles * sizeof(PxVec4))); + mDeviceParticlesDirty = false; + } +} + +void cloth::CuCloth::syncHostParticles() +{ + if(mHostParticlesDirty) + { + CuContextLock contextLock(mFactory); + const PxVec4* src = mParticles.begin().get(); + mFactory.copyToHost(src, src + 2 * mNumParticles, mParticlesHostCopy.begin()); + mHostParticlesDirty = false; + } +} + +cloth::Range<const PxVec3> cloth::CuCloth::clampTriangleCount(Range<const PxVec3> range, uint32_t replaceSize) +{ + // clamp to 500 triangles (1500 vertices) to prevent running out of shared memory + uint32_t removedSize = mStartCollisionTriangles.size() - replaceSize; + const PxVec3* clamp = range.begin() + 1500 - removedSize; + + if(range.end() > clamp) + { + shdfnd::getFoundation().error(PX_WARN, "Too many collision " + "triangles specified for cloth, dropping all but first 500.\n"); + } + + return Range<const PxVec3>(range.begin(), PxMin(range.end(), clamp)); +} + +#include "ClothImpl.h" + +namespace physx +{ +namespace cloth +{ + +// ClothImpl<CuCloth>::clone() implemented in CuClothClone.cpp + +template <> +uint32_t ClothImpl<CuCloth>::getNumParticles() const +{ + return mCloth.mNumParticles; +} + +template <> +void ClothImpl<CuCloth>::lockParticles() const +{ + const_cast<CuCloth&>(mCloth).syncHostParticles(); +} + +template <> +void ClothImpl<CuCloth>::unlockParticles() const +{ +} + +template <> +MappedRange<PxVec4> ClothImpl<CuCloth>::getCurrentParticles() +{ + mCloth.wakeUp(); + lockParticles(); + mCloth.mDeviceParticlesDirty = true; + return getMappedParticles(mCloth.mParticlesHostCopy.begin()); +} + +template <> +MappedRange<const PxVec4> 
ClothImpl<CuCloth>::getCurrentParticles() const +{ + lockParticles(); + return getMappedParticles(mCloth.mParticlesHostCopy.begin()); +} + +template <> +MappedRange<PxVec4> ClothImpl<CuCloth>::getPreviousParticles() +{ + mCloth.wakeUp(); + lockParticles(); + mCloth.mDeviceParticlesDirty = true; + return getMappedParticles(mCloth.mParticlesHostCopy.begin() + mCloth.mNumParticles); +} + +template <> +MappedRange<const PxVec4> ClothImpl<CuCloth>::getPreviousParticles() const +{ + lockParticles(); + return getMappedParticles(mCloth.mParticlesHostCopy.begin() + mCloth.mNumParticles); +} + +template <> +GpuParticles ClothImpl<CuCloth>::getGpuParticles() +{ + mCloth.syncDeviceParticles(); + mCloth.mHostParticlesDirty = true; + PxVec4* particles = mCloth.mParticles.begin().get(); + GpuParticles result = { particles, particles + mCloth.mNumParticles, 0 }; + return result; +} + +template <> +void ClothImpl<CuCloth>::setPhaseConfig(Range<const PhaseConfig> configs) +{ + Vector<PhaseConfig>::Type transformedConfigs; + transformedConfigs.reserve(configs.size()); + + // transform phase config to use in solver + for(; !configs.empty(); configs.popFront()) + if(configs.front().mStiffness > 0.0f) + transformedConfigs.pushBack(transform(configs.front())); + + mCloth.setPhaseConfig(Range<const PhaseConfig>(transformedConfigs.begin(), transformedConfigs.end())); + mCloth.notifyChanged(); + mCloth.wakeUp(); +} + +template <> +void ClothImpl<CuCloth>::setSelfCollisionIndices(Range<const uint32_t> indices) +{ + ContextLockType lock(mCloth.mFactory); + mCloth.mSelfCollisionIndices.assign(indices.begin(), indices.end()); + mCloth.mSelfCollisionIndicesHost.assign(indices.begin(), indices.end()); + mCloth.notifyChanged(); + mCloth.wakeUp(); +} + +template <> +uint32_t ClothImpl<CuCloth>::getNumVirtualParticles() const +{ + return uint32_t(mCloth.mVirtualParticleIndices.size()); +} + +template <> +Range<PxVec4> ClothImpl<CuCloth>::getParticleAccelerations() +{ + 
if(mCloth.mParticleAccelerations.empty()) + { + CuContextLock contextLock(mCloth.mFactory); + mCloth.mParticleAccelerations.resize(mCloth.mNumParticles); + } + + if(!mCloth.mParticleAccelerationsHostCopy.capacity()) + { + CuContextLock contextLock(mCloth.mFactory); + mCloth.mParticleAccelerationsHostCopy.reserve(mCloth.mNumParticles); + } + mCloth.mParticleAccelerationsHostCopy.resizeUninitialized(mCloth.mNumParticles); + + mCloth.wakeUp(); + + PxVec4* data = mCloth.mParticleAccelerationsHostCopy.begin(); + return Range<PxVec4>(data, mCloth.mParticleAccelerationsHostCopy.end()); +} + +template <> +void ClothImpl<CuCloth>::clearParticleAccelerations() +{ + CuContextLock contextLock(mCloth.mFactory); + CuDeviceVector<PxVec4>(mCloth.mFactory.mContextManager).swap(mCloth.mParticleAccelerations); + mCloth.mParticleAccelerationsHostCopy.reset(); + mCloth.wakeUp(); +} + +namespace +{ +uint32_t calculateNumReplays(const Vector<Vec4u>::Type& triplets, const Vector<uint32_t>::Type setSizes) +{ + uint32_t result = 0; + + Vector<Vec4u>::Type::ConstIterator tIt = triplets.begin(); + Vector<uint32_t>::Type::ConstIterator sIt, sEnd = setSizes.end(); + uint32_t index = 0; + for(sIt = setSizes.begin(); sIt != sEnd; ++sIt, ++index) + { + Vector<Vec4u>::Type::ConstIterator tEnd = tIt + *sIt, tLast = tIt; + while(tLast != tEnd) + { + uint8_t numConflicts[3][32] = {}; + uint8_t numReplays[3] = {}; + + for(tLast += PxMin(ptrdiff_t(32), tEnd - tLast); tIt != tLast; ++tIt) + for(int i = 0; i < 3; ++i) + numReplays[i] = PxMax(numReplays[i], ++numConflicts[i][(*tIt)[i] & 31]); + + result += numReplays[0] + numReplays[1] + numReplays[2]; + } + } + + return result; +} +} + +template <> +void ClothImpl<CuCloth>::setVirtualParticles(Range<const uint32_t[4]> indices, Range<const PxVec3> weights) +{ + // shuffle indices to form independent SIMD sets + TripletScheduler scheduler(indices); + scheduler.warp(mCloth.mNumParticles, 32); + + // convert to 16bit indices + Vector<Vec4us>::Type 
hostIndices; + hostIndices.reserve(indices.size()); + TripletScheduler::ConstTripletIter tIt = scheduler.mTriplets.begin(); + TripletScheduler::ConstTripletIter tEnd = scheduler.mTriplets.end(); + for(; tIt != tEnd; ++tIt) + hostIndices.pushBack(Vec4us(*tIt)); + + // printf("num sets = %u, num replays = %u\n", scheduler.mSetSizes.size(), + // calculateNumReplays(scheduler.mTriplets, scheduler.mSetSizes)); + + // add normalization weight + Vector<PxVec4>::Type hostWeights; + hostWeights.reserve(weights.size()); + for(; !weights.empty(); weights.popFront()) + { + PxVec3 w = reinterpret_cast<const PxVec3&>(weights.front()); + PxReal scale = 1 / w.magnitudeSquared(); + hostWeights.pushBack(PxVec4(w.x, w.y, w.z, scale)); + } + + CuContextLock contextLock(mCloth.mFactory); + + // todo: 'swap' these to force reallocation? + mCloth.mVirtualParticleIndices = hostIndices; + mCloth.mVirtualParticleSetSizes = scheduler.mSetSizes; + mCloth.mVirtualParticleWeights = hostWeights; + + mCloth.notifyChanged(); + mCloth.wakeUp(); +} + +} // namespace cloth +} // namespace physx diff --git a/PhysX_3.4/Source/LowLevelCloth/src/windows/CuCloth.h b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuCloth.h new file mode 100644 index 00000000..257d490c --- /dev/null +++ b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuCloth.h @@ -0,0 +1,216 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". 
NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 
+ +#pragma once + +#include "foundation/PxTransform.h" +#include "foundation/PxVec4.h" +#include "Range.h" +#include "PhaseConfig.h" +#include "MovingAverage.h" +#include "IndexPair.h" +#include "BoundingBox.h" +#include "Vec4T.h" +#include "CuPhaseConfig.h" +#include "CuPinnedAllocator.h" +#include "CuContextLock.h" +#include "CuDeviceVector.h" + +namespace physx +{ +namespace cloth +{ + +class CuFabric; +class CuFactory; +struct CuClothData; + +struct CuConstraints +{ + CuConstraints(physx::PxCudaContextManager* ctx) + : mStart(ctx), mTarget(ctx), mHostCopy(CuHostAllocator(ctx, cudaHostAllocMapped)) + { + } + + void pop() + { + if(!mTarget.empty()) + { + mStart.swap(mTarget); + mTarget.resize(0); + } + } + + CuDeviceVector<PxVec4> mStart; + CuDeviceVector<PxVec4> mTarget; + CuPinnedVector<PxVec4>::Type mHostCopy; +}; + +class CuCloth : protected CuContextLock +{ + public: + CuCloth& operator=(const CuCloth&); + typedef CuFactory FactoryType; + typedef CuFabric FabricType; + typedef CuContextLock ContextLockType; + + typedef CuPinnedVector<PxVec4>::Type& MappedVec4fVectorType; + typedef CuPinnedVector<IndexPair>::Type& MappedIndexVectorType; + + CuCloth(CuFactory&, CuFabric&, Range<const PxVec4>); + CuCloth(CuFactory&, const CuCloth&); + ~CuCloth(); // not virtual on purpose + + public: + bool isSleeping() const + { + return mSleepPassCounter >= mSleepAfterCount; + } + void wakeUp() + { + mSleepPassCounter = 0; + } + + void notifyChanged(); + + bool updateClothData(CuClothData&); // expects acquired context + uint32_t getSharedMemorySize() const; // without particle data + + // expects transformed configs, doesn't call notifyChanged() + void setPhaseConfig(Range<const PhaseConfig>); + + Range<PxVec4> push(CuConstraints&); + void clear(CuConstraints&); + + void syncDeviceParticles(); + void syncHostParticles(); + + Range<const PxVec3> clampTriangleCount(Range<const PxVec3>, uint32_t); + + public: + CuFactory& mFactory; + CuFabric& mFabric; + + bool mClothDataDirty; 
+ + // particle data + uint32_t mNumParticles; + CuDeviceVector<PxVec4> mParticles; // cur, prev + CuPinnedVector<PxVec4>::Type mParticlesHostCopy; + bool mDeviceParticlesDirty; + bool mHostParticlesDirty; + + PxVec3 mParticleBoundsCenter; + PxVec3 mParticleBoundsHalfExtent; + + PxVec3 mGravity; + PxVec3 mLogDamping; + PxVec3 mLinearLogDrag; + PxVec3 mAngularLogDrag; + PxVec3 mLinearInertia; + PxVec3 mAngularInertia; + PxVec3 mCentrifugalInertia; + float mSolverFrequency; + float mStiffnessFrequency; + + PxTransform mTargetMotion; + PxTransform mCurrentMotion; + PxVec3 mLinearVelocity; + PxVec3 mAngularVelocity; + + float mPrevIterDt; + MovingAverage mIterDtAvg; + + CuDeviceVector<CuPhaseConfig> mPhaseConfigs; // transformed! + Vector<PhaseConfig>::Type mHostPhaseConfigs; // transformed! + + // tether constraints stuff + float mTetherConstraintLogStiffness; + float mTetherConstraintScale; + + // motion constraints stuff + CuConstraints mMotionConstraints; + float mMotionConstraintScale; + float mMotionConstraintBias; + float mMotionConstraintLogStiffness; + + // separation constraints stuff + CuConstraints mSeparationConstraints; + + // particle acceleration stuff + CuDeviceVector<PxVec4> mParticleAccelerations; + CuPinnedVector<PxVec4>::Type mParticleAccelerationsHostCopy; + + // wind + PxVec3 mWind; + float mDragLogCoefficient; + float mLiftLogCoefficient; + + // collision stuff + CuPinnedVector<IndexPair>::Type mCapsuleIndices; + CuPinnedVector<PxVec4>::Type mStartCollisionSpheres; + CuPinnedVector<PxVec4>::Type mTargetCollisionSpheres; + CuPinnedVector<uint32_t>::Type mConvexMasks; + CuPinnedVector<PxVec4>::Type mStartCollisionPlanes; + CuPinnedVector<PxVec4>::Type mTargetCollisionPlanes; + CuPinnedVector<PxVec3>::Type mStartCollisionTriangles; + CuPinnedVector<PxVec3>::Type mTargetCollisionTriangles; + bool mEnableContinuousCollision; + float mCollisionMassScale; + float mFriction; + + // virtual particles + CuDeviceVector<uint32_t> mVirtualParticleSetSizes; + 
CuDeviceVector<Vec4us> mVirtualParticleIndices; + CuDeviceVector<PxVec4> mVirtualParticleWeights; + + // self collision + float mSelfCollisionDistance; + float mSelfCollisionLogStiffness; + + CuDeviceVector<PxVec4> mRestPositions; + CuDeviceVector<uint32_t> mSelfCollisionIndices; + Vector<uint32_t>::Type mSelfCollisionIndicesHost; + + // 4 (position) + 2 (key) per particle + cellStart (8322) + CuDeviceVector<float> mSelfCollisionData; + + // sleeping (see SwCloth for comments) + uint32_t mSleepTestInterval; + uint32_t mSleepAfterCount; + float mSleepThreshold; + uint32_t mSleepPassCounter; + uint32_t mSleepTestCounter; + + uint32_t mSharedMemorySize; + + void* mUserData; +}; +} +} diff --git a/PhysX_3.4/Source/LowLevelCloth/src/windows/CuClothClone.cpp b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuClothClone.cpp new file mode 100644 index 00000000..8b234968 --- /dev/null +++ b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuClothClone.cpp @@ -0,0 +1,83 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. 
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#include "ClothClone.h" + +#include "CuFactory.h" +#include "CuFabric.h" +#include "CuCloth.h" + +namespace physx +{ +namespace cloth +{ +Range<const PhaseConfig> getPhaseConfigs(const CuCloth& cloth) +{ + return makeRange(cloth.mHostPhaseConfigs); +} +void setPhaseConfigs(CuCloth& cloth, Range<const PhaseConfig> phaseConfigs) +{ + cloth.setPhaseConfig(phaseConfigs); +} +Range<const PxVec4> getParticleAccelerations(const CuCloth& cloth) +{ + return makeRange(cloth.mParticleAccelerationsHostCopy); +} +Range<const uint32_t> getSelfCollisionIndices(const CuCloth& cloth) +{ + return makeRange(cloth.mSelfCollisionIndicesHost); +} + +template <> +Cloth* ClothImpl<CuCloth>::clone(Factory& factory) const +{ + if(&mCloth.mFactory == &factory) + return new ClothImpl<CuCloth>(factory, *this); // copy construct directly + + switch(factory.getPlatform()) + { + case Factory::CPU: + return convertCloth(static_cast<SwFactory&>(factory), *this); + case Factory::CUDA: + return convertCloth(static_cast<CuFactory&>(factory), *this); + default: + return NULL; + } +} + +Cloth* CuFactory::clone(const Cloth& cloth) +{ + if(cloth.getFactory().getPlatform() == 
Factory::CPU) + return convertCloth(*this, static_cast<const SwClothImpl&>(cloth)); + + return cloth.clone(*this); +} + +} // namespace cloth +} // namespace physx diff --git a/PhysX_3.4/Source/LowLevelCloth/src/windows/CuClothData.cpp b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuClothData.cpp new file mode 100644 index 00000000..5a1485c6 --- /dev/null +++ b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuClothData.cpp @@ -0,0 +1,238 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. 
// Gathers raw device pointers and element counts from a CuCloth instance
// into this POD mirror (see CuClothData.h), which is what the CUDA solver
// kernel reads. Expects the cloth's device buffers to be valid.
cloth::CuClothData::CuClothData(CuCloth& cloth)
{
	// particle data: device array plus the pinned host mirror
	mNumParticles = cloth.mNumParticles;
	mParticles = array(*cloth.mParticles.begin().get());

	mParticlesHostCopy = array(*getDevicePointer(cloth.mParticlesHostCopy));

	mNumPhases = uint32_t(cloth.mPhaseConfigs.size());
	mPhaseConfigs = cloth.mPhaseConfigs.begin().get();

	// tether constraints: fold the fabric's length scale into the user scale once
	mTethers = cloth.mFabric.mTethers.begin().get();
	mNumTethers = uint32_t(cloth.mFabric.mTethers.size());
	mTetherConstraintScale = cloth.mTetherConstraintScale * cloth.mFabric.mTetherLengthScale;

	mTriangles = cloth.mFabric.mTriangles.begin().get();
	mNumTriangles = uint32_t(cloth.mFabric.mTriangles.size()) / 3; // 3 indices per triangle

	mMotionConstraintScale = cloth.mMotionConstraintScale;
	mMotionConstraintBias = cloth.mMotionConstraintBias;

	mNumSpheres = uint32_t(cloth.mStartCollisionSpheres.size());
	mNumCapsules = uint32_t(cloth.mCapsuleIndices.size());
	mCapsuleIndices = getDevicePointer(cloth.mCapsuleIndices);

	mNumPlanes = uint32_t(cloth.mStartCollisionPlanes.size());
	mNumConvexes = uint32_t(cloth.mConvexMasks.size());
	mConvexMasks = getDevicePointer(cloth.mConvexMasks);

	mNumCollisionTriangles = uint32_t(cloth.mStartCollisionTriangles.size()) / 3;

	mVirtualParticleSetSizesBegin = cloth.mVirtualParticleSetSizes.begin().get();
	mVirtualParticleSetSizesEnd = mVirtualParticleSetSizesBegin + cloth.mVirtualParticleSetSizes.size();
	mVirtualParticleIndices = array(*cloth.mVirtualParticleIndices.begin().get());
	mVirtualParticleWeights = array(*cloth.mVirtualParticleWeights.begin().get());

	mEnableContinuousCollision = cloth.mEnableContinuousCollision;
	mCollisionMassScale = cloth.mCollisionMassScale;
	mFrictionScale = cloth.mFriction;

	// self collision: with no explicit index subset, all particles take part
	// (indices pointer stays null and the count falls back to mNumParticles)
	mSelfCollisionDistance = cloth.mSelfCollisionDistance;
	mSelfCollisionIndices = cloth.mSelfCollisionIndices.empty() ? 0 : cloth.mSelfCollisionIndices.begin().get();
	mNumSelfCollisionIndices = mSelfCollisionIndices ? uint32_t(cloth.mSelfCollisionIndices.size()) : mNumParticles;

	if(!cloth.mSelfCollisionData.empty())
	{
		// mSelfCollisionData is one packed buffer: 4 floats per particle,
		// then 2 uint32 keys per self-collision index, then the cell-start
		// table (cf. the layout comment on CuCloth::mSelfCollisionData)
		uint32_t keySize = 2 * mNumSelfCollisionIndices;
		uint32_t particleSize = 4 * mNumParticles;

		mSelfCollisionParticles = cloth.mSelfCollisionData.begin().get();
		mSelfCollisionKeys = (uint32_t*)(mSelfCollisionParticles + particleSize);
		mSelfCollisionCellStart = (uint16_t*)(mSelfCollisionKeys + keySize);
	}
	else
	{
		mSelfCollisionParticles = 0;
		mSelfCollisionKeys = 0;
		mSelfCollisionCellStart = 0;
	}

	mSleepTestInterval = cloth.mSleepTestInterval;
	mSleepAfterCount = cloth.mSleepAfterCount;
	mSleepThreshold = cloth.mSleepThreshold;
}
array(stiffness)[1]; + + logStiffness = simd4f(cloth.mDragLogCoefficient, cloth.mLiftLogCoefficient, 0.0f, 0.0f); + stiffness = gSimd4fOne - exp2(logStiffness * stiffnessExponent); + mDragCoefficient = array(stiffness)[0]; + mLiftCoefficient = array(stiffness)[1]; + for(int i = 0; i < 9; ++i) + mRotation[i] = array(state.mRotationMatrix[i / 3])[i % 3]; + + mTargetMotionConstraints = 0; + if(!cloth.mMotionConstraints.mStart.empty()) + { + mTargetMotionConstraints = array(*cloth.mMotionConstraints.mStart.begin().get()); + } + + mStartMotionConstraints = mTargetMotionConstraints; + if(!cloth.mMotionConstraints.mTarget.empty()) + { + mTargetMotionConstraints = array(*cloth.mMotionConstraints.mTarget.begin().get()); + } + + mHostMotionConstraints = array(*getDevicePointer(cloth.mMotionConstraints.mHostCopy)); + + mTargetSeparationConstraints = 0; + if(!cloth.mSeparationConstraints.mStart.empty()) + { + mTargetSeparationConstraints = array(*cloth.mSeparationConstraints.mStart.begin().get()); + } + + mStartSeparationConstraints = mTargetSeparationConstraints; + if(!cloth.mSeparationConstraints.mTarget.empty()) + { + mTargetSeparationConstraints = array(*cloth.mSeparationConstraints.mTarget.begin().get()); + } + + mHostSeparationConstraints = array(*getDevicePointer(cloth.mSeparationConstraints.mHostCopy)); + + mParticleAccelerations = 0; + if(!cloth.mParticleAccelerations.empty()) + { + mParticleAccelerations = array(*cloth.mParticleAccelerations.begin().get()); + } + + mHostParticleAccelerations = array(*getDevicePointer(cloth.mParticleAccelerationsHostCopy)); + + mRestPositions = 0; + if(!cloth.mRestPositions.empty()) + { + mRestPositions = array(*cloth.mRestPositions.begin().get()); + } + + mStartCollisionSpheres = array(*getDevicePointer(cloth.mStartCollisionSpheres)); + mTargetCollisionSpheres = array(*getDevicePointer(cloth.mTargetCollisionSpheres)); + + if(!mTargetCollisionSpheres) + mTargetCollisionSpheres = mStartCollisionSpheres; + + mStartCollisionPlanes = 
// Copies the 3x3 part of a row-padded matrix (rows of 4 floats, Simd4f
// layout) into dst[0..8], transposed and densely packed.
void copySquareTransposed(float* dst, const float* src)
{
	for(int col = 0; col < 3; ++col)
		for(int row = 0; row < 3; ++row)
			dst[col * 3 + row] = src[row * 4 + col];
}
0x3F800000u : 0; // 1.0f to avoid ftz +} diff --git a/PhysX_3.4/Source/LowLevelCloth/src/windows/CuClothData.h b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuClothData.h new file mode 100644 index 00000000..0be66742 --- /dev/null +++ b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuClothData.h @@ -0,0 +1,191 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. 
// reference to cloth instance bulk data (POD)
// should not need frequent updates (stored on device)
struct CuClothData
{
	CuClothData()
	{
	} // intentionally empty: leaves members uninitialized
	CuClothData(CuCloth&); // gathers device pointers/sizes (CuClothData.cpp)

	// particle data
	uint32_t mNumParticles;
	float* mParticles;         // device buffer, 4 floats per particle
	float* mParticlesHostCopy; // pinned host mirror (device-mapped pointer)

	// fabric constraints
	uint32_t mNumPhases;
	const CuPhaseConfig* mPhaseConfigs;

	const CuTether* mTethers;
	uint32_t mNumTethers;
	float mTetherConstraintScale; // user scale pre-multiplied by fabric tether length scale

	const uint16_t* mTriangles;
	uint32_t mNumTriangles; // triangle count (index count / 3)

	// motion constraint data
	float mMotionConstraintScale;
	float mMotionConstraintBias;

	// collision data
	uint32_t mNumSpheres; // don't change this order, it's
	uint32_t mNumCapsules; // needed by mergeAcceleration()
	const IndexPair* mCapsuleIndices;
	uint32_t mNumPlanes;
	uint32_t mNumConvexes;
	const uint32_t* mConvexMasks;
	uint32_t mNumCollisionTriangles;

	// virtual particle data
	const uint32_t* mVirtualParticleSetSizesBegin;
	const uint32_t* mVirtualParticleSetSizesEnd;
	const uint16_t* mVirtualParticleIndices;
	const float* mVirtualParticleWeights;

	bool mEnableContinuousCollision;
	float mCollisionMassScale;
	float mFrictionScale;

	// self collision; when no explicit index subset is set, mSelfCollisionIndices
	// is null and mNumSelfCollisionIndices falls back to mNumParticles
	float mSelfCollisionDistance;
	uint32_t mNumSelfCollisionIndices;
	const uint32_t* mSelfCollisionIndices;
	float* mSelfCollisionParticles; // packed buffer: particles, keys, cellStart
	uint32_t* mSelfCollisionKeys;   // points into mSelfCollisionParticles buffer
	uint16_t* mSelfCollisionCellStart;

	// sleep data
	uint32_t mSleepTestInterval;
	uint32_t mSleepAfterCount;
	float mSleepThreshold;
};
+ +#ifndef __CUDACC__ + explicit CuFrameData(CuCloth&, uint32_t, const IterationState<Simd4f>&, const CuIterationData*); +#endif + + bool mDeviceParticlesDirty; + + // number of particle copies that fit in shared memory (0, 1, or 2) + uint32_t mNumSharedPositions; + + // iteration data + float mIterDt; + uint32_t mNumIterations; + const CuIterationData* mIterationData; + + float mTetherConstraintStiffness; + + // wind data + float mDragCoefficient; + float mLiftCoefficient; + float mRotation[9]; + + // motion constraint data + const float* mStartMotionConstraints; + float* mTargetMotionConstraints; + const float* mHostMotionConstraints; + float mMotionConstraintStiffness; + + // separation constraint data + const float* mStartSeparationConstraints; + float* mTargetSeparationConstraints; + const float* mHostSeparationConstraints; + + // particle acceleration data + float* mParticleAccelerations; + const float* mHostParticleAccelerations; + + // rest positions + const float* mRestPositions; + + // collision data + const float* mStartCollisionSpheres; + const float* mTargetCollisionSpheres; + const float* mStartCollisionPlanes; + const float* mTargetCollisionPlanes; + const float* mStartCollisionTriangles; + const float* mTargetCollisionTriangles; + + float mSelfCollisionStiffness; + + float mParticleBounds[6]; // maxX, -minX, maxY, ... + + uint32_t mSleepPassCounter; + uint32_t mSleepTestCounter; + + float mStiffnessExponent; +}; + +// per-iteration data (stored in pinned memory) +struct CuIterationData +{ + CuIterationData() + { + } // not initializing! 
+ +#ifndef __CUDACC__ + explicit CuIterationData(const IterationState<Simd4f>&); +#endif + + float mIntegrationTrafo[24]; + float mWind[3]; + uint32_t mIsTurning; +}; +} +} diff --git a/PhysX_3.4/Source/LowLevelCloth/src/windows/CuCollision.h b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuCollision.h new file mode 100644 index 00000000..cd28a999 --- /dev/null +++ b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuCollision.h @@ -0,0 +1,1505 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. 
+// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#pragma once + +#ifndef CU_SOLVER_KERNEL_CU +#error include CuCollision.h only from CuSolverKernel.cu +#endif + +#include "IndexPair.h" + +namespace +{ +struct CuCollision +{ + struct ShapeMask + { + uint32_t mSpheres; + uint32_t mCones; + + __device__ friend ShapeMask& operator&=(ShapeMask& left, const ShapeMask& right) + { + left.mSpheres = left.mSpheres & right.mSpheres; + left.mCones = left.mCones & right.mCones; + return left; + } + }; + + struct CollisionData + { + Pointer<Shared, float> mSphereX; + Pointer<Shared, float> mSphereY; + Pointer<Shared, float> mSphereZ; + Pointer<Shared, float> mSphereW; + + Pointer<Shared, float> mConeCenterX; + Pointer<Shared, float> mConeCenterY; + Pointer<Shared, float> mConeCenterZ; + Pointer<Shared, float> mConeRadius; + Pointer<Shared, float> mConeAxisX; + Pointer<Shared, float> mConeAxisY; + Pointer<Shared, float> mConeAxisZ; + Pointer<Shared, float> mConeSlope; + Pointer<Shared, float> mConeSqrCosine; + Pointer<Shared, float> mConeHalfLength; + }; + + public: + __device__ CuCollision(Pointer<Shared, uint32_t>); + + template <typename CurrentT, typename PreviousT> + __device__ void operator()(CurrentT& current, PreviousT& previous, float alpha); + + private: + __device__ void buildSphereAcceleration(const CollisionData&); + __device__ void buildConeAcceleration(); + __device__ void mergeAcceleration(); + + template <typename CurrentT> + __device__ bool buildAcceleration(const CurrentT&, float); + + __device__ static ShapeMask readShapeMask(const float&, Pointer<Shared, const uint32_t>); + template <typename CurPos> + __device__ ShapeMask 
getShapeMask(const CurPos&) const; + template <typename PrevPos, typename CurPos> + __device__ ShapeMask getShapeMask(const PrevPos&, const CurPos&) const; + + template <typename CurPos> + __device__ int32_t collideCapsules(const CurPos&, float3&, float3&) const; + template <typename PrevPos, typename CurPos> + __device__ int32_t collideCapsules(const PrevPos&, CurPos&, float3&, float3&) const; + + template <typename CurrentT, typename PreviousT> + __device__ void collideCapsules(CurrentT& current, PreviousT& previous) const; + template <typename CurrentT, typename PreviousT> + __device__ void collideVirtualCapsules(CurrentT& current, PreviousT& previous) const; + template <typename CurrentT, typename PreviousT> + __device__ void collideContinuousCapsules(CurrentT& current, PreviousT& previous) const; + + template <typename CurrentT, typename PreviousT> + __device__ void collideConvexes(CurrentT& current, PreviousT& previous, float alpha); + template <typename CurPos> + __device__ int32_t collideConvexes(const CurPos&, float3&) const; + + template <typename CurrentT> + __device__ void collideTriangles(CurrentT& current, float alpha); + template <typename CurrentT> + __device__ void collideTriangles(CurrentT& current, int32_t i); + + public: + Pointer<Shared, uint32_t> mCapsuleIndices; + Pointer<Shared, uint32_t> mCapsuleMasks; + Pointer<Shared, uint32_t> mConvexMasks; + + CollisionData mPrevData; + CollisionData mCurData; + + // acceleration structure + Pointer<Shared, uint32_t> mShapeGrid; + float mGridScale[3]; + float mGridBias[3]; + static const uint32_t sGridSize = 8; +}; + +template <typename T> +__device__ void swap(T& a, T& b) +{ + T c = a; + a = b; + b = c; +} +} + +__shared__ uninitialized<CuCollision> gCollideParticles; + +namespace +{ +// initializes one pointer past data! 
// Carves the 14 shared-memory arrays of CollisionData out of one contiguous
// region: the first 4 (sphere x/y/z/w) get a stride of mNumSpheres, the
// remaining cone arrays a stride of mNumCapsules. Note: the thread with
// threadIdx.x == 14 writes one pointer past the struct, deliberately
// initializing whatever member follows in memory (see call-site comment).
__device__ void allocate(CuCollision::CollisionData& data)
{
	if(threadIdx.x < 15)
	{
		Pointer<Shared, float>* ptr = &data.mSphereX;
		ptr[threadIdx.x] = *ptr + threadIdx.x * gClothData.mNumCapsules +
		                   min(threadIdx.x, 4) * (gClothData.mNumSpheres - gClothData.mNumCapsules);
	}
}

// Interpolates the collision spheres between start and target pose by
// alpha and stores them transposed into shared memory (SoA: all x, then
// y, z, w). One thread handles one float component of one sphere.
__device__ void generateSpheres(CuCollision::CollisionData& data, float alpha)
{
	// interpolate spheres and transpose
	if(threadIdx.x < gClothData.mNumSpheres * 4)
	{
		float start = __ldg(gFrameData.mStartCollisionSpheres + threadIdx.x);
		float target = __ldg(gFrameData.mTargetCollisionSpheres + threadIdx.x);
		float value = start + (target - start) * alpha;
		// w component (used as the radius in the bounds code) is clamped >= 0
		if(threadIdx.x % 4 == 3)
			value = max(value, 0.0f);
		int32_t j = threadIdx.x % 4 * gClothData.mNumSpheres + threadIdx.x / 4;
		data.mSphereX[j] = value;
	}

	__syncthreads();
}
data.mConeCenterX[threadIdx.x] = (secondX + firstX) * 0.5f; + data.mConeCenterY[threadIdx.x] = (secondY + firstY) * 0.5f; + data.mConeCenterZ[threadIdx.x] = (secondZ + firstZ) * 0.5f; + data.mConeRadius[threadIdx.x] = (axisW + firstW) * invConeLength * axisLength; + + data.mConeAxisX[threadIdx.x] = axisX * invAxisLength; + data.mConeAxisY[threadIdx.x] = axisY * invAxisLength; + data.mConeAxisZ[threadIdx.x] = axisZ * invAxisLength; + data.mConeSlope[threadIdx.x] = axisW * invConeLength; + + float sine = axisW * invAxisLength; + data.mConeSqrCosine[threadIdx.x] = 1 - sine * sine; + data.mConeHalfLength[threadIdx.x] = axisLength; + } + + __syncthreads(); +} +} + +__device__ CuCollision::CuCollision(Pointer<Shared, uint32_t> scratchPtr) +{ + int32_t numCapsules2 = 2 * gClothData.mNumCapsules; + int32_t numCapsules4 = 4 * gClothData.mNumCapsules; + int32_t numConvexes = gClothData.mNumConvexes; + + if(threadIdx.x < 3) + { + (&mCapsuleIndices)[threadIdx.x] = scratchPtr + threadIdx.x * numCapsules2; + (&mShapeGrid)[-14 * int32_t(threadIdx.x)] = scratchPtr + numCapsules4 + numConvexes; + } + + Pointer<Shared, uint32_t> indexPtr = scratchPtr + threadIdx.x; + if(threadIdx.x < numCapsules2) + { + uint32_t index = (&gClothData.mCapsuleIndices->first)[threadIdx.x]; + *indexPtr = index; + + volatile uint32_t* maskPtr = generic(indexPtr + numCapsules2); + *maskPtr = 1u << index; + *maskPtr |= maskPtr[-int32_t(threadIdx.x & 1)]; + } + indexPtr += numCapsules4; + + if(threadIdx.x < numConvexes) + *indexPtr = gClothData.mConvexMasks[threadIdx.x]; + + if(gClothData.mEnableContinuousCollision || gClothData.mFrictionScale > 0.0f) + { + allocate(mPrevData); + + __syncthreads(); // mPrevData raw hazard + + generateSpheres(mPrevData, 0.0f); + generateCones(mPrevData, mCapsuleIndices + 2 * threadIdx.x); + } + + allocate(mCurData); // also initializes mShapeGrid (!) 
// Runs one collision pass over the particles: convex and triangle
// collision always execute; capsule collision only when buildAcceleration()
// reports work to do. 'alpha' interpolates shapes between start and target
// pose for this iteration.
template <typename CurrentT, typename PreviousT>
__device__ void CuCollision::operator()(CurrentT& current, PreviousT& previous, float alpha)
{
	// if(current.w > 0) current.w = previous.w (see SwSolverKernel::computeBounds())
	for(int32_t i = threadIdx.x; i < gClothData.mNumParticles; i += blockDim.x)
	{
		if(current(i, 3) > 0.0f)
			current(i, 3) = previous(i, 3);
	}

	collideConvexes(current, previous, alpha);
	collideTriangles(current, alpha);

	if(buildAcceleration(current, alpha))
	{
		// continuous and discrete capsule collision are mutually exclusive
		if(gClothData.mEnableContinuousCollision)
			collideContinuousCapsules(current, previous);
		else
			collideCapsules(current, previous);

		collideVirtualCapsules(current, previous);
	}

	// sync otherwise first threads overwrite sphere data before
	// remaining ones have had a chance to use it leading to incorrect
	// velocity calculation for friction / ccd

	__syncthreads();

	if(gClothData.mEnableContinuousCollision || gClothData.mFrictionScale > 0.0f)
	{
		// store current collision data for next iteration
		Pointer<Shared, float> dstIt = mPrevData.mSphereX + threadIdx.x;
		Pointer<Shared, const float> srcIt = mCurData.mSphereX + threadIdx.x;
		for(; dstIt < mCurData.mSphereX; dstIt += blockDim.x, srcIt += blockDim.x)
			*dstIt = *srcIt;
	}

	// __syncthreads() called in updateSleepState()
}
mGridScale[axisIdx] + mGridBias[axisIdx])); + + axisIdx += (uint32_t(signi) >> 31) * 3; + Pointer<Shared, uint32_t> dst = mShapeGrid + sGridSize * axisIdx; + // #pragma unroll + for(int32_t i = 0; i < sGridSize; ++i, ++index) + dst[i] |= __ballot(int32_t(index) <= 0); +} + +// generate cone masks from sphere masks +__device__ void CuCollision::buildConeAcceleration() +{ + if(threadIdx.x >= 192) + return; + + int32_t coneIdx = threadIdx.x & 31; + + uint32_t sphereMask = + mCurData.mConeRadius[coneIdx] && coneIdx < gClothData.mNumCapsules ? mCapsuleMasks[2 * coneIdx + 1] : 0; + + int32_t offset = threadIdx.x / 32 * sGridSize; + Pointer<Shared, uint32_t> src = mShapeGrid + offset; + Pointer<Shared, uint32_t> dst = src + 6 * sGridSize; + + // #pragma unroll + for(int32_t i = 0; i < sGridSize; ++i) + dst[i] |= __ballot(src[i] & sphereMask); +} + +// convert right/left mask arrays into single overlap array +__device__ void CuCollision::mergeAcceleration() +{ + if(threadIdx.x < sGridSize * 12) + { + Pointer<Shared, uint32_t> dst = mShapeGrid + threadIdx.x; + if(!(gClothData.mEnableContinuousCollision || threadIdx.x * 43 & 1024)) + *dst &= dst[sGridSize * 3]; // above is same as 'threadIdx.x/24 & 1' + + // mask garbage bits from build*Acceleration + int32_t shapeIdx = threadIdx.x >= sGridSize * 6; // spheres=0, cones=1 + *dst &= (1 << (&gClothData.mNumSpheres)[shapeIdx]) - 1; + } +} + +namespace +{ +#if __CUDA_ARCH__ >= 300 +__device__ float mergeBounds(Pointer<Shared, float> buffer) +{ + float value = *buffer; + value = max(value, __shfl_down(value, 1)); + value = max(value, __shfl_down(value, 2)); + value = max(value, __shfl_down(value, 4)); + value = max(value, __shfl_down(value, 8)); + return max(value, __shfl_down(value, 16)); +} +#else +__device__ float mergeBounds(Pointer<Shared, float> buffer) +{ + // ensure that writes to buffer are visible to all threads + __threadfence_block(); + + volatile float* ptr = generic(buffer); + *ptr = max(*ptr, ptr[16]); + *ptr = 
max(*ptr, ptr[8]); + *ptr = max(*ptr, ptr[4]); + *ptr = max(*ptr, ptr[2]); + return max(*ptr, ptr[1]); +} +#endif +// computes maxX, -minX, maxY, ... with a stride of 32, threadIdx.x must be < 192 +__device__ float computeSphereBounds(const CuCollision::CollisionData& data, Pointer<Shared, float> buffer) +{ + assert(threadIdx.x < 192); + + int32_t sphereIdx = min(threadIdx.x & 31, gClothData.mNumSpheres - 1); // sphere index + int32_t axisIdx = threadIdx.x >> 6; // coordinate index (x, y, or z) + int32_t signi = threadIdx.x << 26; // sign bit (min or max) + float signf = copysignf(1.0f, reinterpret_cast<const float&>(signi)); + + *buffer = data.mSphereW[sphereIdx] + signf * data.mSphereX[sphereIdx + gClothData.mNumSpheres * axisIdx]; + + return mergeBounds(buffer); +} + +#if __CUDA_ARCH__ >= 300 +template <typename CurrentT> +__device__ float computeParticleBounds(const CurrentT& current, Pointer<Shared, float> buffer) +{ + int32_t numThreadsPerAxis = blockDim.x * 342 >> 10 & ~31; // same as / 3 + int32_t axis = (threadIdx.x >= numThreadsPerAxis) + (threadIdx.x >= 2 * numThreadsPerAxis); + int32_t threadIdxInAxis = threadIdx.x - axis * numThreadsPerAxis; + int laneIdx = threadIdx.x & 31; + + if(threadIdxInAxis < numThreadsPerAxis) + { + typename CurrentT::ConstPointerType posIt = current[axis]; + int32_t i = min(threadIdxInAxis, gClothData.mNumParticles - 1); + float minX = posIt[i], maxX = minX; + while(i += numThreadsPerAxis, i < gClothData.mNumParticles) + { + float posX = posIt[i]; + minX = min(minX, posX); + maxX = max(maxX, posX); + } + + minX = min(minX, __shfl_down(minX, 1)); + maxX = max(maxX, __shfl_down(maxX, 1)); + minX = min(minX, __shfl_down(minX, 2)); + maxX = max(maxX, __shfl_down(maxX, 2)); + minX = min(minX, __shfl_down(minX, 4)); + maxX = max(maxX, __shfl_down(maxX, 4)); + minX = min(minX, __shfl_down(minX, 8)); + maxX = max(maxX, __shfl_down(maxX, 8)); + minX = min(minX, __shfl_down(minX, 16)); + maxX = max(maxX, __shfl_down(maxX, 16)); + + 
if(!laneIdx) + { + Pointer<Shared, float> dst = buffer - threadIdx.x + (threadIdxInAxis >> 5) + (axis << 6); + dst[0] = maxX; + dst[32] = -minX; + } + } + + __syncthreads(); + + if(threadIdx.x >= 192) + return 0.0f; + + float value = *buffer; + if(laneIdx >= (numThreadsPerAxis >> 5)) + value = -FLT_MAX; + + // blockDim.x <= 3*512, increase to 3*1024 by adding a shfl by 16 + assert(numThreadsPerAxis <= 16 * 32); + + value = max(value, __shfl_down(value, 1)); + value = max(value, __shfl_down(value, 2)); + value = max(value, __shfl_down(value, 4)); + return max(value, __shfl_down(value, 8)); +} +#else +template <typename CurrentT> +__device__ float computeParticleBounds(const CurrentT& current, Pointer<Shared, float> buffer) +{ + if(threadIdx.x >= 192) + return 0.0f; + + int32_t axisIdx = threadIdx.x >> 6; // x, y, or z + int32_t signi = threadIdx.x << 26; // sign bit (min or max) + float signf = copysignf(1.0f, reinterpret_cast<const float&>(signi)); + + typename CurrentT::ConstPointerType pIt = current[axisIdx]; + typename CurrentT::ConstPointerType pEnd = pIt + gClothData.mNumParticles; + pIt += min(threadIdx.x & 31, gClothData.mNumParticles - 1); + + *buffer = *pIt * signf; + while(pIt += 32, pIt < pEnd) + *buffer = max(*buffer, *pIt * signf); + + return mergeBounds(buffer); +} +#endif +} + +// build mask of spheres/cones touching a regular grid along each axis +template <typename CurrentT> +__device__ bool CuCollision::buildAcceleration(const CurrentT& current, float alpha) +{ + // use still unused cone data as buffer for bounds computation + Pointer<Shared, float> buffer = mCurData.mConeCenterX + threadIdx.x; + float curParticleBounds = computeParticleBounds(current, buffer); + int32_t warpIdx = threadIdx.x >> 5; + + if(!gClothData.mNumSpheres) + { + if(threadIdx.x < 192 && !(threadIdx.x & 31)) + gFrameData.mParticleBounds[warpIdx] = curParticleBounds; + return false; + } + + generateSpheres(mCurData, alpha); + + if(threadIdx.x < 192) + { + float sphereBounds = 
computeSphereBounds(mCurData, buffer); + float particleBounds = curParticleBounds; + if(gClothData.mEnableContinuousCollision) + { + sphereBounds = max(sphereBounds, computeSphereBounds(mPrevData, buffer)); + float prevParticleBounds = gFrameData.mParticleBounds[warpIdx]; + particleBounds = max(particleBounds, prevParticleBounds); + } + + float bounds = min(sphereBounds, particleBounds); + float expandedBounds = bounds + abs(bounds) * 1e-4f; + + // store bounds data in shared memory + if(!(threadIdx.x & 31)) + { + mGridScale[warpIdx] = expandedBounds; + gFrameData.mParticleBounds[warpIdx] = curParticleBounds; + } + } + + __syncthreads(); // mGridScale raw hazard + + if(threadIdx.x < 3) + { + float negativeLower = mGridScale[threadIdx.x * 2 + 1]; + float edgeLength = mGridScale[threadIdx.x * 2] + negativeLower; + float divisor = max(edgeLength, FLT_EPSILON); + mGridScale[threadIdx.x] = __fdividef(sGridSize - 1e-3, divisor); + mGridBias[threadIdx.x] = negativeLower * mGridScale[threadIdx.x]; + if(edgeLength < 0.0f) + mGridScale[0] = 0.0f; // mark empty intersection + } + + // initialize sphere *and* cone grid to 0 + if(threadIdx.x < 2 * 6 * sGridSize) + mShapeGrid[threadIdx.x] = 0; + + __syncthreads(); // mGridScale raw hazard + + // generate cones even if test below fails because + // continuous collision might need it in next iteration + generateCones(mCurData, mCapsuleIndices + 2 * threadIdx.x); + + if(mGridScale[0] == 0.0f) + return false; // early out for empty intersection + + if(gClothData.mEnableContinuousCollision) + buildSphereAcceleration(mPrevData); + buildSphereAcceleration(mCurData); + __syncthreads(); // mCurData raw hazard + + buildConeAcceleration(); + __syncthreads(); // mShapeGrid raw hazard + + mergeAcceleration(); + __syncthreads(); // mShapeGrid raw hazard + + return true; +} + +__device__ CuCollision::ShapeMask CuCollision::readShapeMask(const float& position, + Pointer<Shared, const uint32_t> sphereGrid) +{ + ShapeMask result; + int32_t index 
= int32_t(floorf(position)); + uint32_t outMask = (index < sGridSize) - 1; + + Pointer<Shared, const uint32_t> gridPtr = sphereGrid + (index & sGridSize - 1); + result.mSpheres = gridPtr[0] & ~outMask; + result.mCones = gridPtr[sGridSize * 6] & ~outMask; + + return result; +} + +// lookup acceleration structure and return mask of potential intersectors +template <typename CurPos> +__device__ CuCollision::ShapeMask CuCollision::getShapeMask(const CurPos& positions) const +{ + ShapeMask result; + + result = readShapeMask(positions.x * mGridScale[0] + mGridBias[0], mShapeGrid); + result &= readShapeMask(positions.y * mGridScale[1] + mGridBias[1], mShapeGrid + 8); + result &= readShapeMask(positions.z * mGridScale[2] + mGridBias[2], mShapeGrid + 16); + + return result; +} + +template <typename PrevPos, typename CurPos> +__device__ CuCollision::ShapeMask CuCollision::getShapeMask(const PrevPos& prevPos, const CurPos& curPos) const +{ + ShapeMask result; + + float prevX = prevPos.x * mGridScale[0] + mGridBias[0]; + float prevY = prevPos.y * mGridScale[1] + mGridBias[1]; + float prevZ = prevPos.z * mGridScale[2] + mGridBias[2]; + + float curX = curPos.x * mGridScale[0] + mGridBias[0]; + float curY = curPos.y * mGridScale[1] + mGridBias[1]; + float curZ = curPos.z * mGridScale[2] + mGridBias[2]; + + float maxX = min(max(prevX, curX), 7.0f); + float maxY = min(max(prevY, curY), 7.0f); + float maxZ = min(max(prevZ, curZ), 7.0f); + + result = readShapeMask(maxX, mShapeGrid); + result &= readShapeMask(maxY, mShapeGrid + 8); + result &= readShapeMask(maxZ, mShapeGrid + 16); + + float minX = max(min(prevX, curX), 0.0f); + float minY = max(min(prevY, curY), 0.0f); + float minZ = max(min(prevZ, curZ), 0.0f); + + result &= readShapeMask(minX, mShapeGrid + 24); + result &= readShapeMask(minY, mShapeGrid + 32); + result &= readShapeMask(minZ, mShapeGrid + 40); + + return result; +} + +template <typename CurPos> +__device__ int32_t CuCollision::collideCapsules(const CurPos& positions, 
float3& delta, float3& velocity) const +{ + ShapeMask shapeMask = getShapeMask(positions); + + delta.x = delta.y = delta.z = 0.0f; + velocity.x = velocity.y = velocity.z = 0.0f; + + int32_t numCollisions = 0; + + bool frictionEnabled = gClothData.mFrictionScale > 0.0f; + + // cone collision + for(; shapeMask.mCones; shapeMask.mCones &= shapeMask.mCones - 1) + { + int32_t j = __ffs(shapeMask.mCones) - 1; + + float deltaX = positions.x - mCurData.mConeCenterX[j]; + float deltaY = positions.y - mCurData.mConeCenterY[j]; + float deltaZ = positions.z - mCurData.mConeCenterZ[j]; + + float axisX = mCurData.mConeAxisX[j]; + float axisY = mCurData.mConeAxisY[j]; + float axisZ = mCurData.mConeAxisZ[j]; + float slope = mCurData.mConeSlope[j]; + + float dot = deltaX * axisX + deltaY * axisY + deltaZ * axisZ; + float radius = max(dot * slope + mCurData.mConeRadius[j], 0.0f); + float sqrDistance = deltaX * deltaX + deltaY * deltaY + deltaZ * deltaZ - dot * dot; + + Pointer<Shared, const uint32_t> mIt = mCapsuleMasks + 2 * j; + uint32_t bothMask = mIt[1]; + + if(sqrDistance > radius * radius) + { + shapeMask.mSpheres &= ~bothMask; + continue; + } + + sqrDistance = max(sqrDistance, FLT_EPSILON); + float invDistance = rsqrtf(sqrDistance); + + float base = dot + slope * sqrDistance * invDistance; + + float halfLength = mCurData.mConeHalfLength[j]; + uint32_t leftMask = base < -halfLength; + uint32_t rightMask = base > halfLength; + + uint32_t firstMask = mIt[0]; + uint32_t secondMask = firstMask ^ bothMask; + + shapeMask.mSpheres &= ~(firstMask & leftMask - 1); + shapeMask.mSpheres &= ~(secondMask & rightMask - 1); + + if(!leftMask && !rightMask) + { + deltaX = deltaX - base * axisX; + deltaY = deltaY - base * axisY; + deltaZ = deltaZ - base * axisZ; + + float sqrCosine = mCurData.mConeSqrCosine[j]; + float scale = radius * invDistance * sqrCosine - sqrCosine; + + delta.x = delta.x + deltaX * scale; + delta.y = delta.y + deltaY * scale; + delta.z = delta.z + deltaZ * scale; + + 
if(frictionEnabled) + { + int32_t s0 = mCapsuleIndices[2 * j]; + int32_t s1 = mCapsuleIndices[2 * j + 1]; + + // load previous sphere pos + float s0vx = mCurData.mSphereX[s0] - mPrevData.mSphereX[s0]; + float s0vy = mCurData.mSphereY[s0] - mPrevData.mSphereY[s0]; + float s0vz = mCurData.mSphereZ[s0] - mPrevData.mSphereZ[s0]; + + float s1vx = mCurData.mSphereX[s1] - mPrevData.mSphereX[s1]; + float s1vy = mCurData.mSphereY[s1] - mPrevData.mSphereY[s1]; + float s1vz = mCurData.mSphereZ[s1] - mPrevData.mSphereZ[s1]; + + // interpolate velocity between the two spheres + float t = dot * 0.5f + 0.5f; + + velocity.x += s0vx + t * (s1vx - s0vx); + velocity.y += s0vy + t * (s1vy - s0vy); + velocity.z += s0vz + t * (s1vz - s0vz); + } + + ++numCollisions; + } + } + + // sphere collision + for(; shapeMask.mSpheres; shapeMask.mSpheres &= shapeMask.mSpheres - 1) + { + int32_t j = __ffs(shapeMask.mSpheres) - 1; + + float deltaX = positions.x - mCurData.mSphereX[j]; + float deltaY = positions.y - mCurData.mSphereY[j]; + float deltaZ = positions.z - mCurData.mSphereZ[j]; + + float sqrDistance = FLT_EPSILON + deltaX * deltaX + deltaY * deltaY + deltaZ * deltaZ; + float relDistance = rsqrtf(sqrDistance) * mCurData.mSphereW[j]; + + if(relDistance > 1.0f) + { + float scale = relDistance - 1.0f; + + delta.x = delta.x + deltaX * scale; + delta.y = delta.y + deltaY * scale; + delta.z = delta.z + deltaZ * scale; + + if(frictionEnabled) + { + velocity.x += mCurData.mSphereX[j] - mPrevData.mSphereX[j]; + velocity.y += mCurData.mSphereY[j] - mPrevData.mSphereY[j]; + velocity.z += mCurData.mSphereZ[j] - mPrevData.mSphereZ[j]; + } + + ++numCollisions; + } + } + + return numCollisions; +} + +static const __device__ float gSkeletonWidth = (1 - 0.2f) * (1 - 0.2f) - 1; + +template <typename PrevPos, typename CurPos> +__device__ int32_t +CuCollision::collideCapsules(const PrevPos& prevPos, CurPos& curPos, float3& delta, float3& velocity) const +{ + ShapeMask shapeMask = getShapeMask(prevPos, curPos); 
+ + delta.x = delta.y = delta.z = 0.0f; + velocity.x = velocity.y = velocity.z = 0.0f; + + int32_t numCollisions = 0; + bool frictionEnabled = gClothData.mFrictionScale > 0.0f; + + // cone collision + for(; shapeMask.mCones; shapeMask.mCones &= shapeMask.mCones - 1) + { + int32_t j = __ffs(shapeMask.mCones) - 1; + + float prevAxisX = mPrevData.mConeAxisX[j]; + float prevAxisY = mPrevData.mConeAxisY[j]; + float prevAxisZ = mPrevData.mConeAxisZ[j]; + float prevSlope = mPrevData.mConeSlope[j]; + + float prevX = prevPos.x - mPrevData.mConeCenterX[j]; + float prevY = prevPos.y - mPrevData.mConeCenterY[j]; + float prevZ = prevPos.z - mPrevData.mConeCenterZ[j]; + float prevT = prevY * prevAxisZ - prevZ * prevAxisY; + float prevU = prevZ * prevAxisX - prevX * prevAxisZ; + float prevV = prevX * prevAxisY - prevY * prevAxisX; + float prevDot = prevX * prevAxisX + prevY * prevAxisY + prevZ * prevAxisZ; + float prevRadius = max(prevDot * prevSlope + mCurData.mConeRadius[j], 0.0f); + + float curAxisX = mCurData.mConeAxisX[j]; + float curAxisY = mCurData.mConeAxisY[j]; + float curAxisZ = mCurData.mConeAxisZ[j]; + float curSlope = mCurData.mConeSlope[j]; + + float curX = curPos.x - mCurData.mConeCenterX[j]; + float curY = curPos.y - mCurData.mConeCenterY[j]; + float curZ = curPos.z - mCurData.mConeCenterZ[j]; + float curT = curY * curAxisZ - curZ * curAxisY; + float curU = curZ * curAxisX - curX * curAxisZ; + float curV = curX * curAxisY - curY * curAxisX; + float curDot = curX * curAxisX + curY * curAxisY + curZ * curAxisZ; + float curRadius = max(curDot * curSlope + mCurData.mConeRadius[j], 0.0f); + + float curSqrDistance = FLT_EPSILON + curT * curT + curU * curU + curV * curV; + + float dotPrevPrev = prevT * prevT + prevU * prevU + prevV * prevV - prevRadius * prevRadius; + float dotPrevCur = prevT * curT + prevU * curU + prevV * curV - prevRadius * curRadius; + float dotCurCur = curSqrDistance - curRadius * curRadius; + + float discriminant = dotPrevCur * dotPrevCur - 
dotCurCur * dotPrevPrev; + float sqrtD = sqrtf(discriminant); + float halfB = dotPrevCur - dotPrevPrev; + float minusA = dotPrevCur - dotCurCur + halfB; + + // time of impact or 0 if prevPos inside cone + float toi = __fdividef(min(0.0f, halfB + sqrtD), minusA); + bool hasCollision = toi < 1.0f && halfB < sqrtD; + + // skip continuous collision if the (un-clamped) particle + // trajectory only touches the outer skin of the cone. + float rMin = prevRadius + halfB * minusA * (curRadius - prevRadius); + hasCollision = hasCollision && (discriminant > minusA * rMin * rMin * gSkeletonWidth); + + // a is negative when one cone is contained in the other, + // which is already handled by discrete collision. + hasCollision = hasCollision && minusA < -FLT_EPSILON; + + if(hasCollision) + { + float deltaX = prevX - curX; + float deltaY = prevY - curY; + float deltaZ = prevZ - curZ; + + // interpolate delta at toi + float posX = prevX - deltaX * toi; + float posY = prevY - deltaY * toi; + float posZ = prevZ - deltaZ * toi; + + float curHalfLength = mCurData.mConeHalfLength[j]; + float curScaledAxisX = curAxisX * curHalfLength; + float curScaledAxisY = curAxisY * curHalfLength; + float curScaledAxisZ = curAxisZ * curHalfLength; + + float prevHalfLength = mPrevData.mConeHalfLength[j]; + float deltaScaledAxisX = curScaledAxisX - prevAxisX * prevHalfLength; + float deltaScaledAxisY = curScaledAxisY - prevAxisY * prevHalfLength; + float deltaScaledAxisZ = curScaledAxisZ - prevAxisZ * prevHalfLength; + + float oneMinusToi = 1.0f - toi; + + // interpolate axis at toi + float axisX = curScaledAxisX - deltaScaledAxisX * oneMinusToi; + float axisY = curScaledAxisY - deltaScaledAxisY * oneMinusToi; + float axisZ = curScaledAxisZ - deltaScaledAxisZ * oneMinusToi; + float slope = prevSlope * oneMinusToi + curSlope * toi; + + float sqrHalfLength = axisX * axisX + axisY * axisY + axisZ * axisZ; + float invHalfLength = rsqrtf(sqrHalfLength); + float dot = (posX * axisX + posY * axisY + posZ * 
axisZ) * invHalfLength; + + float sqrDistance = posX * posX + posY * posY + posZ * posZ - dot * dot; + float invDistance = sqrDistance > 0.0f ? rsqrtf(sqrDistance) : 0.0f; + + float base = dot + slope * sqrDistance * invDistance; + float scale = base * invHalfLength; + + if(abs(scale) < 1.0f) + { + deltaX = deltaX + deltaScaledAxisX * scale; + deltaY = deltaY + deltaScaledAxisY * scale; + deltaZ = deltaZ + deltaScaledAxisZ * scale; + + // reduce ccd impulse if (clamped) particle trajectory stays in cone skin, + // i.e. scale by exp2(-k) or 1/(1+k) with k = (tmin - toi) / (1 - toi) + float minusK = __fdividef(sqrtD, minusA * oneMinusToi); + oneMinusToi = __fdividef(oneMinusToi, 1 - minusK); + + curX = curX + deltaX * oneMinusToi; + curY = curY + deltaY * oneMinusToi; + curZ = curZ + deltaZ * oneMinusToi; + + curDot = curX * curAxisX + curY * curAxisY + curZ * curAxisZ; + curRadius = max(curDot * curSlope + mCurData.mConeRadius[j], 0.0f); + curSqrDistance = curX * curX + curY * curY + curZ * curZ - curDot * curDot; + + curPos.x = mCurData.mConeCenterX[j] + curX; + curPos.y = mCurData.mConeCenterY[j] + curY; + curPos.z = mCurData.mConeCenterZ[j] + curZ; + } + } + + // curPos inside cone (discrete collision) + bool hasContact = curRadius * curRadius > curSqrDistance; + + Pointer<Shared, const uint32_t> mIt = mCapsuleMasks + 2 * j; + uint32_t bothMask = mIt[1]; + + uint32_t cullMask = bothMask & (hasCollision | hasContact) - 1; + shapeMask.mSpheres &= ~cullMask; + + if(!hasContact) + continue; + + float invDistance = curSqrDistance > 0.0f ? rsqrtf(curSqrDistance) : 0.0f; + float base = curDot + curSlope * curSqrDistance * invDistance; + + float halfLength = mCurData.mConeHalfLength[j]; + uint32_t leftMask = base < -halfLength; + uint32_t rightMask = base > halfLength; + + // can only skip continuous sphere collision if post-ccd position + // is on code side *and* particle had cone-ccd collision. 
+ uint32_t firstMask = mIt[0]; + uint32_t secondMask = firstMask ^ bothMask; + cullMask = (firstMask & leftMask - 1) | (secondMask & rightMask - 1); + shapeMask.mSpheres &= ~cullMask | hasCollision - 1; + + if(!leftMask && !rightMask) + { + float deltaX = curX - base * curAxisX; + float deltaY = curY - base * curAxisY; + float deltaZ = curZ - base * curAxisZ; + + float sqrCosine = mCurData.mConeSqrCosine[j]; + float scale = curRadius * invDistance * sqrCosine - sqrCosine; + + delta.x = delta.x + deltaX * scale; + delta.y = delta.y + deltaY * scale; + delta.z = delta.z + deltaZ * scale; + + if(frictionEnabled) + { + int32_t s0 = mCapsuleIndices[2 * j]; + int32_t s1 = mCapsuleIndices[2 * j + 1]; + + // load previous sphere pos + float s0vx = mCurData.mSphereX[s0] - mPrevData.mSphereX[s0]; + float s0vy = mCurData.mSphereY[s0] - mPrevData.mSphereY[s0]; + float s0vz = mCurData.mSphereZ[s0] - mPrevData.mSphereZ[s0]; + + float s1vx = mCurData.mSphereX[s1] - mPrevData.mSphereX[s1]; + float s1vy = mCurData.mSphereY[s1] - mPrevData.mSphereY[s1]; + float s1vz = mCurData.mSphereZ[s1] - mPrevData.mSphereZ[s1]; + + // interpolate velocity between the two spheres + float t = curDot * 0.5f + 0.5f; + + velocity.x += s0vx + t * (s1vx - s0vx); + velocity.y += s0vy + t * (s1vy - s0vy); + velocity.z += s0vz + t * (s1vz - s0vz); + } + + ++numCollisions; + } + } + + // sphere collision + for(; shapeMask.mSpheres; shapeMask.mSpheres &= shapeMask.mSpheres - 1) + { + int32_t j = __ffs(shapeMask.mSpheres) - 1; + + float prevX = prevPos.x - mPrevData.mSphereX[j]; + float prevY = prevPos.y - mPrevData.mSphereY[j]; + float prevZ = prevPos.z - mPrevData.mSphereZ[j]; + float prevRadius = mPrevData.mSphereW[j]; + + float curX = curPos.x - mCurData.mSphereX[j]; + float curY = curPos.y - mCurData.mSphereY[j]; + float curZ = curPos.z - mCurData.mSphereZ[j]; + float curRadius = mCurData.mSphereW[j]; + + float sqrDistance = FLT_EPSILON + curX * curX + curY * curY + curZ * curZ; + + float dotPrevPrev = 
prevX * prevX + prevY * prevY + prevZ * prevZ - prevRadius * prevRadius; + float dotPrevCur = prevX * curX + prevY * curY + prevZ * curZ - prevRadius * curRadius; + float dotCurCur = sqrDistance - curRadius * curRadius; + + float discriminant = dotPrevCur * dotPrevCur - dotCurCur * dotPrevPrev; + float sqrtD = sqrtf(discriminant); + float halfB = dotPrevCur - dotPrevPrev; + float minusA = dotPrevCur - dotCurCur + halfB; + + // time of impact or 0 if prevPos inside sphere + float toi = __fdividef(min(0.0f, halfB + sqrtD), minusA); + bool hasCollision = toi < 1.0f && halfB < sqrtD; + + // skip continuous collision if the (un-clamped) particle + // trajectory only touches the outer skin of the cone. + float rMin = prevRadius + halfB * minusA * (curRadius - prevRadius); + hasCollision = hasCollision && (discriminant > minusA * rMin * rMin * gSkeletonWidth); + + // a is negative when one cone is contained in the other, + // which is already handled by discrete collision. + hasCollision = hasCollision && minusA < -FLT_EPSILON; + + if(hasCollision) + { + float deltaX = prevX - curX; + float deltaY = prevY - curY; + float deltaZ = prevZ - curZ; + + float oneMinusToi = 1.0f - toi; + + // reduce ccd impulse if (clamped) particle trajectory stays in cone skin, + // i.e. 
scale by exp2(-k) or 1/(1+k) with k = (tmin - toi) / (1 - toi) + float minusK = __fdividef(sqrtD, minusA * oneMinusToi); + oneMinusToi = __fdividef(oneMinusToi, 1 - minusK); + + curX = curX + deltaX * oneMinusToi; + curY = curY + deltaY * oneMinusToi; + curZ = curZ + deltaZ * oneMinusToi; + + curPos.x = mCurData.mSphereX[j] + curX; + curPos.y = mCurData.mSphereY[j] + curY; + curPos.z = mCurData.mSphereZ[j] + curZ; + + sqrDistance = FLT_EPSILON + curX * curX + curY * curY + curZ * curZ; + } + + float relDistance = rsqrtf(sqrDistance) * curRadius; + + if(relDistance > 1.0f) + { + float scale = relDistance - 1.0f; + + delta.x = delta.x + curX * scale; + delta.y = delta.y + curY * scale; + delta.z = delta.z + curZ * scale; + + if(frictionEnabled) + { + velocity.x += mCurData.mSphereX[j] - mPrevData.mSphereX[j]; + velocity.y += mCurData.mSphereY[j] - mPrevData.mSphereY[j]; + velocity.z += mCurData.mSphereZ[j] - mPrevData.mSphereZ[j]; + } + + ++numCollisions; + } + } + + return numCollisions; +} + +namespace +{ +template <typename PrevPos, typename CurPos> +__device__ inline float3 calcFrictionImpulse(const PrevPos& prevPos, const CurPos& curPos, const float3& shapeVelocity, + float scale, const float3& collisionImpulse) +{ + const float frictionScale = gClothData.mFrictionScale; + + // calculate collision normal + float deltaSq = collisionImpulse.x * collisionImpulse.x + collisionImpulse.y * collisionImpulse.y + + collisionImpulse.z * collisionImpulse.z; + + float rcpDelta = rsqrtf(deltaSq + FLT_EPSILON); + + float nx = collisionImpulse.x * rcpDelta; + float ny = collisionImpulse.y * rcpDelta; + float nz = collisionImpulse.z * rcpDelta; + + // calculate relative velocity scaled by number of collision + float rvx = curPos.x - prevPos.x - shapeVelocity.x * scale; + float rvy = curPos.y - prevPos.y - shapeVelocity.y * scale; + float rvz = curPos.z - prevPos.z - shapeVelocity.z * scale; + + // calculate magnitude of relative normal velocity + float rvn = rvx * nx + rvy * ny 
+ rvz * nz; + + // calculate relative tangential velocity + float rvtx = rvx - rvn * nx; + float rvty = rvy - rvn * ny; + float rvtz = rvz - rvn * nz; + + // calculate magnitude of vt + float rcpVt = rsqrtf(rvtx * rvtx + rvty * rvty + rvtz * rvtz + FLT_EPSILON); + + // magnitude of friction impulse (cannot be larger than -|vt|) + float j = max(-frictionScale * deltaSq * rcpDelta * scale * rcpVt, -1.0f); + + return make_float3(rvtx * j, rvty * j, rvtz * j); +} +} + +template <typename CurrentT, typename PreviousT> +__device__ void CuCollision::collideCapsules(CurrentT& current, PreviousT& previous) const +{ + bool frictionEnabled = gClothData.mFrictionScale > 0.0f; + bool massScaleEnabled = gClothData.mCollisionMassScale > 0.0f; + + for(int32_t i = threadIdx.x; i < gClothData.mNumParticles; i += blockDim.x) + { + typename CurrentT::VectorType curPos = current(i); + + float3 delta, velocity; + if(int32_t numCollisions = collideCapsules(curPos, delta, velocity)) + { + float scale = __fdividef(1.0f, numCollisions); + + if(frictionEnabled) + { + typename PreviousT::VectorType prevPos = previous(i); + float3 frictionImpulse = calcFrictionImpulse(prevPos, curPos, velocity, scale, delta); + + prevPos.x -= frictionImpulse.x; + prevPos.y -= frictionImpulse.y; + prevPos.z -= frictionImpulse.z; + + previous(i) = prevPos; + } + + curPos.x += delta.x * scale; + curPos.y += delta.y * scale; + curPos.z += delta.z * scale; + + current(i) = curPos; + + if(massScaleEnabled) + { + float deltaLengthSq = delta.x * delta.x + delta.y * delta.y + delta.z * delta.z; + float massScale = 1.0f + gClothData.mCollisionMassScale * deltaLengthSq; + current(i, 3) = __fdividef(current(i, 3), massScale); + } + } + } +} + +namespace +{ +template <typename PointerT> +__device__ float lerp(PointerT pos, const int4& indices, const float4& weights) +{ + return pos[indices.x] * weights.x + pos[indices.y] * weights.y + pos[indices.z] * weights.z; +} + +template <typename PointerT> +__device__ void 
apply(PointerT pos, const int4& indices, const float4& weights, float delta) +{ + pos[indices.x] += delta * weights.x; + pos[indices.y] += delta * weights.y; + pos[indices.z] += delta * weights.z; +} +} + +template <typename CurrentT, typename PreviousT> +__device__ void CuCollision::collideVirtualCapsules(CurrentT& current, PreviousT& previous) const +{ + const uint32_t* __restrict setSizeIt = gClothData.mVirtualParticleSetSizesBegin; + + if(!setSizeIt) + return; + + if(gClothData.mEnableContinuousCollision) + { + // copied from mergeAcceleration + Pointer<Shared, uint32_t> dst = mShapeGrid + threadIdx.x; + if(!(threadIdx.x * 43 & 1024) && threadIdx.x < sGridSize * 12) + *dst &= dst[sGridSize * 3]; + __syncthreads(); // mShapeGrid raw hazard + } + + const uint32_t* __restrict setSizeEnd = gClothData.mVirtualParticleSetSizesEnd; + const uint16_t* __restrict indicesEnd = gClothData.mVirtualParticleIndices; + const float4* __restrict weightsIt = reinterpret_cast<const float4*>(gClothData.mVirtualParticleWeights); + + bool frictionEnabled = gClothData.mFrictionScale > 0.0f; + bool massScaleEnabled = gClothData.mCollisionMassScale > 0.0f; + + for(; setSizeIt != setSizeEnd; ++setSizeIt) + { + __syncthreads(); + + const uint16_t* __restrict indicesIt = indicesEnd + threadIdx.x * 4; + for(indicesEnd += *setSizeIt * 4; indicesIt < indicesEnd; indicesIt += blockDim.x * 4) + { + int4 indices = make_int4(indicesIt[0], indicesIt[1], indicesIt[2], indicesIt[3]); + + float4 weights = weightsIt[indices.w]; + + float3 curPos; + curPos.x = lerp(current[0], indices, weights); + curPos.y = lerp(current[1], indices, weights); + curPos.z = lerp(current[2], indices, weights); + + float3 delta, velocity; + if(int32_t numCollisions = collideCapsules(curPos, delta, velocity)) + { + float scale = __fdividef(1.0f, numCollisions); + float wscale = weights.w * scale; + + apply(current[0], indices, weights, delta.x * wscale); + apply(current[1], indices, weights, delta.y * wscale); + 
apply(current[2], indices, weights, delta.z * wscale); + + if(frictionEnabled) + { + float3 prevPos; + prevPos.x = lerp(previous[0], indices, weights); + prevPos.y = lerp(previous[1], indices, weights); + prevPos.z = lerp(previous[2], indices, weights); + + float3 frictionImpulse = calcFrictionImpulse(prevPos, curPos, velocity, scale, delta); + + apply(previous[0], indices, weights, frictionImpulse.x * -weights.w); + apply(previous[1], indices, weights, frictionImpulse.y * -weights.w); + apply(previous[2], indices, weights, frictionImpulse.z * -weights.w); + } + + if(massScaleEnabled) + { + float deltaLengthSq = (delta.x * delta.x + delta.y * delta.y + delta.z * delta.z) * scale * scale; + float invMassScale = __fdividef(1.0f, 1.0f + gClothData.mCollisionMassScale * deltaLengthSq); + + // not multiplying by weights[3] here because unlike applying velocity + // deltas where we want the interpolated position to obtain a particular + // value, we instead just require that the total change is equal to invMassScale + invMassScale = invMassScale - 1.0f; + current(indices.x, 3) *= 1.0f + weights.x * invMassScale; + current(indices.y, 3) *= 1.0f + weights.y * invMassScale; + current(indices.z, 3) *= 1.0f + weights.z * invMassScale; + } + } + } + } +} + +template <typename CurrentT, typename PreviousT> +__device__ void CuCollision::collideContinuousCapsules(CurrentT& current, PreviousT& previous) const +{ + bool frictionEnabled = gClothData.mFrictionScale > 0.0f; + bool massScaleEnabled = gClothData.mCollisionMassScale > 0.0f; + + for(int32_t i = threadIdx.x; i < gClothData.mNumParticles; i += blockDim.x) + { + typename PreviousT::VectorType prevPos = previous(i); + typename CurrentT::VectorType curPos = current(i); + + float3 delta, velocity; + if(int32_t numCollisions = collideCapsules(prevPos, curPos, delta, velocity)) + { + float scale = __fdividef(1.0f, numCollisions); + + if(frictionEnabled) + { + float3 frictionImpulse = calcFrictionImpulse(prevPos, curPos, velocity, 
scale, delta); + + prevPos.x -= frictionImpulse.x; + prevPos.y -= frictionImpulse.y; + prevPos.z -= frictionImpulse.z; + + previous(i) = prevPos; + } + + curPos.x += delta.x * scale; + curPos.y += delta.y * scale; + curPos.z += delta.z * scale; + + current(i) = curPos; + + if(massScaleEnabled) + { + float deltaLengthSq = delta.x * delta.x + delta.y * delta.y + delta.z * delta.z; + float massScale = 1.0f + gClothData.mCollisionMassScale * deltaLengthSq; + current(i, 3) = __fdividef(current(i, 3), massScale); + } + } + } +} + +template <typename CurPos> +__device__ int32_t CuCollision::collideConvexes(const CurPos& positions, float3& delta) const +{ + delta.x = delta.y = delta.z = 0.0f; + + Pointer<Shared, const float> planeX = mCurData.mSphereX; + Pointer<Shared, const float> planeY = planeX + gClothData.mNumPlanes; + Pointer<Shared, const float> planeZ = planeY + gClothData.mNumPlanes; + Pointer<Shared, const float> planeW = planeZ + gClothData.mNumPlanes; + + int32_t numCollisions = 0; + Pointer<Shared, const uint32_t> cIt = mConvexMasks; + Pointer<Shared, const uint32_t> cEnd = cIt + gClothData.mNumConvexes; + for(; cIt != cEnd; ++cIt) + { + uint32_t mask = *cIt; + + int32_t maxIndex = __ffs(mask) - 1; + float maxDist = planeW[maxIndex] + positions.z * planeZ[maxIndex] + positions.y * planeY[maxIndex] + + positions.x * planeX[maxIndex]; + + while((maxDist < 0.0f) && (mask &= mask - 1)) + { + int32_t i = __ffs(mask) - 1; + float dist = planeW[i] + positions.z * planeZ[i] + positions.y * planeY[i] + positions.x * planeX[i]; + if(dist > maxDist) + maxDist = dist, maxIndex = i; + } + + if(maxDist < 0.0f) + { + delta.x -= planeX[maxIndex] * maxDist; + delta.y -= planeY[maxIndex] * maxDist; + delta.z -= planeZ[maxIndex] * maxDist; + + ++numCollisions; + } + } + + return numCollisions; +} + +template <typename CurrentT, typename PreviousT> +__device__ void CuCollision::collideConvexes(CurrentT& current, PreviousT& previous, float alpha) +{ + if(!gClothData.mNumConvexes) 
+ return; + + // interpolate planes and transpose + if(threadIdx.x < gClothData.mNumPlanes * 4) + { + float start = gFrameData.mStartCollisionPlanes[threadIdx.x]; + float target = gFrameData.mTargetCollisionPlanes[threadIdx.x]; + int32_t j = threadIdx.x % 4 * gClothData.mNumPlanes + threadIdx.x / 4; + mCurData.mSphereX[j] = start + (target - start) * alpha; + } + + __syncthreads(); + + bool frictionEnabled = gClothData.mFrictionScale > 0.0f; + + for(int32_t i = threadIdx.x; i < gClothData.mNumParticles; i += blockDim.x) + { + typename CurrentT::VectorType curPos = current(i); + + float3 delta; + if(int32_t numCollisions = collideConvexes(curPos, delta)) + { + float scale = __fdividef(1.0f, numCollisions); + + if(frictionEnabled) + { + typename PreviousT::VectorType prevPos = previous(i); + + float3 frictionImpulse = + calcFrictionImpulse(prevPos, curPos, make_float3(0.0f, 0.0f, 0.0f), scale, delta); + + prevPos.x -= frictionImpulse.x; + prevPos.y -= frictionImpulse.y; + prevPos.z -= frictionImpulse.z; + + previous(i) = prevPos; + } + + curPos.x += delta.x * scale; + curPos.y += delta.y * scale; + curPos.z += delta.z * scale; + + current(i) = curPos; + } + } + + __syncthreads(); +} + +namespace +{ +struct TriangleData +{ + float baseX, baseY, baseZ; + float edge0X, edge0Y, edge0Z; + float edge1X, edge1Y, edge1Z; + float normalX, normalY, normalZ; + + float edge0DotEdge1; + float edge0SqrLength; + float edge1SqrLength; + + float det; + float denom; + + float edge0InvSqrLength; + float edge1InvSqrLength; + + // initialize struct after vertices have been stored in first 9 members + __device__ void initialize() + { + edge0X -= baseX, edge0Y -= baseY, edge0Z -= baseZ; + edge1X -= baseX, edge1Y -= baseY, edge1Z -= baseZ; + + normalX = edge0Y * edge1Z - edge0Z * edge1Y; + normalY = edge0Z * edge1X - edge0X * edge1Z; + normalZ = edge0X * edge1Y - edge0Y * edge1X; + + float normalInvLength = rsqrtf(normalX * normalX + normalY * normalY + normalZ * normalZ); + normalX *= 
normalInvLength; + normalY *= normalInvLength; + normalZ *= normalInvLength; + + edge0DotEdge1 = edge0X * edge1X + edge0Y * edge1Y + edge0Z * edge1Z; + edge0SqrLength = edge0X * edge0X + edge0Y * edge0Y + edge0Z * edge0Z; + edge1SqrLength = edge1X * edge1X + edge1Y * edge1Y + edge1Z * edge1Z; + + det = __fdividef(1.0f, edge0SqrLength * edge1SqrLength - edge0DotEdge1 * edge0DotEdge1); + denom = __fdividef(1.0f, edge0SqrLength + edge1SqrLength - edge0DotEdge1 - edge0DotEdge1); + + edge0InvSqrLength = __fdividef(1.0f, edge0SqrLength); + edge1InvSqrLength = __fdividef(1.0f, edge1SqrLength); + } +}; +} + +template <typename CurrentT> +__device__ void CuCollision::collideTriangles(CurrentT& current, int32_t i) +{ + float posX = current(i, 0); + float posY = current(i, 1); + float posZ = current(i, 2); + + const TriangleData* __restrict tIt = reinterpret_cast<const TriangleData*>(generic(mCurData.mSphereX)); + const TriangleData* __restrict tEnd = tIt + gClothData.mNumCollisionTriangles; + + float normalX, normalY, normalZ, normalD = 0.0f; + float minSqrLength = FLT_MAX; + + for(; tIt != tEnd; ++tIt) + { + float dx = posX - tIt->baseX; + float dy = posY - tIt->baseY; + float dz = posZ - tIt->baseZ; + + float deltaDotEdge0 = dx * tIt->edge0X + dy * tIt->edge0Y + dz * tIt->edge0Z; + float deltaDotEdge1 = dx * tIt->edge1X + dy * tIt->edge1Y + dz * tIt->edge1Z; + float deltaDotNormal = dx * tIt->normalX + dy * tIt->normalY + dz * tIt->normalZ; + + float s = tIt->edge1SqrLength * deltaDotEdge0 - tIt->edge0DotEdge1 * deltaDotEdge1; + float t = tIt->edge0SqrLength * deltaDotEdge1 - tIt->edge0DotEdge1 * deltaDotEdge0; + + s = t > 0.0f ? s * tIt->det : deltaDotEdge0 * tIt->edge0InvSqrLength; + t = s > 0.0f ? 
t * tIt->det : deltaDotEdge1 * tIt->edge1InvSqrLength; + + if(s + t > 1.0f) + { + s = (tIt->edge1SqrLength - tIt->edge0DotEdge1 + deltaDotEdge0 - deltaDotEdge1) * tIt->denom; + } + + s = fmaxf(0.0f, fminf(1.0f, s)); + t = fmaxf(0.0f, fminf(1.0f - s, t)); + + dx = dx - tIt->edge0X * s - tIt->edge1X * t; + dy = dy - tIt->edge0Y * s - tIt->edge1Y * t; + dz = dz - tIt->edge0Z * s - tIt->edge1Z * t; + + float sqrLength = dx * dx + dy * dy + dz * dz; + + if(0.0f > deltaDotNormal) + sqrLength *= 1.0001f; + + if(sqrLength < minSqrLength) + { + normalX = tIt->normalX; + normalY = tIt->normalY; + normalZ = tIt->normalZ; + normalD = deltaDotNormal; + minSqrLength = sqrLength; + } + } + + if(normalD < 0.0f) + { + current(i, 0) = posX - normalX * normalD; + current(i, 1) = posY - normalY * normalD; + current(i, 2) = posZ - normalZ * normalD; + } +} + +namespace +{ +static const int32_t sTrianglePadding = sizeof(TriangleData) / sizeof(float) - 9; +} + +template <typename CurrentT> +__device__ void CuCollision::collideTriangles(CurrentT& current, float alpha) +{ + if(!gClothData.mNumCollisionTriangles) + return; + + // interpolate triangle vertices and store in shared memory + for(int32_t i = threadIdx.x, n = gClothData.mNumCollisionTriangles * 9; i < n; i += blockDim.x) + { + float start = gFrameData.mStartCollisionTriangles[i]; + float target = gFrameData.mTargetCollisionTriangles[i]; + int32_t idx = i * 7282 >> 16; // same as i/9 + int32_t offset = i + idx * sTrianglePadding; + mCurData.mSphereX[offset] = start + (target - start) * alpha; + } + + __syncthreads(); + + for(int32_t i = threadIdx.x; i < gClothData.mNumCollisionTriangles; i += blockDim.x) + { + reinterpret_cast<TriangleData*>(generic(mCurData.mSphereX))[i].initialize(); + } + + __syncthreads(); + + for(int32_t i = threadIdx.x; i < gClothData.mNumParticles; i += blockDim.x) + collideTriangles(current, i); + + __syncthreads(); +} diff --git a/PhysX_3.4/Source/LowLevelCloth/src/windows/CuContextLock.cpp 
b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuContextLock.cpp new file mode 100644 index 00000000..2ccc3db9 --- /dev/null +++ b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuContextLock.cpp @@ -0,0 +1,54 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 
+ +#include "cudamanager/PxCudaContextManager.h" +#include "CuContextLock.h" +#include "CuFactory.h" + +using namespace physx; + +cloth::CuContextLock::CuContextLock(const CuFactory& factory) : mFactory(factory) +{ + acquire(); +} + +cloth::CuContextLock::~CuContextLock() +{ + release(); +} + +void cloth::CuContextLock::acquire() +{ + mFactory.mContextManager->acquireContext(); +} + +void cloth::CuContextLock::release() +{ + mFactory.mContextManager->releaseContext(); +} diff --git a/PhysX_3.4/Source/LowLevelCloth/src/windows/CuContextLock.h b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuContextLock.h new file mode 100644 index 00000000..50e48b49 --- /dev/null +++ b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuContextLock.h @@ -0,0 +1,57 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. 
No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#pragma once + +namespace physx +{ + +namespace cloth +{ + +class CuFactory; + +// acquires cuda context for the lifetime of the instance +class CuContextLock +{ + protected: + CuContextLock(const CuContextLock&); + CuContextLock& operator=(const CuContextLock&); + + public: + CuContextLock(const CuFactory&); + ~CuContextLock(); + + void acquire(); + void release(); + + const CuFactory& mFactory; +}; +} +} diff --git a/PhysX_3.4/Source/LowLevelCloth/src/windows/CuDevicePointer.h b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuDevicePointer.h new file mode 100644 index 00000000..cb37b39d --- /dev/null +++ b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuDevicePointer.h @@ -0,0 +1,216 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". 
NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 
+ +#pragma once + +#include <cuda.h> +#include "CuCheckSuccess.h" + +namespace physx +{ + +namespace cloth +{ + +template <typename T> +struct RemoveConst +{ + typedef T Type; +}; +template <typename T> +struct RemoveConst<const T> +{ + typedef T Type; +}; + +template <typename> +class CuDeviceReference; // forward declare + +// pointer to POD type in CUDA device memory +template <typename T> +class CuDevicePointer +{ + template <typename> + friend class CuDevicePointer; + + typedef typename RemoveConst<T>::Type ValueType; + + public: + // c'tors + CuDevicePointer() : mPointer(0) + { + } + template <class U> + explicit CuDevicePointer(U* ptr) + : mPointer(ptr) + { + } + CuDevicePointer(const CuDevicePointer<ValueType>& ptr) : mPointer(ptr.get()) + { + } + + // conversion + template <typename U> + operator CuDevicePointer<U>(void) const + { + return CuDevicePointer<U>(static_cast<U*>(mPointer)); + } + T* get() const + { + return mPointer; + } + CUdeviceptr dev() const + { + return reinterpret_cast<CUdeviceptr>(mPointer); + } + + // operators + CuDevicePointer operator+(const ptrdiff_t& rhs) const + { + return CuDevicePointer(mPointer + rhs); + } + CuDevicePointer operator-(const ptrdiff_t& rhs) const + { + return CuDevicePointer(mPointer - rhs); + } + CuDevicePointer& operator++(void) + { + ++mPointer; + return *this; + } + CuDevicePointer operator++(int) + { + CuDevicePointer copy(*this); + ++(*this); + return copy; + } + CuDevicePointer& operator--(void) + { + --mPointer; + return *this; + } + CuDevicePointer operator--(int) + { + CuDevicePointer copy(*this); + --(*this); + return copy; + } + CuDevicePointer& operator+=(ptrdiff_t rhs) + { + mPointer += rhs; + return *this; + } + CuDevicePointer& operator-=(ptrdiff_t rhs) + { + mPointer -= rhs; + return *this; + } + ptrdiff_t operator-(const CuDevicePointer& rhs) const + { + return mPointer - rhs.mPointer; + } + + template <typename U> + bool operator==(const CuDevicePointer<U>& other) const + { + return mPointer 
== other.mPointer; + } + template <typename U> + bool operator!=(const CuDevicePointer<U>& other) const + { + return mPointer != other.mPointer; + } + + // dereference + CuDeviceReference<T> operator[](const ptrdiff_t&) const; // (implemented below) + CuDeviceReference<T> operator*(void) const + { + return operator[](0); + } + + private: + T* mPointer; +}; + +template <typename T> +class CuDeviceReference +{ + template <typename> + friend class CuDeviceReference; + template <typename> + friend class CuDevicePointer; + + typedef typename RemoveConst<T>::Type ValueType; + + template <typename U> + CuDeviceReference(CuDevicePointer<U> pointer) + : mPointer(static_cast<T*>(pointer.get())) + { + } + + public: + template <typename U> + CuDeviceReference(CuDeviceReference<U> reference) + : mPointer(static_cast<T*>(reference.mPointer)) + { + } + + CuDevicePointer<T> operator&() const + { + return CuDevicePointer<T>(mPointer); + } + + CuDeviceReference& operator=(const T& v) + { + checkSuccess(cuMemcpyHtoD(CUdeviceptr(mPointer), &v, sizeof(T))); + return *this; + } + CuDeviceReference& operator=(const CuDeviceReference& ref) + { + checkSuccess(cuMemcpyDtoD(CUdeviceptr(mPointer), CUdeviceptr(ref.mPointer), sizeof(T))); + return *this; + } + operator ValueType() const + { + ValueType result; + checkSuccess(cuMemcpyDtoH(&result, CUdeviceptr(mPointer), sizeof(T))); + return result; + } + + private: + T* mPointer; +}; +} + +template <typename T> +cloth::CuDeviceReference<T> cloth::CuDevicePointer<T>::operator[](const ptrdiff_t& i) const +{ + return CuDeviceReference<T>(*this + i); +} +} diff --git a/PhysX_3.4/Source/LowLevelCloth/src/windows/CuDeviceVector.h b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuDeviceVector.h new file mode 100644 index 00000000..e3997d26 --- /dev/null +++ b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuDeviceVector.h @@ -0,0 +1,258 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software 
license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 
+ +#pragma once + +#include "foundation/PxMath.h" // for swap +#include "cudamanager/PxCudaMemoryManager.h" +#include "cudamanager/PxCudaContextManager.h" +#include "CuDevicePointer.h" +#include "PsArray.h" +#include "PsUtilities.h" + +namespace physx +{ +#if PX_VC +#pragma warning(push) +#pragma warning(disable : 4365) // 'action' : conversion from 'type_1' to 'type_2', signed/unsigned mismatch +#endif + +namespace cloth +{ + +// STL-style vector that holds POD types in CUDA device memory. The interface +// is not complete, add whatever you need from the std::vector interface. +template <typename T> +class CuDeviceVector +{ + public: + typedef CuDevicePointer<T> iterator; + typedef CuDevicePointer<const T> const_iterator; + + CuDeviceVector(physx::PxCudaContextManager* ctx) : mManager(0) + { + PX_ASSERT(ctx); + + if(ctx) + mManager = ctx->getMemoryManager(); + } + + CuDeviceVector(const CuDeviceVector& other) : mManager(other.getMemoryManager()) + { + PX_ASSERT(mManager); + + operator=(other); + } + + CuDeviceVector(physx::PxCudaContextManager* ctx, const T* first, const T* last) : mManager(0) + { + PX_ASSERT(ctx); + + if(ctx) + { + mManager = ctx->getMemoryManager(); + assign(first, last); + } + } + + template <typename Alloc> + CuDeviceVector(const shdfnd::Array<T, Alloc>& other) + { + operator=(other); + } + + ~CuDeviceVector() + { + PX_ASSERT(mManager); + + mManager->free(physx::PxCudaBufferMemorySpace::T_GPU, mFirst.dev()); + } + + CuDeviceVector& operator=(const CuDeviceVector& other) + { + resize(other.size()); + checkSuccess(cuMemcpyDtoD(mFirst.dev(), other.mFirst.dev(), other.size() * sizeof(T))); + return *this; + } + + template <typename Alloc> + CuDeviceVector& operator=(const shdfnd::Array<T, Alloc>& other) + { + const T* first = other.empty() ? 
0 : &other.front(); + assign(first, first + other.size()); + return *this; + } + + bool empty() const + { + return mLast == mFirst; + } + size_t size() const + { + return size_t(mLast - mFirst); + } + size_t capacity() const + { + return mEnd - mFirst; + } + + iterator begin() + { + return mFirst; + } + iterator end() + { + return mLast; + } + const_iterator begin() const + { + return mFirst; + } + const_iterator end() const + { + return mLast; + } + + void push_back(const T& v) + { + if(mLast == mEnd) + reserve(PxMax<size_t>(1, capacity() * 2)); + + *mLast++ = v; + } + + void push_back(const T* first, const T* last) + { + if(mEnd - mLast < last - first) + reserve(PxMax<size_t>(2 * capacity(), mLast - mFirst + last - first)); + + if(first != last) + checkSuccess(cuMemcpyHtoD(mLast.dev(), first, sizeof(T) * (last - first))); + + mLast += last - first; + } + + void erase(iterator it) + { + size_t byteSize = (mLast - it - 1) * sizeof(T); + if(byteSize) + { + CUdeviceptr tmp = 0, dst = it.dev(); + + PX_ASSERT(mManager); + + tmp = mManager->alloc(physx::PxCudaBufferMemorySpace::T_GPU, byteSize, + PX_ALLOC_INFO("cloth::CuDeviceVector::T_GPU", CLOTH)); + checkSuccess(cuMemcpyDtoD(tmp, dst + sizeof(T), byteSize)); + checkSuccess(cuMemcpyDtoD(dst, tmp, byteSize)); + mManager->free(physx::PxCudaBufferMemorySpace::T_GPU, tmp); + } + --mLast; + } + + void reserve(size_t n) + { + if(n <= capacity()) + return; + + CUdeviceptr newFirst = 0, oldFirst = mFirst.dev(); + + PX_ASSERT(mManager); + + newFirst = mManager->alloc(physx::PxCudaBufferMemorySpace::T_GPU, sizeof(T) * n, + PX_ALLOC_INFO("cloth::CuDeviceVector::T_GPU", CLOTH)); + checkSuccess(cuMemcpyDtoD(newFirst, oldFirst, sizeof(T) * size())); + mManager->free(physx::PxCudaBufferMemorySpace::T_GPU, oldFirst); + + iterator first(reinterpret_cast<T*>(newFirst)); + mEnd = first + n; + mLast = first + size(); + mFirst = first; + } + + void resize(size_t n) + { + if(capacity() < n) + reserve(PxMax(n, capacity() * 2)); + + mLast = 
mFirst + n; + } + + void assign(const T* first, const T* last) + { + size_t n = last - first; + resize(n); + checkSuccess(cuMemcpyHtoD(mFirst.dev(), first, n * sizeof(T))); + } + + void swap(CuDeviceVector& other) + { + shdfnd::swap(mFirst, other.mFirst); + shdfnd::swap(mLast, other.mLast); + shdfnd::swap(mEnd, other.mEnd); + } + + // match PxArray interface + void remove(size_t i) + { + erase(begin() + i); + } + void pushBack(const T& v) + { + push_back(v); + } + + physx::PxCudaMemoryManager* getMemoryManager() const + { + return mManager; + } + + private: + iterator mFirst, mLast, mEnd; + physx::PxCudaMemoryManager* mManager; +}; + +} // namespace cloth +} // namespace physx + +#if PX_VC +#pragma warning(pop) +#endif + +namespace physx +{ +namespace shdfnd +{ +template <typename T> +void swap(physx::cloth::CuDeviceVector<T>& first, physx::cloth::CuDeviceVector<T>& second) +{ + first.swap(second); +} +} +} diff --git a/PhysX_3.4/Source/LowLevelCloth/src/windows/CuFabric.cpp b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuFabric.cpp new file mode 100644 index 00000000..7f8326fe --- /dev/null +++ b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuFabric.cpp @@ -0,0 +1,197 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". 
NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 
+ +#include "CuFabric.h" +#include "CuContextLock.h" +#include "CuFactory.h" + +#if PX_VC +#pragma warning(disable : 4365) // 'action' : conversion from 'type_1' to 'type_2', signed/unsigned mismatch +#endif + +using namespace physx; +using namespace shdfnd; + +cloth::CuTether::CuTether(uint16_t anchor, uint16_t length) : mAnchor(anchor), mLength(length) +{ +} + +cloth::CuFabric::CuFabric(CuFactory& factory, uint32_t numParticles, Range<const uint32_t> phases, + Range<const uint32_t> sets, Range<const float> restvalues, Range<const uint32_t> indices, + Range<const uint32_t> anchors, Range<const float> tetherLengths, + Range<const uint32_t> triangles, uint32_t id) +: CuContextLock(factory) +, mFactory(factory) +, mNumParticles(numParticles) +, mPhases(mFactory.mContextManager, phases.begin(), phases.end()) +, mSets(mFactory.mContextManager) +, mRestvalues(mFactory.mContextManager, restvalues.begin(), restvalues.end()) +, mIndices(mFactory.mContextManager) +, mTethers(mFactory.mContextManager) +, mTriangles(mFactory.mContextManager) +, mId(id) +{ + // should no longer be prefixed with 0 + PX_ASSERT(sets.front() != 0); + + PX_ASSERT(sets.back() == restvalues.size()); + PX_ASSERT(restvalues.size() * 2 == indices.size()); + PX_ASSERT(mNumParticles > *maxElement(indices.begin(), indices.end())); + + // copy to device, add leading zero + mSets.reserve(sets.size() + 1); + mSets.push_back(0); + mSets.push_back(sets.begin(), sets.end()); + + // manually convert uint32_t indices to uint16_t in temp memory + Vector<uint16_t>::Type hostIndices; + hostIndices.resizeUninitialized(indices.size()); + Vector<uint16_t>::Type::Iterator dIt = hostIndices.begin(); + + const uint32_t* it = indices.begin(); + const uint32_t* end = indices.end(); + for(; it != end; ++it, ++dIt) + *dIt = uint16_t(*it); + + // copy to device vector in one go + mIndices.assign(hostIndices.begin(), hostIndices.end()); + + // gather data per phase + mNumConstraintsInPhase.reserve(phases.size()); + 
CuDevicePointer<const float> devRestvalues = mRestvalues.begin(); + CuDevicePointer<const uint16_t> devIndices = mIndices.begin(); + for(const uint32_t* pIt = phases.begin(); pIt != phases.end(); ++pIt) + { + uint32_t setIndex = *pIt; + uint32_t firstIndex = setIndex ? sets[setIndex - 1] : 0; + uint32_t lastIndex = sets[setIndex]; + mNumConstraintsInPhase.pushBack(lastIndex - firstIndex); + mRestvaluesInPhase.pushBack(devRestvalues + firstIndex); + mIndicesInPhase.pushBack(devIndices + 2 * firstIndex); + } + + // tethers + PX_ASSERT(anchors.size() == tetherLengths.size()); + mTetherLengthScale = + tetherLengths.empty() ? 1.0f : *maxElement(tetherLengths.begin(), tetherLengths.end()) / USHRT_MAX; + float inverseScale = 1 / (mTetherLengthScale + FLT_EPSILON); + Vector<CuTether>::Type tethers; + tethers.reserve(anchors.size()); + for(; !anchors.empty(); anchors.popFront(), tetherLengths.popFront()) + { + tethers.pushBack(CuTether(uint16_t(anchors.front()), uint16_t(tetherLengths.front() * inverseScale + 0.5f))); + } + mTethers.assign(tethers.begin(), tethers.end()); + + // triangles + hostIndices.resizeUninitialized(triangles.size()); + dIt = hostIndices.begin(); + + it = triangles.begin(); + end = triangles.end(); + for(; it != end; ++it, ++dIt) + *dIt = uint16_t(*it); + + mTriangles.assign(hostIndices.begin(), hostIndices.end()); + + CuContextLock::release(); + + // add to factory + mFactory.mFabrics.pushBack(this); +} + +cloth::CuFabric::~CuFabric() +{ + CuContextLock::acquire(); + + Vector<CuFabric*>::Type::Iterator fIt = mFactory.mFabrics.find(this); + + PX_ASSERT(fIt != mFactory.mFabrics.end()); + mFactory.mFabrics.replaceWithLast(fIt); +} + +cloth::Factory& physx::cloth::CuFabric::getFactory() const +{ + return mFactory; +} + +uint32_t cloth::CuFabric::getNumPhases() const +{ + return uint32_t(mPhases.size()); +} + +uint32_t cloth::CuFabric::getNumRestvalues() const +{ + return uint32_t(mRestvalues.size()); +} + +uint32_t cloth::CuFabric::getNumSets() const +{ 
+ return uint32_t(mSets.size() - 1); +} + +uint32_t cloth::CuFabric::getNumIndices() const +{ + return uint32_t(mIndices.size()); +} + +uint32_t cloth::CuFabric::getNumParticles() const +{ + return mNumParticles; +} + +uint32_t physx::cloth::CuFabric::getNumTethers() const +{ + return uint32_t(mTethers.size()); +} + +uint32_t physx::cloth::CuFabric::getNumTriangles() const +{ + return uint32_t(mTriangles.size()) / 3; +} + +void physx::cloth::CuFabric::scaleRestvalues(float scale) +{ + CuContextLock contextLock(mFactory); + + Vector<float>::Type restvalues(uint32_t(mRestvalues.size())); + mFactory.copyToHost(mRestvalues.begin().get(), mRestvalues.end().get(), restvalues.begin()); + + Vector<float>::Type::Iterator rIt, rEnd = restvalues.end(); + for(rIt = restvalues.begin(); rIt != rEnd; ++rIt) + *rIt *= scale; + + mRestvalues = restvalues; +} + +void physx::cloth::CuFabric::scaleTetherLengths(float scale) +{ + // cloth instances won't pick this up until CuClothData is dirty! + mTetherLengthScale *= scale; +} diff --git a/PhysX_3.4/Source/LowLevelCloth/src/windows/CuFabric.h b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuFabric.h new file mode 100644 index 00000000..93f787f8 --- /dev/null +++ b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuFabric.h @@ -0,0 +1,102 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". 
NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 
+
+#pragma once
+
+#include "Fabric.h"
+#include "Range.h"
+#include "Types.h"
+#include "Allocator.h"
+#include "CuContextLock.h"
+#include "CuDeviceVector.h"
+
+namespace physx
+{
+
+namespace cloth
+{
+
+// One tether constraint, packed into 32 bits: the anchor particle index and
+// the quantized rest length are each stored as 16-bit values. The length is
+// interpreted together with CuFabric::mTetherLengthScale.
+struct CuTether
+{
+	CuTether(uint16_t, uint16_t);
+	uint16_t mAnchor; // index of the particle this tether is anchored to
+	uint16_t mLength; // quantized rest length; multiplied by CuFabric::mTetherLengthScale on use
+};
+
+// CUDA implementation of cloth::Fabric: holds the constraint topology
+// (phases, sets, rest values, indices, tethers, triangles) for cloth
+// instances created from this fabric. The bulk constraint data is kept in
+// CuDeviceVector storage; CuContextLock is a private base so members that
+// touch that storage (e.g. scaleRestvalues) can acquire the CUDA context.
+class CuFabric : public UserAllocated, private CuContextLock, public Fabric
+{
+	PX_NOCOPY(CuFabric)
+  public:
+	CuFabric(CuFactory& factory, uint32_t numParticles, Range<const uint32_t> phases, Range<const uint32_t> sets,
+	         Range<const float> restvalues, Range<const uint32_t> indices, Range<const uint32_t> anchors,
+	         Range<const float> tetherLengths, Range<const uint32_t> triangles, uint32_t id);
+
+	virtual ~CuFabric();
+
+	virtual Factory& getFactory() const;
+
+	virtual uint32_t getNumPhases() const;
+	virtual uint32_t getNumRestvalues() const;
+
+	virtual uint32_t getNumSets() const;
+	virtual uint32_t getNumIndices() const;
+
+	virtual uint32_t getNumParticles() const;
+
+	virtual uint32_t getNumTethers() const;
+
+	virtual uint32_t getNumTriangles() const;
+
+	// Multiplies every rest value by 'scale' (round-trips the data through host memory).
+	virtual void scaleRestvalues(float);
+	// Cheap: only adjusts mTetherLengthScale; takes effect when CuClothData is next rebuilt.
+	virtual void scaleTetherLengths(float);
+
+  public:
+	CuFactory& mFactory;
+
+	uint32_t mNumParticles;
+
+	CuDeviceVector<uint32_t> mPhases; // index of set to use
+	CuDeviceVector<uint32_t> mSets;   // offset of first restvalue, with 0 prefix
+
+	CuDeviceVector<float> mRestvalues;
+	CuDeviceVector<uint16_t> mIndices; // constraint particle indices, 16-bit (widened to 32-bit on extraction)
+
+	CuDeviceVector<CuTether> mTethers;
+	float mTetherLengthScale; // accumulated scale applied to the quantized CuTether::mLength values
+
+	CuDeviceVector<uint16_t> mTriangles;
+
+	// Per-phase views into the constraint arrays above.
+	Vector<uint32_t>::Type mNumConstraintsInPhase;
+	Vector<CuDevicePointer<const float> >::Type mRestvaluesInPhase;
+	Vector<CuDevicePointer<const uint16_t> >::Type mIndicesInPhase;
+
+	uint32_t mId; // fabric id, assigned from getNextFabricId() by the factory
+};
+}
+}
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/windows/CuFactory.cpp b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuFactory.cpp
new file mode 100644
index 00000000..8847780e
--- /dev/null
+++ 
b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuFactory.cpp @@ -0,0 +1,398 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 
+ +#include "foundation/PxMemory.h" +#include "CuFactory.h" +#include "CuFabric.h" +#include "CuCloth.h" +#include "CuSolver.h" +#include "ClothImpl.h" +#include "CuCheckSuccess.h" +#include "CuContextLock.h" +#include "PsAllocator.h" +#include "Array.h" +#include "PsFoundation.h" +#include <cuda.h> + +#if PX_VC +#pragma warning(disable : 4061 4062) // enumerator 'identifier' in switch of enum 'enumeration' is not handled +#endif + +using namespace physx; +using namespace shdfnd; + +namespace physx +{ +namespace cloth +{ +// defined in Factory.cpp +uint32_t getNextFabricId(); + +typedef Vec4T<uint32_t> Vec4u; +} +} + +void cloth::checkSuccessImpl(CUresult err, const char* file, const int line) +{ + if(err != CUDA_SUCCESS) + { + const char* code = "Unknown"; + switch(err) + { +#define ADD_CASE(X) \ + case X: \ + code = #X; \ + break + ADD_CASE(CUDA_ERROR_INVALID_VALUE); + ADD_CASE(CUDA_ERROR_OUT_OF_MEMORY); + ADD_CASE(CUDA_ERROR_NOT_INITIALIZED); + ADD_CASE(CUDA_ERROR_DEINITIALIZED); + ADD_CASE(CUDA_ERROR_NO_DEVICE); + ADD_CASE(CUDA_ERROR_INVALID_DEVICE); + ADD_CASE(CUDA_ERROR_INVALID_IMAGE); + ADD_CASE(CUDA_ERROR_INVALID_CONTEXT); + ADD_CASE(CUDA_ERROR_MAP_FAILED); + ADD_CASE(CUDA_ERROR_UNMAP_FAILED); + ADD_CASE(CUDA_ERROR_ARRAY_IS_MAPPED); + ADD_CASE(CUDA_ERROR_ALREADY_MAPPED); + ADD_CASE(CUDA_ERROR_NO_BINARY_FOR_GPU); + ADD_CASE(CUDA_ERROR_ALREADY_ACQUIRED); + ADD_CASE(CUDA_ERROR_NOT_MAPPED); + ADD_CASE(CUDA_ERROR_NOT_MAPPED_AS_ARRAY); + ADD_CASE(CUDA_ERROR_NOT_MAPPED_AS_POINTER); + ADD_CASE(CUDA_ERROR_ECC_UNCORRECTABLE); + ADD_CASE(CUDA_ERROR_UNSUPPORTED_LIMIT); + ADD_CASE(CUDA_ERROR_INVALID_SOURCE); + ADD_CASE(CUDA_ERROR_FILE_NOT_FOUND); + ADD_CASE(CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND); + ADD_CASE(CUDA_ERROR_SHARED_OBJECT_INIT_FAILED); + ADD_CASE(CUDA_ERROR_OPERATING_SYSTEM); + ADD_CASE(CUDA_ERROR_INVALID_HANDLE); + ADD_CASE(CUDA_ERROR_NOT_FOUND); + ADD_CASE(CUDA_ERROR_NOT_READY); + ADD_CASE(CUDA_ERROR_LAUNCH_FAILED); + 
ADD_CASE(CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES); + ADD_CASE(CUDA_ERROR_LAUNCH_TIMEOUT); + ADD_CASE(CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING); + default: + ADD_CASE(CUDA_ERROR_UNKNOWN); +#undef ADD_CASE + } + shdfnd::getFoundation().error(PxErrorCode::eINTERNAL_ERROR, file, line, "CUDA error: %s", code); + } +} + +namespace +{ +// returns max threads as specified by launch bounds in CuSolverKernel.cu +uint32_t getMaxThreadsPerBlock(const physx::PxCudaContextManager& contextManager) +{ + if(contextManager.supportsArchSM30()) + return 1024; + + if(contextManager.supportsArchSM20()) + return 512; + + return 192; +} +} + +cloth::CuFactory::CuFactory(physx::PxCudaContextManager* contextManager) +: Factory(CUDA) +, mContextManager(contextManager) +, mNumThreadsPerBlock(getMaxThreadsPerBlock(*contextManager)) +, mMaxThreadsPerBlock(mNumThreadsPerBlock) +{ +} + +cloth::CuFactory::~CuFactory() +{ +} + +cloth::Fabric* cloth::CuFactory::createFabric(uint32_t numParticles, Range<const uint32_t> phases, + Range<const uint32_t> sets, Range<const float> restvalues, + Range<const uint32_t> indices, Range<const uint32_t> anchors, + Range<const float> tetherLengths, Range<const uint32_t> triangles) +{ + return new CuFabric(*this, numParticles, phases, sets, restvalues, indices, anchors, tetherLengths, triangles, + getNextFabricId()); +} + +cloth::Cloth* cloth::CuFactory::createCloth(Range<const PxVec4> particles, Fabric& fabric) +{ + return new CuClothImpl(*this, fabric, particles); +} + +cloth::Solver* cloth::CuFactory::createSolver(physx::PxTaskManager*) +{ + CuSolver* solver = new CuSolver(*this); + + if(solver->hasError()) + { + delete solver; + return NULL; + } + + return solver; +} + +// CuFactory::clone() implemented in CuClothClone.cpp + +void cloth::CuFactory::copyToHost(const void* srcIt, const void* srcEnd, void* dstIt) const +{ + CuContextLock contextLock(*this); + + checkSuccess(cuMemcpyDtoH(dstIt, CUdeviceptr(srcIt), size_t(intptr_t(srcEnd) - intptr_t(srcIt)))); +} + +void 
cloth::CuFactory::extractFabricData(const Fabric& fabric, Range<uint32_t> phases, Range<uint32_t> sets, + Range<float> restvalues, Range<uint32_t> indices, Range<uint32_t> anchors, + Range<float> tetherLengths, Range<uint32_t> triangles) const +{ + CuContextLock contextLock(*this); + + const CuFabric& cuFabric = static_cast<const CuFabric&>(fabric); + + if(!phases.empty()) + { + PX_ASSERT(phases.size() == cuFabric.mPhases.size()); + const uint32_t* devicePhases = cuFabric.mPhases.begin().get(); + copyToHost(devicePhases, devicePhases + cuFabric.mPhases.size(), phases.begin()); + } + + if(!restvalues.empty()) + { + PX_ASSERT(restvalues.size() == cuFabric.mRestvalues.size()); + const float* deviceRestvalues = cuFabric.mRestvalues.begin().get(); + copyToHost(deviceRestvalues, deviceRestvalues + cuFabric.mRestvalues.size(), restvalues.begin()); + } + + if(!sets.empty()) + { + PX_ASSERT(sets.size() == cuFabric.mSets.size() - 1); + const uint32_t* deviceSets = cuFabric.mSets.begin().get(); + copyToHost(deviceSets + 1, deviceSets + cuFabric.mSets.size(), sets.begin()); + } + + if(!indices.empty()) + { + PX_ASSERT(indices.size() == cuFabric.mIndices.size()); + const uint16_t* deviceIndices = cuFabric.mIndices.begin().get(); + uint16_t* hostIndices = reinterpret_cast<uint16_t*>(indices.begin()); + copyToHost(deviceIndices, deviceIndices + cuFabric.mIndices.size(), hostIndices); + + // convert from 16bit to 32bit indices + for(uint32_t i = indices.size(); 0 < i--;) + indices[i] = hostIndices[i]; + } + + if(!anchors.empty() || !tetherLengths.empty()) + { + uint32_t numTethers = uint32_t(cuFabric.mTethers.size()); + Vector<CuTether>::Type tethers(numTethers, CuTether(0, 0)); + const CuTether* deviceTethers = cuFabric.mTethers.begin().get(); + copyToHost(deviceTethers, deviceTethers + numTethers, tethers.begin()); + + PX_ASSERT(anchors.empty() || anchors.size() == tethers.size()); + for(uint32_t i = 0; !anchors.empty(); ++i, anchors.popFront()) + anchors.front() = 
tethers[i].mAnchor; + + PX_ASSERT(tetherLengths.empty() || tetherLengths.size() == tethers.size()); + for(uint32_t i = 0; !tetherLengths.empty(); ++i, tetherLengths.popFront()) + tetherLengths.front() = tethers[i].mLength * cuFabric.mTetherLengthScale; + } + + if(!triangles.empty()) + { + // todo triangles + } +} + +void cloth::CuFactory::extractCollisionData(const Cloth& cloth, Range<PxVec4> spheres, Range<uint32_t> capsules, + Range<PxVec4> planes, Range<uint32_t> convexes, Range<PxVec3> triangles) const +{ + PX_ASSERT(&cloth.getFactory() == this); + + const CuCloth& cuCloth = static_cast<const CuClothImpl&>(cloth).mCloth; + + PX_ASSERT(spheres.empty() || spheres.size() == cuCloth.mStartCollisionSpheres.size()); + PX_ASSERT(capsules.empty() || capsules.size() == cuCloth.mCapsuleIndices.size() * 2); + PX_ASSERT(planes.empty() || planes.size() == cuCloth.mStartCollisionPlanes.size()); + PX_ASSERT(convexes.empty() || convexes.size() == cuCloth.mConvexMasks.size()); + PX_ASSERT(triangles.empty() || triangles.size() == cuCloth.mStartCollisionTriangles.size()); + + // collision spheres are in pinned memory, so memcpy directly + if(!cuCloth.mStartCollisionSpheres.empty() && !spheres.empty()) + memcpy(spheres.begin(), &cuCloth.mStartCollisionSpheres.front(), + cuCloth.mStartCollisionSpheres.size() * sizeof(PxVec4)); + + if(!cuCloth.mCapsuleIndices.empty() && !capsules.empty()) + memcpy(capsules.begin(), &cuCloth.mCapsuleIndices.front(), cuCloth.mCapsuleIndices.size() * sizeof(IndexPair)); + + if(!cuCloth.mStartCollisionPlanes.empty() && !planes.empty()) + memcpy(planes.begin(), &cuCloth.mStartCollisionPlanes.front(), + cuCloth.mStartCollisionPlanes.size() * sizeof(PxVec4)); + + if(!cuCloth.mConvexMasks.empty() && !convexes.empty()) + memcpy(convexes.begin(), &cuCloth.mConvexMasks.front(), cuCloth.mConvexMasks.size() * sizeof(uint32_t)); + + if(!cuCloth.mStartCollisionTriangles.empty() && !triangles.empty()) + memcpy(triangles.begin(), 
&cuCloth.mStartCollisionTriangles.front(), + cuCloth.mStartCollisionTriangles.size() * sizeof(PxVec3)); +} + +void cloth::CuFactory::extractMotionConstraints(const Cloth& cloth, Range<PxVec4> destConstraints) const +{ + PX_ASSERT(&cloth.getFactory() == this); + + const CuCloth& cuCloth = static_cast<const CuClothImpl&>(cloth).mCloth; + + if(cuCloth.mMotionConstraints.mHostCopy.size()) + { + PX_ASSERT(destConstraints.size() == cuCloth.mMotionConstraints.mHostCopy.size()); + + PxMemCopy(destConstraints.begin(), cuCloth.mMotionConstraints.mHostCopy.begin(), + sizeof(PxVec4) * cuCloth.mMotionConstraints.mHostCopy.size()); + } + else + { + CuContextLock contextLock(*this); + + CuDeviceVector<PxVec4> const& srcConstraints = !cuCloth.mMotionConstraints.mTarget.empty() + ? cuCloth.mMotionConstraints.mTarget + : cuCloth.mMotionConstraints.mStart; + + PX_ASSERT(destConstraints.size() == srcConstraints.size()); + + copyToHost(srcConstraints.begin().get(), srcConstraints.end().get(), destConstraints.begin()); + } +} + +void cloth::CuFactory::extractSeparationConstraints(const Cloth& cloth, Range<PxVec4> destConstraints) const +{ + PX_ASSERT(&cloth.getFactory() == this); + + const CuCloth& cuCloth = static_cast<const CuClothImpl&>(cloth).mCloth; + + if(cuCloth.mSeparationConstraints.mHostCopy.size()) + { + PX_ASSERT(destConstraints.size() == cuCloth.mSeparationConstraints.mHostCopy.size()); + + PxMemCopy(destConstraints.begin(), cuCloth.mSeparationConstraints.mHostCopy.begin(), + sizeof(PxVec4) * cuCloth.mSeparationConstraints.mHostCopy.size()); + } + else + { + CuContextLock contextLock(*this); + + CuDeviceVector<PxVec4> const& srcConstraints = !cuCloth.mSeparationConstraints.mTarget.empty() + ? 
cuCloth.mSeparationConstraints.mTarget + : cuCloth.mSeparationConstraints.mStart; + + PX_ASSERT(destConstraints.size() == srcConstraints.size()); + + copyToHost(srcConstraints.begin().get(), srcConstraints.end().get(), destConstraints.begin()); + } +} + +void cloth::CuFactory::extractParticleAccelerations(const Cloth& cloth, Range<PxVec4> destAccelerations) const +{ + PX_ASSERT(&cloth.getFactory() == this); + + const CuCloth& cuCloth = static_cast<const CuClothImpl&>(cloth).mCloth; + + if(cuCloth.mParticleAccelerationsHostCopy.size()) + { + PX_ASSERT(destAccelerations.size() == cuCloth.mParticleAccelerationsHostCopy.size()); + + PxMemCopy(destAccelerations.begin(), cuCloth.mParticleAccelerationsHostCopy.begin(), + sizeof(PxVec4) * cuCloth.mParticleAccelerationsHostCopy.size()); + } +} + +void cloth::CuFactory::extractVirtualParticles(const Cloth& cloth, Range<uint32_t[4]> destIndices, + Range<PxVec3> destWeights) const +{ + PX_ASSERT(&cloth.getFactory() == this); + + CuContextLock contextLock(*this); + + const CuCloth& cuCloth = static_cast<const CuClothImpl&>(cloth).mCloth; + + if(destWeights.size() > 0) + { + uint32_t numWeights = cloth.getNumVirtualParticleWeights(); + + Vector<PxVec4>::Type hostWeights(numWeights, PxVec4(0.0f)); + copyToHost(cuCloth.mVirtualParticleWeights.begin().get(), cuCloth.mVirtualParticleWeights.end().get(), + &hostWeights.front()); + + // convert weights to Vec3f + PxVec3* destIt = reinterpret_cast<PxVec3*>(destWeights.begin()); + Vector<PxVec4>::Type::ConstIterator srcIt = hostWeights.begin(); + Vector<PxVec4>::Type::ConstIterator srcEnd = srcIt + numWeights; + for(; srcIt != srcEnd; ++srcIt, ++destIt) + *destIt = reinterpret_cast<const PxVec3&>(*srcIt); + + PX_ASSERT(destIt <= destWeights.end()); + } + + if(destIndices.size() > 0) + { + uint32_t numIndices = cloth.getNumVirtualParticles(); + + Vector<Vec4us>::Type hostIndices(numIndices); + copyToHost(cuCloth.mVirtualParticleIndices.begin().get(), 
cuCloth.mVirtualParticleIndices.end().get(), + &hostIndices.front()); + + // convert indices to 32 bit + Vec4u* destIt = reinterpret_cast<Vec4u*>(destIndices.begin()); + Vector<Vec4us>::Type::ConstIterator srcIt = hostIndices.begin(); + Vector<Vec4us>::Type::ConstIterator srcEnd = srcIt + numIndices; + for(; srcIt != srcEnd; ++srcIt, ++destIt) + *destIt = Vec4u(*srcIt); + + PX_ASSERT(&array(*destIt) <= destIndices.end()); + } +} + +void cloth::CuFactory::extractSelfCollisionIndices(const Cloth& cloth, Range<uint32_t> destIndices) const +{ + const CuCloth& cuCloth = static_cast<const CuClothImpl&>(cloth).mCloth; + PX_ASSERT(destIndices.size() == cuCloth.mSelfCollisionIndices.size()); + copyToHost(cuCloth.mSelfCollisionIndices.begin().get(), cuCloth.mSelfCollisionIndices.end().get(), + destIndices.begin()); +} + +void cloth::CuFactory::extractRestPositions(const Cloth& cloth, Range<PxVec4> destRestPositions) const +{ + const CuCloth& cuCloth = static_cast<const CuClothImpl&>(cloth).mCloth; + PX_ASSERT(destRestPositions.size() == cuCloth.mRestPositions.size()); + copyToHost(cuCloth.mRestPositions.begin().get(), cuCloth.mRestPositions.end().get(), destRestPositions.begin()); +} diff --git a/PhysX_3.4/Source/LowLevelCloth/src/windows/CuFactory.h b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuFactory.h new file mode 100644 index 00000000..e868034f --- /dev/null +++ b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuFactory.h @@ -0,0 +1,107 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. 
Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 
+
+#pragma once
+
+#include "Factory.h"
+#include "Allocator.h"
+
+namespace physx
+{
+class PxCudaContextManager;
+}
+
+namespace physx
+{
+
+namespace cloth
+{
+
+class CuFabric;
+class CuCloth;
+template <typename>
+class ClothImpl;
+
+// CUDA implementation of cloth::Factory. Creates CuFabric / CuCloth /
+// CuSolver objects bound to a PxCudaContextManager, and implements the
+// extract* methods by copying data from device buffers back into the
+// caller-provided host ranges.
+class CuFactory : public UserAllocated, public Factory
+{
+  protected:
+	// declared but not defined: assignment is intentionally unavailable
+	CuFactory& operator=(const CuFactory&);
+
+  public:
+	typedef CuFabric FabricType;
+	typedef ClothImpl<CuCloth> ImplType;
+
+	CuFactory(physx::PxCudaContextManager*);
+	virtual ~CuFactory();
+
+	virtual Fabric* createFabric(uint32_t numParticles, Range<const uint32_t> phases, Range<const uint32_t> sets,
+	                             Range<const float> restvalues, Range<const uint32_t> indices,
+	                             Range<const uint32_t> anchors, Range<const float> tetherLengths,
+	                             Range<const uint32_t> triangles);
+
+	virtual Cloth* createCloth(Range<const PxVec4> particles, Fabric& fabric);
+
+	// Returns NULL if solver construction reported an error.
+	virtual Solver* createSolver(physx::PxTaskManager* taskMgr);
+
+	// implemented in CuClothClone.cpp
+	virtual Cloth* clone(const Cloth& cloth);
+
+	// Each non-empty destination range must match the size of the corresponding
+	// fabric array (asserted); empty ranges are skipped.
+	virtual void extractFabricData(const Fabric& fabric, Range<uint32_t> phases, Range<uint32_t> sets,
+	                               Range<float> restvalues, Range<uint32_t> indices, Range<uint32_t> anchors,
+	                               Range<float> tetherLengths, Range<uint32_t> triangles) const;
+
+	virtual void extractCollisionData(const Cloth& cloth, Range<PxVec4> spheres, Range<uint32_t> capsules,
+	                                  Range<PxVec4> planes, Range<uint32_t> convexes, Range<PxVec3> triangles) const;
+
+	virtual void extractMotionConstraints(const Cloth& cloth, Range<PxVec4> destConstraints) const;
+
+	virtual void extractSeparationConstraints(const Cloth& cloth, Range<PxVec4> destConstraints) const;
+
+	virtual void extractParticleAccelerations(const Cloth& cloth, Range<PxVec4> destAccelerations) const;
+
+	virtual void extractVirtualParticles(const Cloth& cloth, Range<uint32_t[4]> destIndices,
+	                                     Range<PxVec3> destWeights) const;
+
+	virtual void extractSelfCollisionIndices(const Cloth& cloth, Range<uint32_t> destIndices) const;
+
+	virtual void extractRestPositions(const Cloth& cloth, Range<PxVec4> destRestPositions) const;
+
+  public:
+	// Synchronous device-to-host copy of the byte range [srcIt, srcEnd);
+	// acquires the CUDA context for the duration of the copy.
+	void copyToHost(const void* srcIt, const void* srcEnd, void* dstIt) const;
+
+  public:
+	Vector<CuFabric*>::Type mFabrics; // fabrics created by this factory (presumably registered/unregistered by CuFabric — confirm in CuFabric.cpp)
+
+	physx::PxCudaContextManager* mContextManager;
+
+	// Both initialized from the device's launch-bounds limit (192/512/1024
+	// for pre-SM20/SM20/SM30); mNumThreadsPerBlock is the mutable working value.
+	uint32_t mNumThreadsPerBlock;
+
+	const uint32_t mMaxThreadsPerBlock;
+};
+}
+}
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/windows/CuPhaseConfig.h b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuPhaseConfig.h
new file mode 100644
index 00000000..74470bde
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuPhaseConfig.h
@@ -0,0 +1,51 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#include "Types.h"
+
+namespace physx
+{
+namespace cloth
+{
+
+// Plain-data description of one constraint phase as consumed by the CUDA
+// solver. The constraint fields mirror CuFabric's per-phase arrays
+// (mNumConstraintsInPhase / mRestvaluesInPhase / mIndicesInPhase); the
+// pointers are presumably device addresses — confirm where this struct is
+// filled in (CuCloth/CuClothData).
+struct CuPhaseConfig
+{
+	float mStiffness;
+	float mStiffnessMultiplier;
+	float mCompressionLimit;
+	float mStretchLimit;
+
+	uint32_t mNumConstraints;  // number of constraints in this phase
+	const float* mRestvalues;  // rest value per constraint
+	const uint16_t* mIndices;  // two 16-bit particle indices per constraint
+};
+}
+}
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/windows/CuPinnedAllocator.h b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuPinnedAllocator.h
new file mode 100644
index 00000000..57dd6731
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuPinnedAllocator.h
@@ -0,0 +1,132 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#include "cudamanager/PxCudaContextManager.h"
+#include "cudamanager/PxCudaMemoryManager.h"
+#include "Allocator.h"
+#include "CuCheckSuccess.h"
+#include <cuda.h>
+
+namespace physx
+{
+
+namespace cloth
+{
+
+// Array-compatible allocator that obtains memory through the context
+// manager's PxCudaMemoryManager. The cudaHostAlloc* flags select the
+// memory space: write-combined, mapped/pinned, or plain host. For mapped
+// allocations the matching device address is cached in mDevicePtr so it
+// can be handed to kernels (see getDevicePointer below).
+struct CuHostAllocator
+{
+	CuHostAllocator(physx::PxCudaContextManager* ctx = NULL, unsigned int flags = cudaHostAllocDefault)
+	: mDevicePtr(0), mFlags(flags), mManager(0)
+	{
+		// ctx is expected to be non-null; the default argument only exists so
+		// the allocator is default-constructible for the Array template.
+		PX_ASSERT(ctx);
+
+		if(ctx)
+			mManager = ctx->getMemoryManager();
+	}
+
+	// Allocate n bytes from the memory space selected by mFlags.
+	// (The name/line parameters of the allocator concept are unused here;
+	// PX_ALLOC_INFO supplies fixed allocation tags instead.)
+	void* allocate(size_t n, const char*, int)
+	{
+		physx::PxCudaBufferPtr bufferPtr;
+
+		PX_ASSERT(mManager);
+
+		if(mFlags & cudaHostAllocWriteCombined)
+			bufferPtr = mManager->alloc(physx::PxCudaBufferMemorySpace::T_WRITE_COMBINED, n,
+			                            PX_ALLOC_INFO("cloth::CuHostAllocator::T_WRITE_COMBINED", CLOTH));
+		else if(mFlags & cudaHostAllocMapped)
+			bufferPtr = mManager->alloc(physx::PxCudaBufferMemorySpace::T_PINNED_HOST, n,
+			                            PX_ALLOC_INFO("cloth::CuHostAllocator::T_PINNED_HOST", CLOTH));
+		else
+			bufferPtr = mManager->alloc(physx::PxCudaBufferMemorySpace::T_HOST, n,
+			                            PX_ALLOC_INFO("cloth::CuHostAllocator::T_HOST", CLOTH));
+
+		// For mapped memory, cache the device-side alias of this host buffer.
+		if(mFlags & cudaHostAllocMapped)
+			checkSuccess(cuMemHostGetDevicePointer(&mDevicePtr, reinterpret_cast<void*>(bufferPtr), 0));
+
+		return reinterpret_cast<void*>(bufferPtr);
+	}
+
+	// Return p to the same memory space it was allocated from
+	// (mFlags must not have changed since allocate()).
+	void deallocate(void* p)
+	{
+		PX_ASSERT(mManager);
+
+		if(mFlags & cudaHostAllocWriteCombined)
+			mManager->free(physx::PxCudaBufferMemorySpace::T_WRITE_COMBINED, physx::PxCudaBufferPtr(p));
+		else if(mFlags & cudaHostAllocMapped)
+			mManager->free(physx::PxCudaBufferMemorySpace::T_PINNED_HOST, physx::PxCudaBufferPtr(p));
+		else
+			mManager->free(physx::PxCudaBufferMemorySpace::T_HOST, physx::PxCudaBufferPtr(p));
+
+		// don't reset mDevicePtr because Array::recreate deallocates last
+	}
+
+	CUdeviceptr mDevicePtr; // device pointer of last allocation
+	unsigned int mFlags;
+	physx::PxCudaMemoryManager* mManager;
+};
+
+// Convenience factory: allocator for mapped, write-combined pinned memory
+// (host-writable, device-readable). T is unused; it only mirrors the
+// element type at the call site.
+template <typename T>
+CuHostAllocator getMappedAllocator(physx::PxCudaContextManager* ctx)
+{
+	return CuHostAllocator(ctx, cudaHostAllocMapped | cudaHostAllocWriteCombined);
+}
+
+// Array type backed by CuHostAllocator.
+template <typename T>
+struct CuPinnedVector
+{
+	// note: always use shdfnd::swap() instead of Array::swap()
+	// in order to keep cached device pointer consistent
+	typedef shdfnd::Array<T, typename physx::cloth::CuHostAllocator> Type;
+};
+
+// Device-side address of a pinned vector's storage, or 0 when empty.
+template <typename T>
+T* getDevicePointer(shdfnd::Array<T, typename physx::cloth::CuHostAllocator>& vector)
+{
+	// cached device pointer only valid if non-empty
+	return vector.empty() ? 0 : reinterpret_cast<T*>(vector.getAllocator().mDevicePtr);
+}
+
+} // namespace cloth
+
+} // namespace physx
+
+namespace physx
+{
+namespace shdfnd
+{
+// swap overload that also exchanges the allocators, so each array keeps a
+// mDevicePtr that matches its (swapped) storage — see note in CuPinnedVector.
+template <typename T>
+void swap(Array<T, typename physx::cloth::CuHostAllocator>& left, Array<T, typename physx::cloth::CuHostAllocator>& right)
+{
+	swap(left.getAllocator(), right.getAllocator());
+	left.swap(right);
+}
+}
+}
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/windows/CuSelfCollision.h b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuSelfCollision.h
new file mode 100644
index 00000000..fb0fd7af
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuSelfCollision.h
@@ -0,0 +1,472 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#pragma once + +#ifndef CU_SOLVER_KERNEL_CU +#error include CuSelfCollision.h only from CuSolverKernel.cu +#endif + +#ifndef UINT16_MAX +#define UINT16_MAX 0xffff +#endif + +namespace +{ +#if __CUDA_ARCH__ >= 300 +template <int> +__device__ void scanWarp(Pointer<Shared, int32_t> counts) +{ + asm volatile("{" + " .reg .s32 tmp;" + " .reg .pred p;" + " shfl.up.b32 tmp|p, %0, 0x01, 0x0;" + "@p add.s32 %0, tmp, %0;" + " shfl.up.b32 tmp|p, %0, 0x02, 0x0;" + "@p add.s32 %0, tmp, %0;" + " shfl.up.b32 tmp|p, %0, 0x04, 0x0;" + "@p add.s32 %0, tmp, %0;" + " shfl.up.b32 tmp|p, %0, 0x08, 0x0;" + "@p add.s32 %0, tmp, %0;" + " shfl.up.b32 tmp|p, %0, 0x10, 0x0;" + "@p add.s32 %0, tmp, %0;" + "}" + : "+r"(*generic(counts)) + :); +} +#else +template <int stride> +__device__ void scanWarp(Pointer<Shared, int32_t> counts) +{ + volatile int32_t* ptr = generic(counts); + const int32_t laneIdx = threadIdx.x & warpSize - 1; + if(laneIdx >= 1) + *ptr += ptr[-stride]; + if(laneIdx >= 2) + *ptr += ptr[-2 * stride]; + if(laneIdx >= 4) + *ptr += ptr[-4 * stride]; + if(laneIdx >= 8) + *ptr += ptr[-8 * stride]; + if(laneIdx >= 16) + *ptr += ptr[-16 * stride]; +} +#endif + +// sorts array by upper 16bits +// [keys] must be at least 2*n in length, in/out in first n elements +// [histogram] must be at least 34*16 = 544 in length +__device__ void radixSort(int32_t* keys, int32_t n, Pointer<Shared, int32_t> histogram) +{ + const int32_t numWarps = blockDim.x >> 5; + const int32_t warpIdx = threadIdx.x >> 5; + const int32_t laneIdx = 
threadIdx.x & warpSize - 1; + + const uint32_t laneMask = (1u << laneIdx) - 1; + const uint32_t mask1 = (threadIdx.x & 1) - 1; + const uint32_t mask2 = !!(threadIdx.x & 2) - 1; + const uint32_t mask4 = !!(threadIdx.x & 4) - 1; + const uint32_t mask8 = !!(threadIdx.x & 8) - 1; + + const int32_t tn = (n + blockDim.x - 1) / blockDim.x; + const int32_t startIndex = tn * (threadIdx.x - laneIdx) + laneIdx; + const int32_t endIndex = min(startIndex + tn * warpSize, n + 31 & ~31); // full warps for ballot + + int32_t* srcKeys = keys; + int32_t* dstKeys = keys + n; + + Pointer<Shared, int32_t> hIt = histogram + 16 * warpIdx; + Pointer<Shared, int32_t> pIt = histogram + 16 * laneIdx + 16; + Pointer<Shared, int32_t> tIt = histogram + 16 * numWarps + laneIdx; + + for(int32_t p = 16; p < 32; p += 4) // radix passes (4 bits each) + { + // gather bucket histograms per warp + int32_t warpCount = 0; + for(int32_t i = startIndex; i < endIndex; i += 32) + { + int32_t key = i < n ? srcKeys[i] >> p : 15; + uint32_t ballot1 = __ballot(key & 1); + uint32_t ballot2 = __ballot(key & 2); + uint32_t ballot4 = __ballot(key & 4); + uint32_t ballot8 = __ballot(key & 8); + warpCount += __popc((mask1 ^ ballot1) & (mask2 ^ ballot2) & (mask4 ^ ballot4) & (mask8 ^ ballot8)); + } + + if(laneIdx >= 16) + hIt[laneIdx] = warpCount; + + __syncthreads(); + + // prefix sum of histogram buckets + for(int32_t i = warpIdx; i < 16; i += numWarps) + scanWarp<16>(pIt + i); + + __syncthreads(); + + // prefix sum of bucket totals (exclusive) + if(threadIdx.x < 16) + { + *tIt = tIt[-1] & !threadIdx.x - 1; + scanWarp<1>(tIt); + hIt[threadIdx.x] = 0; + } + + __syncthreads(); + + if(laneIdx < 16) + hIt[laneIdx] += *tIt; + + // split indices + for(int32_t i = startIndex; i < endIndex; i += 32) + { + int32_t key = i < n ? 
srcKeys[i] >> p : 15; + uint32_t ballot1 = __ballot(key & 1); + uint32_t ballot2 = __ballot(key & 2); + uint32_t ballot4 = __ballot(key & 4); + uint32_t ballot8 = __ballot(key & 8); + uint32_t bits = ((key & 1) - 1 ^ ballot1) & (!!(key & 2) - 1 ^ ballot2) & (!!(key & 4) - 1 ^ ballot4) & + (!!(key & 8) - 1 ^ ballot8); + int32_t index = hIt[key & 15] + __popc(bits & laneMask); + + if(i < n) + dstKeys[index] = srcKeys[i]; + + if(laneIdx < 16) + hIt[laneIdx] += __popc((mask1 ^ ballot1) & (mask2 ^ ballot2) & (mask4 ^ ballot4) & (mask8 ^ ballot8)); + } + + __syncthreads(); + + ::swap(srcKeys, dstKeys); + } + +#ifndef NDEBUG + for(int32_t i = threadIdx.x; i < n; i += blockDim.x) + assert(!i || keys[i - 1] >> 16 <= keys[i] >> 16); +#endif +} +} + +namespace +{ +struct CuSelfCollision +{ + template <typename CurrentT> + __device__ void operator()(CurrentT& current); + + private: + template <typename CurrentT> + __device__ void buildAcceleration(const CurrentT& current); + template <bool useRestPositions, typename CurrentT> + __device__ void collideParticles(CurrentT& current) const; + + public: + float mPosBias[3]; + float mPosScale[3]; + const float* mPosPtr[3]; +}; +} + +__shared__ uninitialized<CuSelfCollision> gSelfCollideParticles; + +template <typename CurrentT> +__device__ void CuSelfCollision::operator()(CurrentT& current) +{ + if(min(gClothData.mSelfCollisionDistance, gFrameData.mSelfCollisionStiffness) <= 0.0f) + return; + + if(threadIdx.x < 3) + { + float upper = gFrameData.mParticleBounds[threadIdx.x * 2]; + float negativeLower = gFrameData.mParticleBounds[threadIdx.x * 2 + 1]; + + // expand bounds + float eps = (upper + negativeLower) * 1e-4f; + float expandedUpper = upper + eps; + float expandedNegativeLower = negativeLower + eps; + float expandedEdgeLength = expandedUpper + expandedNegativeLower; + + float* edgeLength = mPosBias; // use as temp + edgeLength[threadIdx.x] = expandedEdgeLength; + + __threadfence_block(); + + // calculate shortest axis + int32_t 
shortestAxis = edgeLength[0] > edgeLength[1]; + if(edgeLength[shortestAxis] > edgeLength[2]) + shortestAxis = 2; + + uint32_t writeAxis = threadIdx.x - shortestAxis; + writeAxis += writeAxis >> 30; + + float maxInvCellSize = __fdividef(127.0f, expandedEdgeLength); + float invCollisionDistance = __fdividef(1.0f, gClothData.mSelfCollisionDistance); + float invCellSize = min(maxInvCellSize, invCollisionDistance); + + mPosScale[writeAxis] = invCellSize; + mPosBias[writeAxis] = invCellSize * expandedNegativeLower; + mPosPtr[writeAxis] = generic(current[threadIdx.x]); + } + + __syncthreads(); + + buildAcceleration(current); + + if(gFrameData.mRestPositions) + collideParticles<true>(current); + else + collideParticles<false>(current); +} + +template <typename CurrentT> +__device__ void CuSelfCollision::buildAcceleration(const CurrentT& current) +{ + int32_t numIndices = gClothData.mNumSelfCollisionIndices; + const int32_t* indices = reinterpret_cast<const int32_t*>(gClothData.mSelfCollisionIndices); + int32_t* sortedKeys = reinterpret_cast<int32_t*>(gClothData.mSelfCollisionKeys); + int16_t* cellStart = reinterpret_cast<int16_t*>(gClothData.mSelfCollisionCellStart); + + typedef typename CurrentT::ConstPointerType ConstPointerType; + ConstPointerType rowPtr = ConstPointerType(mPosPtr[1]); + ConstPointerType colPtr = ConstPointerType(mPosPtr[2]); + + float rowScale = mPosScale[1], rowBias = mPosBias[1]; + float colScale = mPosScale[2], colBias = mPosBias[2]; + + // calculate keys + for(int32_t i = threadIdx.x; i < numIndices; i += blockDim.x) + { + int32_t index = indices ? 
indices[i] : i; + assert(index < gClothData.mNumParticles); + + int32_t rowIndex = int32_t(max(0.0f, min(rowPtr[index] * rowScale + rowBias, 127.5f))); + int32_t colIndex = int32_t(max(0.0f, min(colPtr[index] * colScale + colBias, 127.5f))); + assert(rowIndex >= 0 && rowIndex < 128 && colIndex >= 0 && colIndex < 128); + + int32_t key = (colIndex << 7 | rowIndex) + 129; // + row and column sentinel + assert(key <= 0x4080); + + sortedKeys[i] = key << 16 | index; // (key, index) pair in a single int32_t + } + __syncthreads(); + + // get scratch shared mem buffer used for radix sort(histogram) + Pointer<Shared, int32_t> buffer = + reinterpret_cast<Pointer<Shared, int32_t> const&>(gCollideParticles.get().mCurData.mSphereX); + + // sort keys (__synchthreads inside radix sort) + radixSort(sortedKeys, numIndices, buffer); + + // mark cell start if keys are different between neighboring threads + for(int32_t i = threadIdx.x; i < numIndices; i += blockDim.x) + { + int32_t key = sortedKeys[i] >> 16; + int32_t prevKey = i ? 
sortedKeys[i - 1] >> 16 : key - 1; + if(key != prevKey) + { + cellStart[key] = i; + cellStart[prevKey + 1] = i; + } + } + __syncthreads(); +} + +template <bool useRestPositions, typename CurrentT> +__device__ void CuSelfCollision::collideParticles(CurrentT& current) const +{ + const int32_t* sortedKeys = reinterpret_cast<const int32_t*>(gClothData.mSelfCollisionKeys); + float* sortedParticles = gClothData.mSelfCollisionParticles; + int16_t* cellStart = reinterpret_cast<int16_t*>(gClothData.mSelfCollisionCellStart); + + const float cdist = gClothData.mSelfCollisionDistance; + const float cdistSq = cdist * cdist; + + const int32_t numIndices = gClothData.mNumSelfCollisionIndices; + const int32_t numParticles = gClothData.mNumParticles; + + // point to particle copied in device memory that is being updated + float* xPtr = sortedParticles; + float* yPtr = sortedParticles + numParticles; + float* zPtr = sortedParticles + 2 * numParticles; + float* wPtr = sortedParticles + 3 * numParticles; + + // copy current particles to temporary array + for(int32_t i = threadIdx.x; i < numParticles; i += blockDim.x) + { + xPtr[i] = current(i, 0); + yPtr[i] = current(i, 1); + zPtr[i] = current(i, 2); + wPtr[i] = current(i, 3); + } + __syncthreads(); + + // copy only sorted (indexed) particles to shared mem + for(int32_t i = threadIdx.x; i < numIndices; i += blockDim.x) + { + int32_t index = sortedKeys[i] & UINT16_MAX; + current(i, 0) = xPtr[index]; + current(i, 1) = yPtr[index]; + current(i, 2) = zPtr[index]; + current(i, 3) = wPtr[index]; + } + __syncthreads(); + + typedef typename CurrentT::ConstPointerType ConstPointerType; + ConstPointerType rowPtr = ConstPointerType(mPosPtr[1]); + ConstPointerType colPtr = ConstPointerType(mPosPtr[2]); + + float rowScale = mPosScale[1], rowBias = mPosBias[1]; + float colScale = mPosScale[2], colBias = mPosBias[2]; + + for(int32_t i = threadIdx.x; i < numIndices; i += blockDim.x) + { + const int32_t index = sortedKeys[i] & UINT16_MAX; + 
assert(index < gClothData.mNumParticles); + + float restX, restY, restZ; + if(useRestPositions) + { + const float* restIt = gFrameData.mRestPositions + index * 4; + restX = restIt[0]; + restY = restIt[1]; + restZ = restIt[2]; + } + + float posX = current(i, 0); + float posY = current(i, 1); + float posZ = current(i, 2); + float posW = current(i, 3); + + float deltaX = 0.0f; + float deltaY = 0.0f; + float deltaZ = 0.0f; + float deltaW = FLT_EPSILON; + + // get cell index for this particle + int32_t rowIndex = int32_t(max(0.0f, min(rowPtr[i] * rowScale + rowBias, 127.5f))); + int32_t colIndex = int32_t(max(0.0f, min(colPtr[i] * colScale + colBias, 127.5f))); + assert(rowIndex >= 0 && rowIndex < 128 && colIndex >= 0 && colIndex < 128); + + int32_t key = colIndex << 7 | rowIndex; + assert(key <= 0x4080); + + // check cells in 3 columns + for(int32_t keyEnd = key + 256; key <= keyEnd; key += 128) + { + // cellStart keys of unoccupied cells have a value of -1 + uint32_t startIndex; // min<unsigned>(cellStart[key+0..2]) + uint32_t endIndex; // max<signed>(0, cellStart[key+1..3]) + + asm volatile("{\n\t" + " .reg .u32 start1, start2;\n\t" + " ld.global.s16 %1, [%2+6];\n\t" + " ld.global.s16 %0, [%2+0];\n\t" + " ld.global.s16 start1, [%2+2];\n\t" + " ld.global.s16 start2, [%2+4];\n\t" + " max.s32 %1, %1, 0;\n\t" + " min.u32 %0, %0, start1;\n\t" + " max.s32 %1, %1, start1;\n\t" + " min.u32 %0, %0, start2;\n\t" + " max.s32 %1, %1, start2;\n\t" + "}\n\t" + : "=r"(startIndex), "=r"(endIndex) + : POINTER_CONSTRAINT(cellStart + key)); + + // comparison must be unsigned to skip cells with negative startIndex + for(uint32_t j = startIndex; j < endIndex; ++j) + { + if(j != i) // avoid same particle + { + float dx = posX - current(j, 0); + float dy = posY - current(j, 1); + float dz = posZ - current(j, 2); + + float distSqr = dx * dx + dy * dy + dz * dz; + if(distSqr > cdistSq) + continue; + + if(useRestPositions) + { + const int32_t jndex = sortedKeys[j] & UINT16_MAX; + assert(jndex 
< gClothData.mNumParticles); + + // calculate distance in rest configuration + const float* restJt = gFrameData.mRestPositions + jndex * 4; + float rx = restX - restJt[0]; + float ry = restY - restJt[1]; + float rz = restZ - restJt[2]; + + if(rx * rx + ry * ry + rz * rz <= cdistSq) + continue; + } + + // premultiply ratio for weighted average + float ratio = fmaxf(0.0f, cdist * rsqrtf(FLT_EPSILON + distSqr) - 1.0f); + float scale = __fdividef(ratio * ratio, FLT_EPSILON + posW + current(j, 3)); + + deltaX += scale * dx; + deltaY += scale * dy; + deltaZ += scale * dz; + deltaW += ratio; + } + } + } + + const float stiffness = gFrameData.mSelfCollisionStiffness * posW; + float scale = __fdividef(stiffness, deltaW); + + // apply collision impulse + xPtr[index] += deltaX * scale; + yPtr[index] += deltaY * scale; + zPtr[index] += deltaZ * scale; + + assert(!isnan(xPtr[index] + yPtr[index] + zPtr[index])); + } + __syncthreads(); + + // copy temporary particle array back to shared mem + // (need to copy whole array) + for(int32_t i = threadIdx.x; i < numParticles; i += blockDim.x) + { + current(i, 0) = xPtr[i]; + current(i, 1) = yPtr[i]; + current(i, 2) = zPtr[i]; + current(i, 3) = wPtr[i]; + } + + // unmark occupied cells to empty again (faster than clearing all the cells) + for(int32_t i = threadIdx.x; i < numIndices; i += blockDim.x) + { + int32_t key = sortedKeys[i] >> 16; + cellStart[key] = 0xffff; + cellStart[key + 1] = 0xffff; + } + __syncthreads(); +} diff --git a/PhysX_3.4/Source/LowLevelCloth/src/windows/CuSolver.cpp b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuSolver.cpp new file mode 100644 index 00000000..68238664 --- /dev/null +++ b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuSolver.cpp @@ -0,0 +1,556 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. 
+// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 
+ +#include "foundation/PxProfiler.h" +#include "CuSolver.h" +#include "CuCloth.h" +#include "ClothImpl.h" +#include "CuFabric.h" +#include "CuFactory.h" +#include "CuSolverKernel.h" +#include "CuContextLock.h" +#include "CuCheckSuccess.h" +#include "IterationState.h" +#include "CudaKernelWrangler.h" +#include "PsUtilities.h" +#include "PsSort.h" +#include "PsFoundation.h" + +#if PX_NVTX +#include "nvToolsExt.h" +#endif + +//#define ENABLE_CUDA_PRINTF PX_DEBUG // warning: not thread safe +#define ENABLE_CUDA_PRINTF 0 + +#if ENABLE_CUDA_PRINTF +extern "C" cudaError_t cudaPrintfInit(CUmodule hmod, size_t bufferLen = 1048576); +extern "C" void cudaPrintfEnd(); +extern "C" cudaError_t cudaPrintfDisplay(CUmodule hmod, void* outputFP = NULL, bool showThreadID = false); +#endif + +using namespace physx; + +namespace +{ +//for KernelWrangler interface +const char* gKernelName = cloth::getKernelFunctionName(); +} + +namespace +{ +template <typename T> +struct CuDeviceAllocator +{ + CuDeviceAllocator(physx::PxCudaContextManager* ctx) : mManager(ctx->getMemoryManager()) + { + } + + T* allocate(size_t n) + { + return reinterpret_cast<T*>(mManager->alloc(physx::PxCudaBufferMemorySpace::T_GPU, n * sizeof(T))); + } + + void deallocate(T* ptr) + { + mManager->free(physx::PxCudaBufferMemorySpace::T_GPU, reinterpret_cast<physx::PxCudaBufferPtr>(ptr)); + } + + physx::PxCudaMemoryManager* mManager; +}; +} + +cloth::CuSolver::CuSolver(CuFactory& factory) +: CuContextLock(factory) +, mFactory(factory) +, mClothData(mFactory.mContextManager) +, mClothDataHostCopy(CuHostAllocator(mFactory.mContextManager, cudaHostAllocWriteCombined)) +, mClothDataDirty(false) +, mFrameData(getMappedAllocator<CuFrameData>(mFactory.mContextManager)) +, mIterationData(getMappedAllocator<CuIterationData>(mFactory.mContextManager)) +, mIterationDataBegin(0) +, mFrameDt(0.0f) +, mSharedMemorySize(0) +, mSharedMemoryLimit(0) +, mStartSimulationTask(&CuSolver::beginFrame, "cloth.CuSolver.startSimulation") +, 
mKernelSimulationTask(&CuSolver::executeKernel, "cloth.CuSolver.kernelSimulation") +, mEndSimulationTask(&CuSolver::endFrame, "cloth.CuSolver.endSimulation") +, mStream(0) +, mKernelModule(0) +, mKernelFunction(0) +, mKernelSharedMemorySize(0) +, mClothIndex(CuDeviceAllocator<uint32_t>(mFactory.mContextManager).allocate(1)) +, mInterCollisionDistance(0.0f) +, mInterCollisionStiffness(1.0f) +, mInterCollisionIterations(1) +, mInterCollisionScratchMem(NULL) +, mInterCollisionScratchMemSize(0) +, mKernelWrangler(getDispatcher(), physx::shdfnd::getFoundation().getErrorCallback(), &gKernelName, 1) +, mSimulateNvtxRangeId(0) +, mCudaError(mKernelWrangler.hadError()) +{ + if(mCudaError) + { + CuContextLock::release(); + return; + } + + mStartSimulationTask.mSolver = this; + mKernelSimulationTask.mSolver = this; + mEndSimulationTask.mSolver = this; + + if(mFactory.mContextManager->getUsingConcurrentStreams()) + checkSuccess(cuStreamCreate(&mStream, 0)); + + if(1) + { + mKernelModule = mKernelWrangler.getCuModule(0); + mKernelFunction = mKernelWrangler.getCuFunction(0); + } + else + { + // load from ptx instead of embedded SASS, for iterating without recompile + checkSuccess(cuModuleLoad(&mKernelModule, "CuSolverKernel.ptx")); + checkSuccess(cuModuleGetFunction(&mKernelFunction, mKernelModule, getKernelFunctionName())); + shdfnd::getFoundation().error(PX_INFO, "Cloth kernel code loaded from CuSolverKernel.ptx"); + } + + // get amount of statically allocated shared memory + checkSuccess(cuFuncGetAttribute(&mKernelSharedMemorySize, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, mKernelFunction)); + + // extract CuKernelData device pointer + size_t size = 0; + CUdeviceptr ptr = 0; + checkSuccess(cuModuleGetGlobal(&ptr, &size, mKernelModule, getKernelDataName())); + mKernelData = CuDevicePointer<CuKernelData>(reinterpret_cast<CuKernelData*>(ptr)); + + // initialize cloth index + checkSuccess(cuMemsetD32(mClothIndex.dev(), 0, 1)); + + CuContextLock::release(); +} + 
+cloth::CuSolver::~CuSolver() +{ + PX_ASSERT(mCloths.empty()); + + CuContextLock::acquire(); + + CuKernelData kernelData = {}; + *mKernelData = kernelData; + + CuDeviceAllocator<uint32_t>(mFactory.mContextManager).deallocate(mClothIndex.get()); + + if(mStream) + checkSuccess(cuStreamDestroy(mStream)); + + if(mInterCollisionScratchMem) + PX_FREE(mInterCollisionScratchMem); +} + +void cloth::CuSolver::updateKernelData() +{ + CuKernelData kernelData; + + kernelData.mClothIndex = mClothIndex.get(); + kernelData.mClothData = mClothData.begin().get(); + kernelData.mFrameData = getDevicePointer(mFrameData); + + *mKernelData = kernelData; +} + +physx::PxGpuDispatcher& cloth::CuSolver::getDispatcher() const +{ + return *mFactory.mContextManager->getGpuDispatcher(); +} + +namespace +{ +struct ClothSimCostGreater +{ + bool operator()(const cloth::CuCloth* left, const cloth::CuCloth* right) const + { + return left->mNumParticles * left->mSolverFrequency > right->mNumParticles * right->mSolverFrequency; + } +}; +} + +void cloth::CuSolver::addCloth(Cloth* cloth) +{ + CuCloth& cuCloth = static_cast<CuClothImpl&>(*cloth).mCloth; + + PX_ASSERT(mCloths.find(&cuCloth) == mCloths.end()); + + mCloths.pushBack(&cuCloth); + // trigger update of mClothData array + cuCloth.notifyChanged(); + + // sort cloth instances by size + shdfnd::sort(mCloths.begin(), mCloths.size(), ClothSimCostGreater()); + + CuContextLock contextLock(mFactory); + + // resize containers and update kernel data + mClothDataHostCopy.resize(mCloths.size()); + mClothData.resize(mCloths.size()); + mFrameData.resize(mCloths.size()); + updateKernelData(); +} + +void cloth::CuSolver::removeCloth(Cloth* cloth) +{ + CuCloth& cuCloth = static_cast<CuClothImpl&>(*cloth).mCloth; + + ClothVector::Iterator begin = mCloths.begin(), end = mCloths.end(); + ClothVector::Iterator it = mCloths.find(&cuCloth); + + if(it == end) + return; // not found + + uint32_t index = uint32_t(it - begin); + + mCloths.remove(index); + 
mClothDataHostCopy.remove(index); + mClothData.resize(mCloths.size()); + mClothDataDirty = true; +} + +physx::PxBaseTask& cloth::CuSolver::simulate(float dt, physx::PxBaseTask& continuation) +{ + mFrameDt = dt; + + if(mCloths.empty() || mCudaError) + { + continuation.addReference(); + return continuation; + } + + physx::PxGpuDispatcher& disp = getDispatcher(); + mEndSimulationTask.setContinuation(&continuation); + disp.addPostLaunchDependent(mEndSimulationTask); + mKernelSimulationTask.setContinuation(&disp.getPostLaunchTask()); + disp.getPostLaunchTask().removeReference(); + disp.addPreLaunchDependent(mKernelSimulationTask); + mStartSimulationTask.setContinuation(&disp.getPreLaunchTask()); + disp.getPreLaunchTask().removeReference(); + + mEndSimulationTask.removeReference(); + mKernelSimulationTask.removeReference(); + + return mStartSimulationTask; +} + +void cloth::CuSolver::beginFrame() +{ + CuContextLock contextLock(mFactory); + + PX_PROFILE_START_CROSSTHREAD("cloth.CuSolver.simulate", 0); + + CuIterationData* iterationDataBegin = mIterationData.empty() ? 0 : &mIterationData.front(); + + mFrameData.resize(0); + mIterationData.resize(0); + + // update cloth data + ClothVector::Iterator cIt, cEnd = mCloths.end(); + CuPinnedVector<CuClothData>::Type::Iterator dIt = mClothDataHostCopy.begin(); + for(cIt = mCloths.begin(); cIt != cEnd; ++cIt, ++dIt) + mClothDataDirty |= (*cIt)->updateClothData(*dIt); + + if(mClothDataDirty) + { + /* find optimal number of cloths per SM */ + + // at least 192 threads per block (e.g. 
CuCollision::buildAcceleration) + uint32_t numSMs = (uint32_t)mFactory.mContextManager->getMultiprocessorCount(); + uint32_t maxClothsPerSM = PxMin(mFactory.mMaxThreadsPerBlock / 192, (mCloths.size() + numSMs - 1) / numSMs); + + // tuning parameters: relative performance per numSharedPositions + float weights[3] = { 0.4f, 0.8f, 1.0f }; + + // try all possible number of cloths per SM and estimate performance + float maxWeightSum = 0.0f; + uint32_t numClothsPerSM = 0; + for(uint32_t i = 1; i <= maxClothsPerSM; ++i) + { + uint32_t sharedMemoryLimit = (mFactory.mContextManager->getSharedMemPerBlock() / i) - mKernelSharedMemorySize; + + float weightSum = 0.0f; + for(cIt = mCloths.begin(); cIt != cEnd; ++cIt) + { + uint32_t sharedMemorySize = (*cIt)->mSharedMemorySize; + uint32_t positionsSize = (*cIt)->mNumParticles * sizeof(PxVec4); + + if(sharedMemorySize > sharedMemoryLimit) + break; + + uint32_t numSharedPositions = PxMin(2u, (sharedMemoryLimit - sharedMemorySize) / positionsSize); + + weightSum += weights[numSharedPositions] * positionsSize; + } + // tuning parameter: inverse performance for running i cloths per SM + weightSum *= 2.0f + i; + + if(cIt == cEnd && weightSum > maxWeightSum) + { + maxWeightSum = weightSum; + numClothsPerSM = i; + } + } + PX_ASSERT(numClothsPerSM); + + // update block size + uint32_t numThreadsPerBlock = mFactory.mMaxThreadsPerBlock / numClothsPerSM & ~31; + + // Workaround for nvbug 1709919: theoretically, register usage should allow us to launch at least + // mFactory.mMaxThreadsPerBlock threads, because that value corresponds to __launch_bounds__(maxThreadsPerBlock). 
+ CUdevice device = 0; + checkSuccess(cuCtxGetDevice(&device)); + int registersPerBlock = 0, kernelRegisterCount = 0; + checkSuccess(cuDeviceGetAttribute(®istersPerBlock, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, device)); + checkSuccess(cuFuncGetAttribute(&kernelRegisterCount, CU_FUNC_ATTRIBUTE_NUM_REGS, mKernelFunction)); + numThreadsPerBlock = PxMin(numThreadsPerBlock, uint32_t(registersPerBlock / kernelRegisterCount)); + PX_ASSERT(numThreadsPerBlock >= 192); + + if(mFactory.mNumThreadsPerBlock != numThreadsPerBlock) + { + checkSuccess( + cuFuncSetBlockShape(mKernelFunction, int(mFactory.mNumThreadsPerBlock = numThreadsPerBlock), 1, 1)); + } + + // remember num cloths per SM in terms of max shared memory per block + mSharedMemoryLimit = + (mFactory.mContextManager->getSharedMemPerBlock() / numClothsPerSM) - mKernelSharedMemorySize; + } + + uint32_t maxSharedMemorySize = 0; + for(cIt = mCloths.begin(); cIt != cEnd; ++cIt) + { + CuCloth& cloth = **cIt; + + uint32_t sharedMemorySize = cloth.mSharedMemorySize; + uint32_t positionsSize = cloth.mNumParticles * sizeof(PxVec4); + + uint32_t numSharedPositions = PxMin(2u, (mSharedMemoryLimit - sharedMemorySize) / positionsSize); + + maxSharedMemorySize = PxMax(maxSharedMemorySize, sharedMemorySize + numSharedPositions * positionsSize); + + IterationStateFactory factory(cloth, mFrameDt); + IterationState<Simd4f> state = factory.create<Simd4f>(cloth); + + mFrameData.pushBack(CuFrameData(cloth, numSharedPositions, state, mIterationDataBegin + mIterationData.size())); + + while(state.mRemainingIterations) + { + mIterationData.pushBack(CuIterationData(state)); + state.update(); + } + } + mSharedMemorySize = maxSharedMemorySize; + + // add dummy element because we read past the end + mIterationData.pushBack(CuIterationData()); + + if(&mIterationData.front() != iterationDataBegin) + { + // mIterationData grew, update pointers + iterationDataBegin = getDevicePointer(mIterationData); + + ptrdiff_t diff = 
(char*)iterationDataBegin - (char*)mIterationDataBegin; + CuPinnedVector<CuFrameData>::Type::Iterator fIt = mFrameData.begin(), fEnd; + for(fEnd = mFrameData.end(); fIt != fEnd; ++fIt) + reinterpret_cast<const char*&>(fIt->mIterationData) += diff; + + mIterationDataBegin = iterationDataBegin; + } +} + +void cloth::CuSolver::executeKernel() +{ + CuContextLock contextLock(mFactory); + +#if ENABLE_CUDA_PRINTF + if(cudaError result = cudaPrintfInit(mKernelModule)) + { + shdfnd::getFoundation().error(PxErrorCode::eINTERNAL_ERROR, __FILE__, __LINE__, "cudaPrintfInit() returned %u.", + result); + } +#endif + + if(mClothDataDirty) + { + PX_ASSERT(mClothDataHostCopy.size() == mClothData.size()); + size_t numBytes = mClothData.size() * sizeof(CuClothData); + checkSuccess(cuMemcpyHtoDAsync(mClothData.begin().dev(), mClothDataHostCopy.begin(), numBytes, mStream)); + mClothDataDirty = false; + } + +#if 0 + static int frame = 0; + if(++frame == 100) + record(*this); +#endif + + // launch kernel + CUresult result = cuLaunchKernel(mKernelFunction, mCloths.size(), 1, 1, mFactory.mNumThreadsPerBlock, 1, 1, + mSharedMemorySize, mStream, 0, 0); + +#if ENABLE_CUDA_PRINTF + cudaPrintfDisplay(mKernelModule); + cudaPrintfEnd(); +#endif + +#if PX_DEBUG + // in debug builds check kernel result + checkSuccess(result); + checkSuccess(cuStreamSynchronize(mStream)); +#endif + + // mark the solver as being in an error state + // all cloth instances will be migrated to software + if(result != CUDA_SUCCESS) + mCudaError = true; +} + +void cloth::CuSolver::endFrame() +{ + CuPinnedVector<CuFrameData>::Type::ConstIterator fIt = mFrameData.begin(); + ClothVector::Iterator cIt, cEnd = mCloths.end(); + for(cIt = mCloths.begin(); cIt != cEnd; ++cIt, ++fIt) + { + CuCloth& cloth = **cIt; + + cloth.mHostParticlesDirty = false; + cloth.mDeviceParticlesDirty = false; + + cloth.mMotionConstraints.pop(); + cloth.mMotionConstraints.mHostCopy.resize(0); + + cloth.mSeparationConstraints.pop(); + 
cloth.mSeparationConstraints.mHostCopy.resize(0); + + if(!cloth.mTargetCollisionSpheres.empty()) + { + shdfnd::swap(cloth.mStartCollisionSpheres, cloth.mTargetCollisionSpheres); + cloth.mTargetCollisionSpheres.resize(0); + } + + if(!cloth.mTargetCollisionPlanes.empty()) + { + shdfnd::swap(cloth.mStartCollisionPlanes, cloth.mTargetCollisionPlanes); + cloth.mTargetCollisionPlanes.resize(0); + } + + if(!cloth.mTargetCollisionTriangles.empty()) + { + shdfnd::swap(cloth.mStartCollisionTriangles, cloth.mTargetCollisionTriangles); + cloth.mTargetCollisionTriangles.resize(0); + } + + for(uint32_t i = 0; i < 3; ++i) + { + float upper = fIt->mParticleBounds[i * 2 + 0]; + float negativeLower = fIt->mParticleBounds[i * 2 + 1]; + cloth.mParticleBoundsCenter[i] = (upper - negativeLower) * 0.5f; + cloth.mParticleBoundsHalfExtent[i] = (upper + negativeLower) * 0.5f; + } + + cloth.mSleepPassCounter = fIt->mSleepPassCounter; + cloth.mSleepTestCounter = fIt->mSleepTestCounter; + } + + interCollision(); + + PX_PROFILE_STOP_CROSSTHREAD("cloth::CuSolver::simulate", 0); +} + +void cloth::CuSolver::interCollision() +{ + if(!mInterCollisionIterations || mInterCollisionDistance == 0.0f) + return; + + typedef SwInterCollision<Simd4f> SwInterCollision; + + // rebuild cloth instance array + mInterCollisionInstances.resize(0); + for(uint32_t i = 0, n = mCloths.size(); i < n; ++i) + { + CuCloth& cloth = *mCloths[i]; + + float elasticity = 1.0f / mFrameData[i].mNumIterations; + PX_ASSERT(!cloth.mHostParticlesDirty); + PxVec4* particles = cloth.mParticlesHostCopy.begin(); + uint32_t* indices = NULL, numIndices = cloth.mNumParticles; + if(!cloth.mSelfCollisionIndices.empty()) + { + indices = cloth.mSelfCollisionIndicesHost.begin(); + numIndices = uint32_t(cloth.mSelfCollisionIndices.size()); + } + + mInterCollisionInstances.pushBack(SwInterCollisionData( + particles, particles + cloth.mNumParticles, numIndices, indices, cloth.mTargetMotion, + cloth.mParticleBoundsCenter, 
cloth.mParticleBoundsHalfExtent, elasticity, cloth.mUserData)); + + cloth.mDeviceParticlesDirty = true; + } + + uint32_t requiredTempMemorySize = uint32_t( + SwInterCollision::estimateTemporaryMemory(&mInterCollisionInstances[0], mInterCollisionInstances.size())); + + // realloc temp memory if necessary + if(mInterCollisionScratchMemSize < requiredTempMemorySize) + { + if(mInterCollisionScratchMem) + PX_FREE(mInterCollisionScratchMem); + + mInterCollisionScratchMem = PX_ALLOC(requiredTempMemorySize, "cloth::SwSolver::mInterCollisionScratchMem"); + mInterCollisionScratchMemSize = requiredTempMemorySize; + } + + SwKernelAllocator allocator(mInterCollisionScratchMem, mInterCollisionScratchMemSize); + + // run inter-collision + SwInterCollision(mInterCollisionInstances.begin(), mInterCollisionInstances.size(), mInterCollisionDistance, + mInterCollisionStiffness, mInterCollisionIterations, mInterCollisionFilter, allocator)(); +} + +cloth::CuSolver::ClothSolverTask::ClothSolverTask(FunctionPtr functionPtr, const char* name) +: mSolver(0), mFunctionPtr(functionPtr), mName(name) +{ +} + +void cloth::CuSolver::ClothSolverTask::runInternal() +{ + (mSolver->*mFunctionPtr)(); +} + +const char* cloth::CuSolver::ClothSolverTask::getName() const +{ + return mName; +} diff --git a/PhysX_3.4/Source/LowLevelCloth/src/windows/CuSolver.h b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuSolver.h new file mode 100644 index 00000000..ff98d975 --- /dev/null +++ b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuSolver.h @@ -0,0 +1,180 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. 
Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 
#pragma once

#include "Solver.h"
#include "CuClothData.h"
#include "CuPinnedAllocator.h"
#include "CuContextLock.h"
#include "CuDeviceVector.h"
#include "CudaKernelWrangler.h"
#include "CmTask.h"

#include "SwInterCollision.h"

namespace physx
{

namespace cloth
{

class CuCloth;
class CuFabric;
struct PhaseConfig;
struct CuKernelData;

// CUDA-backed implementation of the cloth Solver interface. Owns the CUDA
// stream/module/function handles used to launch the solver kernel, mirrors
// per-cloth data between host (pinned) and device buffers, and drives the
// frame as a chain of three Cm::Task stages (begin / kernel / end).
// Inherits CuContextLock privately — presumably to acquire/release the CUDA
// context around device work; confirm against CuContextLock's definition.
class CuSolver : public UserAllocated, private CuContextLock, public Solver
{
#if PX_VC
#pragma warning(push)
#pragma warning(disable : 4371) // layout of class may have changed from a previous version of the compiler due to
                                // better packing of member
#endif
	// Small task adapter: binds a CuSolver member function pointer so it can
	// be scheduled through the Cm::Task system. One instance exists per
	// simulation stage (see mStartSimulationTask et al. below).
	struct ClothSolverTask : public Cm::Task
	{
		typedef void (CuSolver::*FunctionPtr)();

		ClothSolverTask(FunctionPtr, const char*);
		virtual void runInternal();
		virtual const char* getName() const;

		CuSolver* mSolver;       // back-pointer to the owning solver
		FunctionPtr mFunctionPtr; // stage function invoked by runInternal()
		const char* mName;        // task name reported via getName()
	};
#if PX_VC
#pragma warning(pop)
#endif

	PX_NOCOPY(CuSolver)
  public:
	CuSolver(CuFactory&);
	~CuSolver();

	// Solver interface: register/unregister a cloth with this solver.
	virtual void addCloth(Cloth*);
	virtual void removeCloth(Cloth*);

	// Kicks off a simulation step of length dt; returns the task the caller
	// should use for scheduling/continuation (see ClothSolverTask stages).
	virtual physx::PxBaseTask& simulate(float dt, physx::PxBaseTask&);

	// True if a CUDA error was recorded during a previous step.
	virtual bool hasError() const
	{
		return mCudaError;
	}

	// Inter-collision parameters (cloth-vs-cloth); plain stores/loads of the
	// corresponding members, consumed by interCollision().
	virtual void setInterCollisionDistance(float distance)
	{
		mInterCollisionDistance = distance;
	}
	virtual float getInterCollisionDistance() const
	{
		return mInterCollisionDistance;
	}
	virtual void setInterCollisionStiffness(float stiffness)
	{
		mInterCollisionStiffness = stiffness;
	}
	virtual float getInterCollisionStiffness() const
	{
		return mInterCollisionStiffness;
	}
	virtual void setInterCollisionNbIterations(uint32_t nbIterations)
	{
		mInterCollisionIterations = nbIterations;
	}
	virtual uint32_t getInterCollisionNbIterations() const
	{
		return mInterCollisionIterations;
	}
	virtual void setInterCollisionFilter(InterCollisionFilter filter)
	{
		mInterCollisionFilter = filter;
	}

  private:
	void updateKernelData(); // context needs to be acquired

	// simulate helper functions
	void beginFrame();
	void executeKernel();
	void endFrame();

	void interCollision();

	physx::PxGpuDispatcher& getDispatcher() const;

  private:
	CuFactory& mFactory; // factory that created this solver

	typedef Vector<CuCloth*>::Type ClothVector;
	ClothVector mCloths; // cloths currently registered via addCloth()

	// Per-cloth constant data: device copy plus a pinned host mirror;
	// mClothDataDirty flags that the device copy needs refreshing.
	CuDeviceVector<CuClothData> mClothData;
	CuPinnedVector<CuClothData>::Type mClothDataHostCopy;
	bool mClothDataDirty;

	// Per-frame data per cloth, in pinned host memory.
	CuPinnedVector<CuFrameData>::Type mFrameData;

	CuPinnedVector<CuIterationData>::Type mIterationData;
	CuIterationData* mIterationDataBegin; // corresponding device ptr

	float mFrameDt; // dt passed to the most recent simulate() call

	uint32_t mSharedMemorySize;
	uint32_t mSharedMemoryLimit;

	// The three pipeline stages of one simulate() step.
	ClothSolverTask mStartSimulationTask;
	ClothSolverTask mKernelSimulationTask;
	ClothSolverTask mEndSimulationTask;

	// CUDA driver-API handles for the solver kernel launch.
	CUstream mStream;
	CUmodule mKernelModule;
	CUfunction mKernelFunction;
	int mKernelSharedMemorySize;
	CuDevicePointer<CuKernelData> mKernelData;
	CuDevicePointer<uint32_t> mClothIndex;

	// Cloth-vs-cloth collision settings and scratch storage.
	float mInterCollisionDistance;
	float mInterCollisionStiffness;
	uint32_t mInterCollisionIterations;
	InterCollisionFilter mInterCollisionFilter;
	void* mInterCollisionScratchMem;
	uint32_t mInterCollisionScratchMemSize;
	shdfnd::Array<SwInterCollisionData> mInterCollisionInstances;

	physx::KernelWrangler mKernelWrangler;

	uint64_t mSimulateNvtxRangeId; // NVTX range id for the current simulate step

	bool mCudaError; // sticky error flag reported by hasError()

	friend void record(const CuSolver&);
};
}
}
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/windows/CuSolverKernel.h b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuSolverKernel.h
new file mode 100644
index 00000000..d6ca350f
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/windows/CuSolverKernel.h
@@ -0,0 +1,57 @@
// This code contains NVIDIA Confidential Information and is disclosed to you
// under a form of NVIDIA software license agreement provided separately to you.
+// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 
#pragma once

#include "Types.h"

namespace physx
{
namespace cloth
{
struct CuClothData;
struct CuFrameData;

// data of all cloth instances, one block per instance
// Shared between host and device code: the host fills these pointers in,
// the solver kernel reads them to locate its per-cloth inputs.
struct CuKernelData
{
	// pointer to atomic variable
	uint32_t* mClothIndex;

	// array of cloths (length determined by grid dim)
	const CuClothData* mClothData;

	// frame data per cloth
	CuFrameData* mFrameData;
};

// Symbol names exported by the compiled solver kernel — presumably used by
// the host to resolve the CuKernelData global and the kernel entry point in
// the CUDA module (e.g. via cuModuleGetGlobal / cuModuleGetFunction); confirm
// against the loading code in CuSolver.cpp.
const char* getKernelDataName();
const char* getKernelFunctionName();
}
}