aboutsummaryrefslogtreecommitdiff
path: root/NvCloth/src/TripletScheduler.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'NvCloth/src/TripletScheduler.cpp')
-rw-r--r--NvCloth/src/TripletScheduler.cpp353
1 files changed, 279 insertions, 74 deletions
diff --git a/NvCloth/src/TripletScheduler.cpp b/NvCloth/src/TripletScheduler.cpp
index 999be0e..0116200 100644
--- a/NvCloth/src/TripletScheduler.cpp
+++ b/NvCloth/src/TripletScheduler.cpp
@@ -39,45 +39,132 @@ cloth::TripletScheduler::TripletScheduler(Range<const uint32_t[4]> triplets)
{
}
-// SSE version
+// helper functions for TripletScheduler::simd()
+namespace
+{
+ //linear search should be fine for small particlesInBatchSize values
+ bool isIndexInBatch(uint32_t* particlesInBatch, int particlesInBatchSize, uint32_t index1, uint32_t index2, uint32_t index3)
+ {
+ for(int i = 0; i < particlesInBatchSize; i++)
+ {
+ if(particlesInBatch[i] == 0xffffffff)
+ return false;
+ if(index1 == particlesInBatch[i] || index2 == particlesInBatch[i] || index3 == particlesInBatch[i])
+ return true;
+ }
+ return false;
+ }
+
+ void markIndicesInBatch(uint32_t* particlesInBatch, int particlesInBatchSize, uint32_t index1, uint32_t index2, uint32_t index3)
+ {
+ for(int i = 0; i < particlesInBatchSize - 2; i++)
+ {
+ if(particlesInBatch[i] == 0xffffffff)
+ {
+ NV_CLOTH_ASSERT(i + 2 < particlesInBatchSize);
+ particlesInBatch[i] = index1;
+ particlesInBatch[i + 1] = index2;
+ particlesInBatch[i + 2] = index3;
+ break;
+ }
+ }
+ }
+}
+
+/*
+Group triplets in batches of simdWidth so that they can be processed in parallel without referencing the same particle.
+Not suitable for large simdWidth values due to the linear search.
+Note that sets can be larger than simdWidth, and that particles may be referenced from the same set multiple times,
+ as long as they are not referenced more than once within a batch of simdWidth. The batch resets at the start of
+ each set.
+*/
void cloth::TripletScheduler::simd(uint32_t numParticles, uint32_t simdWidth)
{
- if (mTriplets.empty())
+ PX_UNUSED(numParticles);
+ if(mTriplets.empty())
return;
- Vector<uint32_t>::Type mark(numParticles, uint32_t(-1));
+ const int particlesInBatchSize = simdWidth * 3;
+ uint32_t* particlesInBatch = new uint32_t[particlesInBatchSize]; //used to mark used indices in current batch
+ int setSize = 0; //number of triplets in current set
+ int amountOfPaddingNeeded = 0;
- uint32_t setIndex = 0, setSize = 0;
- for (TripletIter tIt = mTriplets.begin(), tEnd = mTriplets.end(); tIt != tEnd; ++setIndex)
+ for(TripletIter tIt = mTriplets.begin(), tEnd = mTriplets.end(); tIt != tEnd; tIt++)
{
- TripletIter tLast = tIt + std::min(simdWidth, uint32_t(tEnd - tIt));
- TripletIter tSwap = tEnd;
-
- for (; tIt != tLast && tIt != tSwap; ++tIt, ++setSize)
+ if(setSize%simdWidth == 0)
{
- // swap from tail until independent triplet found
- while ((mark[tIt->x] == setIndex || mark[tIt->y] == setIndex || mark[tIt->z] == setIndex) && tIt != --tSwap)
- std::iter_swap(tIt, tSwap);
+ //reset particlesInBatch if we filled the simd width
+ memset(particlesInBatch, 0xff, sizeof(uint32_t)*particlesInBatchSize);
+ }
- if (tIt == tSwap)
- break; // no independent triplet found
+ TripletIter tSwap = tIt;
- // mark vertices to be used in simdIndex
- mark[tIt->x] = setIndex;
- mark[tIt->y] = setIndex;
- mark[tIt->z] = setIndex;
+ //Look for the next triplet that doesn't share indices with current batch
+ while(isIndexInBatch(particlesInBatch, particlesInBatchSize, tSwap->x, tSwap->y, tSwap->z))
+ {
+ tSwap++;
+ if(tSwap == tEnd)
+ break;
}
- if (tIt == tSwap) // remaining triplets depend on current set
+ if(tSwap == tEnd)
{
- if (setSize > simdWidth) // trim set to multiple of simdWidth
- {
- uint32_t overflow = setSize % simdWidth;
- setSize -= overflow;
- tIt -= overflow;
- }
+ //all remaining triplets share indices with the current batch
+
+ //This doesn't make sense, as it will just introduce an extra set with setSize<simdWidth
+ //Should double check though
+ // trim set to multiple of simdWidth with the hope that the trimmed triplets
+ // // will fit in the next set with a multiple of simdWidth size
+ // if(setSize > (int)simdWidth)
+ // {
+ // uint32_t overflow = setSize % simdWidth;
+ // setSize -= overflow;
+ // tIt -= overflow;
+ // }
+
+ //finish set
mSetSizes.pushBack(setSize);
+ amountOfPaddingNeeded += (static_cast<int>(simdWidth) - (setSize % static_cast<int>(simdWidth))) % static_cast<int>(simdWidth); //last modulo avoids adding padding when setSize%simdWidth==0
setSize = 0;
+ tIt--; //start next set with current triplet
+ }
+ else
+ {
+ //add triplet to set
+ markIndicesInBatch(particlesInBatch, particlesInBatchSize, tSwap->x, tSwap->y, tSwap->z);
+ std::iter_swap(tIt, tSwap); //Place triplet at the end of the current set, so that they are in sequence in mTriplets
+ setSize++;
+ }
+ }
+ if(setSize)
+ {
+ //finish last set, if we have one
+ mSetSizes.pushBack(setSize);
+ amountOfPaddingNeeded += (static_cast<int>(simdWidth) - (setSize % static_cast<int>(simdWidth)))% static_cast<int>(simdWidth);
+ }
+
+ // Padding code used to live in SwCloth::setVirtualParticles, now we do it here directly
+ mPaddedTriplets.reserve(mTriplets.size() + amountOfPaddingNeeded);
+
+ {
+ Vec4us paddingDummy(static_cast<uint16_t>(numParticles), static_cast<uint16_t>(numParticles) + 1, static_cast<uint16_t>(numParticles) + 2, 0);
+
+ TripletIter tIt = mTriplets.begin();
+ //TripletIter tEnd = mTriplets.end();
+ SetIter setIt = mSetSizes.begin();
+ SetIter setEnd = mSetSizes.end();
+ for(; setIt < setEnd; ++setIt)
+ {
+ for(unsigned int i = 0; i < *setIt; i++)
+ {
+ mPaddedTriplets.pushBack(*tIt);
+ ++tIt;
+ }
+ unsigned int padding = (static_cast<int>(simdWidth) - (*setIt % static_cast<int>(simdWidth))) % static_cast<int>(simdWidth);
+ for(unsigned int i = 0; i < padding; i++)
+ {
+ mPaddedTriplets.pushBack(paddingDummy);
+ }
}
}
}
@@ -95,8 +182,10 @@ struct TripletSet
}
uint32_t mMark; // triplet index
- uint8_t mNumReplays[3];
- uint8_t mNumConflicts[3][32];
+
+ // [3] because each particle is fetched on a different instruction.
+ uint8_t mNumReplays[3]; //how many times the instruction needs to be executed to resolve all bank conflicts (= maxElement(mNumConflicts[3][0...31]))
+ uint8_t mNumConflicts[3][32]; //Number of accesses for each of the 32 banks (if it is 1 then there is no conflict on that bank)
};
/*
@@ -132,83 +221,147 @@ void prefixSum(Iter first, Iter last, Iter dest)
*dest = *(dest - 1) + *first;
}
}
-}
-// CUDA version
-void cloth::TripletScheduler::warp(uint32_t numParticles, uint32_t warpWidth)
+class AdjacencyQuerier
{
- // NV_CLOTH_ASSERT(warpWidth == 32 || warpWidth == 16);
+private:
+ typedef nv::cloth::Vector<uint32_t>::Type UintVector;
+ typedef nv::cloth::Vector<uint32_t>::Type::Iterator UintVectorIterator;
+ typedef nv::cloth::Vector<nv::cloth::Vec4u>::Type::ConstIterator ConstTripletIter;
+ UintVector mAdjacencyIndecies;
+ UintVector mAdjacencies;
+ uint32_t mMaxAdjacentCount;
+
+public:
+ AdjacencyQuerier(nv::cloth::Vector<nv::cloth::Vec4u>::Type const& triplets, int particleCount)
+ {
+ {
+ UintVector& adjacencyCount = mAdjacencyIndecies; //use same memory as mAdjacencyIndecies
+ adjacencyCount.resize(particleCount+1, uint32_t(0)); //initialize count to 0. Size+1 used later for prefixSum/indices
- if (mTriplets.empty())
- return;
+ // count the number of triplets per particle
+ for(ConstTripletIter tIt = triplets.begin(); tIt != triplets.end(); ++tIt)
+ {
+ for(int i = 0; i < 3; i++) //for each index in the triplet
+ {
+ const uint32_t particleIndex = (*tIt)[i];
+ adjacencyCount[particleIndex] += 1;
+ }
+ }
+ //adjacencyCount[i] now holds the number of triplets referring to particle i
- TripletIter tIt, tEnd = mTriplets.end();
- uint32_t tripletIndex;
+ mMaxAdjacentCount = *physx::shdfnd::maxElement(adjacencyCount.begin(), adjacencyCount.end());
- // count number of triplets per particle
- Vector<uint32_t>::Type adjacentCount(numParticles + 1, uint32_t(0));
- for (tIt = mTriplets.begin(); tIt != tEnd; ++tIt)
- for (int i = 0; i < 3; ++i)
- ++adjacentCount[(*tIt)[i]];
+ // compute in place prefix sum (inclusive)
+ prefixSum(adjacencyCount.begin(), adjacencyCount.end(), mAdjacencyIndecies.begin());
+ // we use mAdjacencyIndecies from now on
+ }
- /* neither of those were really improving number of batches:
- // run simd version to pre-sort particles
- simd(numParticles, blockWidth); mSetSizes.resize(0);
- // sort according to triplet degree (estimated by sum of adjacentCount)
- std::sort(mTriplets.begin(), tEnd, GreaterSum(adjacentCount));
- */
+ mAdjacencies.resize(mAdjacencyIndecies.back());
+ uint32_t tripletIndex = 0;
+ for(ConstTripletIter tIt = triplets.begin(); tIt != triplets.end(); ++tIt, ++tripletIndex)
+ {
+ for(int i = 0; i < 3; i++) //for each index in the triplet
+ {
+ const uint32_t particleIndex = (*tIt)[i];
- uint32_t maxTripletCount = *shdfnd::maxElement(adjacentCount.begin(), adjacentCount.end());
+ //Decrement mAdjacencyIndecies here to convert it from inclusive to exclusive prefix sum
+ const uint32_t adjacencyIndex = --mAdjacencyIndecies[particleIndex];
- // compute in place prefix sum (inclusive)
- prefixSum(adjacentCount.begin(), adjacentCount.end(), adjacentCount.begin());
+ mAdjacencies[adjacencyIndex] = tripletIndex;
+ }
+ }
+ //mAdjacencies[mAdjacencyIndecies[i]] now stores the first triplet index referring to particle i
+ //mAdjacencies[mAdjacencyIndecies[i+1]] stores one past the last
+ }
- // initialize adjacencies (for each particle, collect touching triplets)
- // also converts partial sum in adjacentCount from inclusive to exclusive
- Vector<uint32_t>::Type adjacencies(adjacentCount.back());
- for (tIt = mTriplets.begin(), tripletIndex = 0; tIt != tEnd; ++tIt, ++tripletIndex)
- for (int i = 0; i < 3; ++i)
- adjacencies[--adjacentCount[(*tIt)[i]]] = tripletIndex;
+ uint32_t getMaxAdjacentCount() const
+ {
+ return mMaxAdjacentCount;
+ }
+
+ nv::cloth::Range<const uint32_t> getAdjacentTriplets(uint32_t particleIndex) const
+ {
+ //use .begin() + offset to get around bounds check
+ auto begin = &*(mAdjacencies.begin() + mAdjacencyIndecies[particleIndex]);
+ auto end = &*(mAdjacencies.begin() + mAdjacencyIndecies[particleIndex + 1]);
+ return nv::cloth::Range<const uint32_t>(begin, end);
+ }
+};
+
+}
+
+// CUDA version. Doesn't add padding, optimizes for bank conflicts
+void cloth::TripletScheduler::warp(uint32_t numParticles, uint32_t warpWidth)
+{
+ // warpWidth has to be <= 32 and a power of two
+ NV_CLOTH_ASSERT(warpWidth == 32 || warpWidth == 16);
+
+ if (mTriplets.empty())
+ return;
+
+ AdjacencyQuerier adjacencyQuerier(mTriplets, numParticles);
uint32_t warpMask = warpWidth - 1;
- uint32_t numSets = maxTripletCount; // start with minimum number of sets
+ uint32_t numSets = adjacencyQuerier.getMaxAdjacentCount(); // start with minimum number of sets
Vector<TripletSet>::Type sets(numSets);
+
+ // maps triplets to the set index that contains the triplet (setIndex = setIndices[tripletIndex])
Vector<uint32_t>::Type setIndices(mTriplets.size(), uint32_t(-1));
+
mSetSizes.resize(numSets);
+ uint32_t tripletIndex = 0;
+ TripletIter tIt, tEnd = mTriplets.end();
+
// color triplets (assign to sets)
- Vector<uint32_t>::Type::ConstIterator aBegin = adjacencies.begin(), aIt, aEnd;
- for (tIt = mTriplets.begin(), tripletIndex = 0; tIt != tEnd; ++tIt, ++tripletIndex)
+ for (tIt = mTriplets.begin(); tIt != tEnd; ++tIt, ++tripletIndex)
{
- // mark sets of adjacent triplets
+ // mark sets with adjacent triplets
for (int i = 0; i < 3; ++i)
{
uint32_t particleIndex = (*tIt)[i];
- aIt = aBegin + adjacentCount[particleIndex];
- aEnd = aBegin + adjacentCount[particleIndex + 1];
- for (uint32_t setIndex; aIt != aEnd; ++aIt)
- if (numSets > (setIndex = setIndices[*aIt]))
+ Range<const uint32_t> adjacentTriplets = adjacencyQuerier.getAdjacentTriplets(particleIndex);
+
+ //for all triplets adjacent to (sharing) particle 'particleIndex'
+ for (; adjacentTriplets.begin() != adjacentTriplets.end(); adjacentTriplets.popFront())
+ {
+ uint32_t setIndex;
+ if(numSets > (setIndex = setIndices[adjacentTriplets.front()]))
+ {
+ //the set 'setIndex' has an adjacent triplet in it (sharing at least one vertex with tripletIndex)
sets[setIndex].mMark = tripletIndex;
+ }
+ }
}
- // find valid set with smallest number of bank conflicts
+ // find valid set with smallest number of additional replays when adding triplet tIt
uint32_t bestIndex = numSets;
- uint32_t minReplays = 4;
- for (uint32_t setIndex = 0; setIndex < numSets && minReplays; ++setIndex)
+ uint32_t minAdditionalReplays = 4; // one more than the maximum possible (for 3 particles)
+ for (uint32_t setIndex = 0; setIndex < numSets && minAdditionalReplays; ++setIndex)
{
const TripletSet& set = sets[setIndex];
if (set.mMark == tripletIndex)
- continue; // triplet collision
+ continue; // triplet collision (this set contains an adjacent triplet)
+ // count additional replays caused by inserting triplet 'tIt' into set 'setIndex'
uint32_t numReplays = 0;
- for (uint32_t i = 0; i < 3; ++i)
- numReplays += set.mNumReplays[i] == set.mNumConflicts[i][warpMask & (*tIt)[i]];
+ for(uint32_t i = 0; i < 3; ++i) //for each particle in the triplet
+ {
+ const uint32_t particleIndex = (*tIt)[i];
+ const uint32_t bank = warpMask & particleIndex;
- if (minReplays > numReplays)
+ // an additional replay will only be caused if the additional bank conflict is
+ // on a bank that is the current bottleneck
+ numReplays += set.mNumReplays[i] == set.mNumConflicts[i][bank];
+ }
+
+ // hold on to this set if it has less additional replays compared to our current best set
+ if (minAdditionalReplays > numReplays)
{
- minReplays = numReplays;
+ minAdditionalReplays = numReplays;
bestIndex = setIndex;
}
}
@@ -221,18 +374,26 @@ void cloth::TripletScheduler::warp(uint32_t numParticles, uint32_t warpWidth)
++numSets;
}
- // increment bank conflicts or reset if warp filled
+ // increment mNumReplays or reset if warp filled
TripletSet& set = sets[bestIndex];
if (++mSetSizes[bestIndex] & warpMask)
+ {
for (uint32_t i = 0; i < 3; ++i)
- set.mNumReplays[i] = std::max(set.mNumReplays[i], ++set.mNumConflicts[i][warpMask & (*tIt)[i]]);
+ {
+ const uint32_t particleIndex = (*tIt)[i];
+ const uint32_t bank = warpMask & particleIndex;
+ set.mNumReplays[i] = std::max(set.mNumReplays[i], ++set.mNumConflicts[i][bank]);
+ }
+ }
else
+ {
set = TripletSet();
+ }
setIndices[tripletIndex] = bestIndex;
}
- // reorder triplets
+ // reorder triplets so that the sets are in continuous memory
Vector<uint32_t>::Type setOffsets(mSetSizes.size());
prefixSum(mSetSizes.begin(), mSetSizes.end(), setOffsets.begin());
@@ -243,3 +404,47 @@ void cloth::TripletScheduler::warp(uint32_t numParticles, uint32_t warpWidth)
mTriplets.swap(triplets);
}
+
+
+/* Interface used for unit tests */
+cloth::TripletSchedulerTestInterface::TripletSchedulerTestInterface(Range<const uint32_t[4]> triplets)
+ :
+ mScheduler(new cloth::TripletScheduler(triplets))
+{}
+
+cloth::TripletSchedulerTestInterface::~TripletSchedulerTestInterface()
+{
+ delete mScheduler;
+}
+void cloth::TripletSchedulerTestInterface::simd(uint32_t numParticles, uint32_t simdWidth)
+{
+ mScheduler->simd(numParticles, simdWidth);
+}
+void cloth::TripletSchedulerTestInterface::warp(uint32_t numParticles, uint32_t warpWidth)
+{
+ mScheduler->warp(numParticles, warpWidth);
+}
+
+cloth::Range<const uint32_t> cloth::TripletSchedulerTestInterface::GetTriplets()
+{
+ return Range<const uint32_t>(
+ reinterpret_cast<uint32_t*>(mScheduler->mTriplets.begin()),
+ reinterpret_cast<uint32_t*>(mScheduler->mTriplets.begin() + mScheduler->mTriplets.size())
+ );
+}
+cloth::Range<const uint32_t> cloth::TripletSchedulerTestInterface::GetSetSizes()
+{
+ return Range<const uint32_t>(
+ mScheduler->mSetSizes.begin(),
+ mScheduler->mSetSizes.begin() + mScheduler->mSetSizes.size()
+ );
+}
+
+NV_CLOTH_API(nv::cloth::TripletSchedulerTestInterface*) NvClothCreateTripletScheduler(nv::cloth::Range<const uint32_t[4]> triplets)
+{
+ return new nv::cloth::TripletSchedulerTestInterface(triplets);
+}
+NV_CLOTH_API(void) NvClothDestroyTripletScheduler(nv::cloth::TripletSchedulerTestInterface* interface)
+{
+ delete interface;
+}