diff options
Diffstat (limited to 'NvCloth/src/TripletScheduler.cpp')
| -rw-r--r-- | NvCloth/src/TripletScheduler.cpp | 353 |
1 file changed, 279 insertions, 74 deletions
diff --git a/NvCloth/src/TripletScheduler.cpp b/NvCloth/src/TripletScheduler.cpp index 999be0e..0116200 100644 --- a/NvCloth/src/TripletScheduler.cpp +++ b/NvCloth/src/TripletScheduler.cpp @@ -39,45 +39,132 @@ cloth::TripletScheduler::TripletScheduler(Range<const uint32_t[4]> triplets) { } -// SSE version +// helper functions for TripletScheduler::simd() +namespace +{ + //linear search should be fine for small particlesInBatchSize values + bool isIndexInBatch(uint32_t* particlesInBatch, int particlesInBatchSize, uint32_t index1, uint32_t index2, uint32_t index3) + { + for(int i = 0; i < particlesInBatchSize; i++) + { + if(particlesInBatch[i] == 0xffffffff) + return false; + if(index1 == particlesInBatch[i] || index2 == particlesInBatch[i] || index3 == particlesInBatch[i]) + return true; + } + return false; + } + + void markIndicesInBatch(uint32_t* particlesInBatch, int particlesInBatchSize, uint32_t index1, uint32_t index2, uint32_t index3) + { + for(int i = 0; i < particlesInBatchSize - 2; i++) + { + if(particlesInBatch[i] == 0xffffffff) + { + NV_CLOTH_ASSERT(i + 2 < particlesInBatchSize); + particlesInBatch[i] = index1; + particlesInBatch[i + 1] = index2; + particlesInBatch[i + 2] = index3; + break; + } + } + } +} + +/* +Group triplets in batches of simdWith so that they can be processed in parallel without referencing the same particle. +Not suitable for simdWidth with large values due to linear search. +Note that sets can be larger that simdWidth, and that particles may be referenced from the same set multiple times, + as long as they are not referenced more than once within a batch of simdWidth. The batch resets at the start of + each set. 
+*/ void cloth::TripletScheduler::simd(uint32_t numParticles, uint32_t simdWidth) { - if (mTriplets.empty()) + PX_UNUSED(numParticles); + if(mTriplets.empty()) return; - Vector<uint32_t>::Type mark(numParticles, uint32_t(-1)); + const int particlesInBatchSize = simdWidth * 3; + uint32_t* particlesInBatch = new uint32_t[particlesInBatchSize]; //used to mark used indices in current batch + int setSize = 0; //number of triplets in current set + int amountOfPaddingNeeded = 0; - uint32_t setIndex = 0, setSize = 0; - for (TripletIter tIt = mTriplets.begin(), tEnd = mTriplets.end(); tIt != tEnd; ++setIndex) + for(TripletIter tIt = mTriplets.begin(), tEnd = mTriplets.end(); tIt != tEnd; tIt++) { - TripletIter tLast = tIt + std::min(simdWidth, uint32_t(tEnd - tIt)); - TripletIter tSwap = tEnd; - - for (; tIt != tLast && tIt != tSwap; ++tIt, ++setSize) + if(setSize%simdWidth == 0) { - // swap from tail until independent triplet found - while ((mark[tIt->x] == setIndex || mark[tIt->y] == setIndex || mark[tIt->z] == setIndex) && tIt != --tSwap) - std::iter_swap(tIt, tSwap); + //reset particlesInBatch if we filled the simd width + memset(particlesInBatch, 0xff, sizeof(uint32_t)*particlesInBatchSize); + } - if (tIt == tSwap) - break; // no independent triplet found + TripletIter tSwap = tIt; - // mark vertices to be used in simdIndex - mark[tIt->x] = setIndex; - mark[tIt->y] = setIndex; - mark[tIt->z] = setIndex; + //Look for the next triplet that doesn't share indices with current batch + while(isIndexInBatch(particlesInBatch, particlesInBatchSize, tSwap->x, tSwap->y, tSwap->z)) + { + tSwap++; + if(tSwap == tEnd) + break; } - if (tIt == tSwap) // remaining triplets depend on current set + if(tSwap == tEnd) { - if (setSize > simdWidth) // trim set to multiple of simdWidth - { - uint32_t overflow = setSize % simdWidth; - setSize -= overflow; - tIt -= overflow; - } + //all remaining triplets are share indices with the current batch + + //This doesn't make sense, as it will just 
introduce an extra set with setSize<simdWidth + //Should double check though + // // trim set to multiple of simdWidth with the hope that the trimmed triples + // // will fit in the next set with a multiple of simdWidth size + // if(setSize > (int)simdWidth) + // { + // uint32_t overflow = setSize % simdWidth; + // setSize -= overflow; + // tIt -= overflow; + // } + + //finish set mSetSizes.pushBack(setSize); + amountOfPaddingNeeded += (static_cast<int>(simdWidth) - (setSize % static_cast<int>(simdWidth))) % static_cast<int>(simdWidth); //last modulo avoids adding padding when setSize%simdWidth==0 setSize = 0; + tIt--; //start next set with current triplet + } + else + { + //add triplet to set + markIndicesInBatch(particlesInBatch, particlesInBatchSize, tSwap->x, tSwap->y, tSwap->z); + std::iter_swap(tIt, tSwap); //Place triplet at the end of the current set, so that they in sequence in mTriplets + setSize++; + } + } + if(setSize) + { + //finish last set, if we have one + mSetSizes.pushBack(setSize); + amountOfPaddingNeeded += (static_cast<int>(simdWidth) - (setSize % static_cast<int>(simdWidth)))% static_cast<int>(simdWidth); + } + + // Padding code used to live in SwCloth::setVirtualParticles, now we do it here directly + mPaddedTriplets.reserve(mTriplets.size() + amountOfPaddingNeeded); + + { + Vec4us paddingDummy(static_cast<uint16_t>(numParticles), static_cast<uint16_t>(numParticles) + 1, static_cast<uint16_t>(numParticles) + 2, 0); + + TripletIter tIt = mTriplets.begin(); + //TripletIter tEnd = mTriplets.end(); + SetIter setIt = mSetSizes.begin(); + SetIter setEnd = mSetSizes.end(); + for(; setIt < setEnd; ++setIt) + { + for(unsigned int i = 0; i < *setIt; i++) + { + mPaddedTriplets.pushBack(*tIt); + ++tIt; + } + unsigned int padding = (static_cast<int>(simdWidth) - (*setIt % static_cast<int>(simdWidth))) % static_cast<int>(simdWidth); + for(unsigned int i = 0; i < padding; i++) + { + mPaddedTriplets.pushBack(paddingDummy); + } } } } @@ -95,8 +182,10 @@ 
struct TripletSet } uint32_t mMark; // triplet index - uint8_t mNumReplays[3]; - uint8_t mNumConflicts[3][32]; + + // [3] because each particle is fetched on a different instruction. + uint8_t mNumReplays[3]; //how many times the instruction needs to be executed to resolve all bank conflicts (= maxElement(mNumConflicts[3][0...31])) + uint8_t mNumConflicts[3][32]; //Number of accesses for each of the 32 banks (if it is 1 then there is no conflict on that bank) }; /* @@ -132,83 +221,147 @@ void prefixSum(Iter first, Iter last, Iter dest) *dest = *(dest - 1) + *first; } } -} -// CUDA version -void cloth::TripletScheduler::warp(uint32_t numParticles, uint32_t warpWidth) +class AdjacencyQuerier { - // NV_CLOTH_ASSERT(warpWidth == 32 || warpWidth == 16); +private: + typedef nv::cloth::Vector<uint32_t>::Type UintVector; + typedef nv::cloth::Vector<uint32_t>::Type::Iterator UintVectorIterator; + typedef nv::cloth::Vector<nv::cloth::Vec4u>::Type::ConstIterator ConstTripletIter; + UintVector mAdjacencyIndecies; + UintVector mAdjacencies; + uint32_t mMaxAdjacentCount; + +public: + AdjacencyQuerier(nv::cloth::Vector<nv::cloth::Vec4u>::Type const& triplets, int particleCount) + { + { + UintVector& adjacencyCount = mAdjacencyIndecies; //use same memory as mAdjacencyIndecies + adjacencyCount.resize(particleCount+1, uint32_t(0)); //initialize count to 0. 
Size+1 used later for prefixSum/indices - if (mTriplets.empty()) - return; + // count the number of triplets per particle + for(ConstTripletIter tIt = triplets.begin(); tIt != triplets.end(); ++tIt) + { + for(int i = 0; i < 3; i++) //for each index in the triplet + { + const uint32_t particleIndex = (*tIt)[i]; + adjacencyCount[particleIndex] += 1; + } + } + //adjacentcyCount[i] now holds the number of triplets referring to particle i - TripletIter tIt, tEnd = mTriplets.end(); - uint32_t tripletIndex; + mMaxAdjacentCount = *physx::shdfnd::maxElement(adjacencyCount.begin(), adjacencyCount.end()); - // count number of triplets per particle - Vector<uint32_t>::Type adjacentCount(numParticles + 1, uint32_t(0)); - for (tIt = mTriplets.begin(); tIt != tEnd; ++tIt) - for (int i = 0; i < 3; ++i) - ++adjacentCount[(*tIt)[i]]; + // compute in place prefix sum (inclusive) + prefixSum(adjacencyCount.begin(), adjacencyCount.end(), mAdjacencyIndecies.begin()); + // we use mAdjacencyIndecies from now on + } - /* neither of those were really improving number of batches: - // run simd version to pre-sort particles - simd(numParticles, blockWidth); mSetSizes.resize(0); - // sort according to triplet degree (estimated by sum of adjacentCount) - std::sort(mTriplets.begin(), tEnd, GreaterSum(adjacentCount)); - */ + mAdjacencies.resize(mAdjacencyIndecies.back()); + uint32_t tripletIndex = 0; + for(ConstTripletIter tIt = triplets.begin(); tIt != triplets.end(); ++tIt, ++tripletIndex) + { + for(int i = 0; i < 3; i++) //for each index in the triplet + { + const uint32_t particleIndex = (*tIt)[i]; - uint32_t maxTripletCount = *shdfnd::maxElement(adjacentCount.begin(), adjacentCount.end()); + //Decrement mAdjacencyIndecies here to convert it from inclusive to exclusive prefix sum + const uint32_t adjacencyIndex = --mAdjacencyIndecies[particleIndex]; - // compute in place prefix sum (inclusive) - prefixSum(adjacentCount.begin(), adjacentCount.end(), adjacentCount.begin()); + 
mAdjacencies[adjacencyIndex] = tripletIndex; + } + } + //mAdjacencies[mAdjacencyIndecies[i]] now stores the first triplet index referring to particle i + //mAdjacencies[mAdjacencyIndecies[i+1]] stores one past the last + } - // initialize adjacencies (for each particle, collect touching triplets) - // also converts partial sum in adjacentCount from inclusive to exclusive - Vector<uint32_t>::Type adjacencies(adjacentCount.back()); - for (tIt = mTriplets.begin(), tripletIndex = 0; tIt != tEnd; ++tIt, ++tripletIndex) - for (int i = 0; i < 3; ++i) - adjacencies[--adjacentCount[(*tIt)[i]]] = tripletIndex; + uint32_t getMaxAdjacentCount() const + { + return mMaxAdjacentCount; + } + + nv::cloth::Range<const uint32_t> getAdjacentTriplets(uint32_t particleIndex) const + { + //use .begin() + offset to get around bounds check + auto begin = &*(mAdjacencies.begin() + mAdjacencyIndecies[particleIndex]); + auto end = &*(mAdjacencies.begin() + mAdjacencyIndecies[particleIndex + 1]); + return nv::cloth::Range<const uint32_t>(begin, end); + } +}; + +} + +// CUDA version. 
Doesn't add padding, optimizes for bank conflicts +void cloth::TripletScheduler::warp(uint32_t numParticles, uint32_t warpWidth) +{ + // warpWidth has to be <= 32 and a power of two + NV_CLOTH_ASSERT(warpWidth == 32 || warpWidth == 16); + + if (mTriplets.empty()) + return; + + AdjacencyQuerier adjacencyQuerier(mTriplets, numParticles); uint32_t warpMask = warpWidth - 1; - uint32_t numSets = maxTripletCount; // start with minimum number of sets + uint32_t numSets = adjacencyQuerier.getMaxAdjacentCount(); // start with minimum number of sets Vector<TripletSet>::Type sets(numSets); + + // maps triplets to the set index that contains the triplet (setIndex = setIndices[tripletIndex]) Vector<uint32_t>::Type setIndices(mTriplets.size(), uint32_t(-1)); + mSetSizes.resize(numSets); + uint32_t tripletIndex = 0; + TripletIter tIt, tEnd = mTriplets.end(); + // color triplets (assign to sets) - Vector<uint32_t>::Type::ConstIterator aBegin = adjacencies.begin(), aIt, aEnd; - for (tIt = mTriplets.begin(), tripletIndex = 0; tIt != tEnd; ++tIt, ++tripletIndex) + for (tIt = mTriplets.begin(); tIt != tEnd; ++tIt, ++tripletIndex) { - // mark sets of adjacent triplets + // mark sets with adjacent triplets for (int i = 0; i < 3; ++i) { uint32_t particleIndex = (*tIt)[i]; - aIt = aBegin + adjacentCount[particleIndex]; - aEnd = aBegin + adjacentCount[particleIndex + 1]; - for (uint32_t setIndex; aIt != aEnd; ++aIt) - if (numSets > (setIndex = setIndices[*aIt])) + Range<const uint32_t> adjacentTriplets = adjacencyQuerier.getAdjacentTriplets(particleIndex); + + //for all triplets adjacent to (sharing) particle 'particleIndex' + for (; adjacentTriplets.begin() != adjacentTriplets.end(); adjacentTriplets.popFront()) + { + uint32_t setIndex; + if(numSets > (setIndex = setIndices[adjacentTriplets.front()])) + { + //the set 'setIndex' has an adjacent triplet in it (sharing at least one vertex with tripletIndex) sets[setIndex].mMark = tripletIndex; + } + } } - // find valid set with smallest 
number of bank conflicts + // find valid set with smallest number of additional replays when adding triplet tIt uint32_t bestIndex = numSets; - uint32_t minReplays = 4; - for (uint32_t setIndex = 0; setIndex < numSets && minReplays; ++setIndex) + uint32_t minAdditionalReplays = 4; // one more than the maximum possible (for 3 particles) + for (uint32_t setIndex = 0; setIndex < numSets && minAdditionalReplays; ++setIndex) { const TripletSet& set = sets[setIndex]; if (set.mMark == tripletIndex) - continue; // triplet collision + continue; // triplet collision (this set contains an adjacent triplet) + // count additional replays caused by inserting triplet 'tIt' to in set 'setIndex' uint32_t numReplays = 0; - for (uint32_t i = 0; i < 3; ++i) - numReplays += set.mNumReplays[i] == set.mNumConflicts[i][warpMask & (*tIt)[i]]; + for(uint32_t i = 0; i < 3; ++i) //for each particle in the triplet + { + const uint32_t particleIndex = (*tIt)[i]; + const uint32_t bank = warpMask & particleIndex; - if (minReplays > numReplays) + // an additional replay will only be caused if the additional bank conflict is + // on a bank that is the current bottleneck + numReplays += set.mNumReplays[i] == set.mNumConflicts[i][bank]; + } + + // hold on to this set if it has less additional replays compared to our current best set + if (minAdditionalReplays > numReplays) { - minReplays = numReplays; + minAdditionalReplays = numReplays; bestIndex = setIndex; } } @@ -221,18 +374,26 @@ void cloth::TripletScheduler::warp(uint32_t numParticles, uint32_t warpWidth) ++numSets; } - // increment bank conflicts or reset if warp filled + // increment mNumReplays or reset if warp filled TripletSet& set = sets[bestIndex]; if (++mSetSizes[bestIndex] & warpMask) + { for (uint32_t i = 0; i < 3; ++i) - set.mNumReplays[i] = std::max(set.mNumReplays[i], ++set.mNumConflicts[i][warpMask & (*tIt)[i]]); + { + const uint32_t particleIndex = (*tIt)[i]; + const uint32_t bank = warpMask & particleIndex; + set.mNumReplays[i] 
= std::max(set.mNumReplays[i], ++set.mNumConflicts[i][bank]); + } + } else + { set = TripletSet(); + } setIndices[tripletIndex] = bestIndex; } - // reorder triplets + // reorder triplets so that the sets are in continuous memory Vector<uint32_t>::Type setOffsets(mSetSizes.size()); prefixSum(mSetSizes.begin(), mSetSizes.end(), setOffsets.begin()); @@ -243,3 +404,47 @@ void cloth::TripletScheduler::warp(uint32_t numParticles, uint32_t warpWidth) mTriplets.swap(triplets); } + + +/* Interface used for unit tests */ +cloth::TripletSchedulerTestInterface::TripletSchedulerTestInterface(Range<const uint32_t[4]> triplets) + : + mScheduler(new cloth::TripletScheduler(triplets)) +{} + +cloth::TripletSchedulerTestInterface::~TripletSchedulerTestInterface() +{ + delete mScheduler; +} +void cloth::TripletSchedulerTestInterface::simd(uint32_t numParticles, uint32_t simdWidth) +{ + mScheduler->simd(numParticles, simdWidth); +} +void cloth::TripletSchedulerTestInterface::warp(uint32_t numParticles, uint32_t warpWidth) +{ + mScheduler->warp(numParticles, warpWidth); +} + +cloth::Range<const uint32_t> cloth::TripletSchedulerTestInterface::GetTriplets() +{ + return Range<const uint32_t>( + reinterpret_cast<uint32_t*>(mScheduler->mTriplets.begin()), + reinterpret_cast<uint32_t*>(mScheduler->mTriplets.begin() + mScheduler->mTriplets.size()) + ); +} +cloth::Range<const uint32_t> cloth::TripletSchedulerTestInterface::GetSetSizes() +{ + return Range<const uint32_t>( + mScheduler->mSetSizes.begin(), + mScheduler->mSetSizes.begin() + mScheduler->mSetSizes.size() + ); +} + +NV_CLOTH_API(nv::cloth::TripletSchedulerTestInterface*) NvClothCreateTripletScheduler(nv::cloth::Range<const uint32_t[4]> triplets) +{ + return new nv::cloth::TripletSchedulerTestInterface(triplets); +} +NV_CLOTH_API(void) NvClothDestroyTripletScheduler(nv::cloth::TripletSchedulerTestInterface* interface) +{ + delete interface; +} |