// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions // are met: // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // * Neither the name of NVIDIA CORPORATION nor the names of its // contributors may be used to endorse or promote products derived // from this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // // Copyright (c) 2018 NVIDIA Corporation. All rights reserved. #include "RTdef.h" #if RT_COMPILE /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// /** * Contains source code from the article "Radix Sort Revisited". * \file IceRevisitedRadix.cpp * \author Pierre Terdiman * \date April, 4, 2000 */ /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// /** * Revisited Radix Sort. * This is my new radix routine: * - it uses indices and doesn't recopy the values anymore, hence wasting less ram * - it creates all the histograms in one run instead of four * - it sorts words faster than dwords and bytes faster than words * - it correctly sorts negative floating-point values by patching the offsets * - it automatically takes advantage of temporal coherence * - multiple keys support is a side effect of temporal coherence * - it may be worth recoding in asm... (mainly to use FCOMI, FCMOV, etc) [it's probably memory-bound anyway] * * History: * - 08.15.98: very first version * - 04.04.00: recoded for the radix article * - 12.xx.00: code lifting * - 09.18.01: faster CHECK_PASS_VALIDITY thanks to Mark D. Shattuck (who provided other tips, not included here) * - 10.11.01: added local ram support * - 01.20.02: bugfix! In very particular cases the last pass was skipped in the float code-path, leading to incorrect sorting...... * - 01.02.02: - "mIndices" renamed => "mRanks". That's a rank sorter after all. * - ranks are not "reset" anymore, but implicit on first calls * * \class RadixSort * \author Pierre Terdiman * \version 1.4 * \date August, 15, 1998 */ /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// /* To do: - add an offset parameter between two input values (avoid some data recopy sometimes) - unroll ? asm ? - 11 bits trick & 3 passes as Michael did - prefetch stuff the day I have a P3 */ /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // Precompiled Header #include "IceRevisitedRadixBase.h" namespace nvidia { namespace fracture { namespace base { #define INVALIDATE_RANKS mCurrentSize|=0x80000000 #define VALIDATE_RANKS mCurrentSize&=0x7fffffff #define CURRENT_SIZE (mCurrentSize&0x7fffffff) #define INVALID_RANKS (mCurrentSize&0x80000000) #define CHECK_RESIZE(n) \ if(n!=mPreviousSize) \ { \ if(n>mCurrentSize) Resize(n); \ else ResetRanks(); \ mPreviousSize = n; \ } #define CREATE_HISTOGRAMS(type, buffer) \ /* Clear counters/histograms */ \ \ memset(mHistogram, 0, 256*4*sizeof(uint32_t)); \ \ /* Prepare to count */ \ uint8_t* p = (uint8_t*)input; \ uint8_t* pe = &p[nb*4]; \ uint32_t* h0= &mHistogram[0]; /* Histogram for first pass (LSB) */ \ uint32_t* h1= &mHistogram[256]; /* Histogram for second pass */ \ uint32_t* h2= &mHistogram[512]; /* Histogram for third pass */ \ uint32_t* h3= &mHistogram[768]; /* Histogram for last pass (MSB) */ \ \ bool AlreadySorted = true; /* Optimism... */ \ \ if(INVALID_RANKS) \ { \ /* Prepare for temporal coherence */ \ type* Running = (type*)buffer; \ type PrevVal = *Running; \ \ while(p!=pe) \ { \ /* Read input buffer in previous sorted order */ \ type Val = *Running++;; \ /* Check whether already sorted or not */ \ if(Val= nb) return true; // Free previously used ram PX_FREE(mRanks2); PX_FREE(mRanks); // Get some fresh one mRanks = (uint32_t*) PX_ALLOC(sizeof(uint32_t)*nb,PX_DEBUG_EXP("RT_FRACTURE")); mRanks2 = (uint32_t*) PX_ALLOC(sizeof(uint32_t)*nb,PX_DEBUG_EXP("RT_FRACTURE")); mRanksSize = nb; return true; } inline void RadixSort::CheckResize(uint32_t nb) { uint32_t CurSize = CURRENT_SIZE; if(nb!=CurSize) { if(nb>CurSize) Resize(nb); mCurrentSize = nb; INVALIDATE_RANKS; } } /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// /** * Main sort routine. * This one is for integer values. After the call, mRanks contains a list of indices in sorted order, i.e. in the order you may process your data. * \param input [in] a list of integer values to sort * \param nb [in] number of values to sort, must be < 2^31 * \param signedvalues [in] true to handle negative values, false if you know your input buffer only contains positive values * \return Self-Reference */ /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// RadixSort& RadixSort::Sort(const uint32_t* input, uint32_t nb, bool signedvalues) { // Checkings if(!input || !nb || nb&0x80000000) return *this; // Stats mTotalCalls++; // Resize lists if needed CheckResize(nb); // Create histograms (counters). Counters for all passes are created in one run. // Pros: read input buffer once instead of four times // Cons: mHistogram is 4Kb instead of 1Kb // We must take care of signed/unsigned values for temporal coherence.... I just // have 2 code paths even if just a single opcode changes. Self-modifying code, someone? if(!signedvalues) { CREATE_HISTOGRAMS(uint32_t, input); } else { CREATE_HISTOGRAMS(int32_t, input); } // Compute #negative values involved if needed uint32_t NbNegativeValues = 0; if(signedvalues) { // An efficient way to compute the number of negatives values we'll have to deal with is simply to sum the 128 // last values of the last histogram. Last histogram because that's the one for the Most Significant Byte, // responsible for the sign. 128 last values because the 128 first ones are related to positive numbers. uint32_t* h3= &mHistogram[768]; for(uint32_t i=128;i<256;i++) NbNegativeValues += h3[i]; // 768 for last histogram, 128 for negative part } // Radix sort, j is the pass number (0=LSB, 3=MSB) for(uint32_t j=0;j<4;j++) { CHECK_PASS_VALIDITY(j); // Sometimes the fourth (negative) pass is skipped because all numbers are negative and the MSB is 0xFF (for example). This is // not a problem, numbers are correctly sorted anyway. if(PerformPass) { // Should we care about negative values? if(j!=3 || !signedvalues) { // Here we deal with positive values only // Create offsets mOffset[0] = 0; for(uint32_t i=1;i<256;i++) mOffset[i] = mOffset[i-1] + CurCount[i-1]; } else { // This is a special case to correctly handle negative integers. They're sorted in the right order but at the wrong place. // Create biased offsets, in order for negative numbers to be sorted as well mOffset[0] = NbNegativeValues; // First positive number takes place after the negative ones for(uint32_t i=1;i<128;i++) mOffset[i] = mOffset[i-1] + CurCount[i-1]; // 1 to 128 for positive numbers // Fixing the wrong place for negative values mOffset[128] = 0; for(uint32_t i=129;i<256;i++) mOffset[i] = mOffset[i-1] + CurCount[i-1]; } // Perform Radix Sort uint8_t* InputBytes = (uint8_t*)input; InputBytes += j; if(INVALID_RANKS) { for(uint32_t i=0;i>24; // Radix byte, same as above. AND is useless here (uint32_t). // ### cmp to be killed. Not good. Later. if(Radix<128) mRanks2[mOffset[Radix]++] = i; // Number is positive, same as above else mRanks2[--mOffset[Radix]] = i; // Number is negative, flip the sorting order } VALIDATE_RANKS; } else { for(uint32_t i=0;i>24; // Radix byte, same as above. AND is useless here (uint32_t). // ### cmp to be killed. Not good. Later. if(Radix<128) mRanks2[mOffset[Radix]++] = mRanks[i]; // Number is positive, same as above else mRanks2[--mOffset[Radix]] = mRanks[i]; // Number is negative, flip the sorting order } } // Swap pointers for next pass. Valid indices - the most recent ones - are in mRanks after the swap. uint32_t* Tmp = mRanks; mRanks = mRanks2; mRanks2 = Tmp; } else { // The pass is useless, yet we still have to reverse the order of current list if all values are negative. if(UniqueVal>=128) { if(INVALID_RANKS) { // ###Possible? for(uint32_t i=0;i