// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions // are met: // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // * Neither the name of NVIDIA CORPORATION nor the names of its // contributors may be used to endorse or promote products derived // from this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // // Copyright (c) 2008-2018 NVIDIA Corporation. All rights reserved. // Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. // Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 
#include "foundation/PxMemory.h" #include "SqBucketPruner.h" #include "GuIntersectionBoxBox.h" #include "GuInternal.h" #include "PsVecMath.h" #include "foundation/PxUnionCast.h" #include "CmRadixSortBuffered.h" #include "CmRenderOutput.h" #include "PsFPU.h" #include "PsBitUtils.h" #include "PsIntrinsics.h" #include "GuBounds.h" using namespace physx::shdfnd::aos; using namespace physx; using namespace Sq; using namespace Gu; using namespace Ps; #define INVALID_HANDLE 0xffffffff /* TODO: - if Core is always available, mSortedObjects could be replaced with just indices to mCoreObjects => less memory. - UTS: - test that queries against empty boxes all return false - invalidate after 16 removes - check shiftOrigin stuff (esp what happens to emptied boxes) - isn't there a very hard-to-find bug waiting to happen in there, when the shift touches the empty box and overrides mdata0/mdata1 with "wrong" values that break the sort? - revisit updateObject/removeObject - optimize/cache computation of free global bounds before clipRay - remove temp memory buffers (sorted arrays) - take care of code duplication - better code to generate SIMD 0x7fffffff - refactor SIMD tests - optimize: - better split values - optimize update (bitmap, less data copy, etc) - use ray limits in traversal code too? - the SIMD XBOX code operates on Min/Max rather than C/E. Change format? - or just try the alternative ray-box code (as on PC) ==> pretty much exactly the same speed */ //#define VERIFY_SORT //#define BRUTE_FORCE_LIMIT 32 #define LOCAL_SIZE 256 // Size of various local arrays. Dynamic allocations occur if exceeded. 
#define USE_SIMD // Use SIMD code or not (sanity performance check) #define NODE_SORT // Enable/disable node sorting #define NODE_SORT_MIN_COUNT 16 // Limit above which node sorting is performed #if PX_INTEL_FAMILY #if COMPILE_VECTOR_INTRINSICS #define CAN_USE_MOVEMASK #endif #endif #define ALIGN16(size) ((unsigned(size)+15) & unsigned(~15)) #ifdef _DEBUG #define AlignedLoad V4LoadU #define AlignedStore V4StoreU #else #define AlignedLoad V4LoadA #define AlignedStore V4StoreA #endif // SAT-based ray-box overlap test has accuracy issues for long rays, so we clip them against the global AABB to limit these issues. static void clipRay(const PxVec3& rayOrig, const PxVec3& rayDir, float& maxDist, const PxVec3& boxMin, const PxVec3& boxMax) { const PxVec3 boxCenter = (boxMax + boxMin)*0.5f; const PxVec3 boxExtents = (boxMax - boxMin)*0.5f; const float dpc = boxCenter.dot(rayDir); const float extentsMagnitude = boxExtents.magnitude(); const float dpMin = dpc - extentsMagnitude; const float dpMax = dpc + extentsMagnitude; const float dpO = rayOrig.dot(rayDir); const float boxLength = extentsMagnitude * 2.0f; const float distToBox = PxMin(PxAbs(dpMin - dpO), PxAbs(dpMax - dpO)); maxDist = distToBox + boxLength * 2.0f; } BucketPrunerNode::BucketPrunerNode() { for(PxU32 i=0;i<5;i++) mBucketBox[i].setEmpty(); } static const PxU8 gCodes[] = { 4, 4, 4, 4, 4, 3, 2, 2, 4, 1, 0, 0, 4, 1, 0, 0, 4, 1, 0, 0, 2, 1, 0, 0, 3, 1, 0, 0, 2, 1, 0, 0}; #ifdef CAN_USE_MOVEMASK /*static PX_FORCE_INLINE PxU32 classifyBox_x86(const BucketBox& box, const PxVec4& limits, const bool useY, const bool isCrossBucket) { const Vec4V extents = AlignedLoad(&box.mExtents.x); const Vec4V center = AlignedLoad(&box.mCenter.x); const Vec4V plus = V4Add(extents, center); const Vec4V minus = V4Sub(extents, center); Vec4V tmp; if(useY) // PT: this is a constant so branch prediction works here tmp = _mm_shuffle_ps(plus, minus, _MM_SHUFFLE(0,1,0,1)); else tmp = _mm_shuffle_ps(plus, minus, _MM_SHUFFLE(0,2,0,2)); const 
Vec4V comp = _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(0,2,1,3)); // oh well, nm const PxU32 Code = (PxU32)_mm_movemask_ps(V4IsGrtr(V4LoadA(&limits.x), comp)); return gCodes[Code | PxU32(isCrossBucket)<<4]; }*/ static PX_FORCE_INLINE PxU32 classifyBox_x86(const Vec4V boxMin, const Vec4V boxMax, const PxVec4& limits, const bool useY, const bool isCrossBucket) { const Vec4V plus = boxMax; const Vec4V minus = V4Neg(boxMin); Vec4V tmp; if(useY) // PT: this is a constant so branch prediction works here tmp = _mm_shuffle_ps(plus, minus, _MM_SHUFFLE(0,1,0,1)); else tmp = _mm_shuffle_ps(plus, minus, _MM_SHUFFLE(0,2,0,2)); const Vec4V comp = _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(0,2,1,3)); // oh well, nm const PxU32 Code = PxU32(_mm_movemask_ps(V4IsGrtr(V4LoadA(&limits.x), comp))); return gCodes[Code | PxU32(isCrossBucket)<<4]; } #endif #ifdef CAN_USE_MOVEMASK #if PX_DEBUG #define USE_CLASSIFY_BOX #endif #else #define USE_CLASSIFY_BOX #endif #ifdef USE_CLASSIFY_BOX static PX_FORCE_INLINE PxU32 classifyBox(const BucketBox& box, const float limitX, const float limitYZ, const PxU32 yz, const bool isCrossBucket) { const bool upperPart = (box.mCenter[yz] + box.mExtents[yz])limitYZ; const bool leftPart = (box.mCenter.x + box.mExtents.x)limitX; // Table-based box classification avoids many branches const PxU32 Code = PxU32(rightPart)|(PxU32(leftPart)<<1)|(PxU32(lowerPart)<<2)|(PxU32(upperPart)<<3); return gCodes[Code + (isCrossBucket ? 16 : 0)]; } #endif void BucketPrunerNode::classifyBoxes( float limitX, float limitYZ, PxU32 nb, BucketBox* PX_RESTRICT boxes, const PrunerPayload* PX_RESTRICT objects, BucketBox* PX_RESTRICT sortedBoxes, PrunerPayload* PX_RESTRICT sortedObjects, bool isCrossBucket, PxU32 sortAxis) { const PxU32 yz = PxU32(sortAxis == 1 ? 
2 : 1); #ifdef _DEBUG { float prev = boxes[0].mDebugMin; for(PxU32 i=1;i=prev); prev = current; } } #endif // Local (stack-based) min/max bucket bounds PX_ALIGN(16, PxVec4) bucketBoxMin[5]; PX_ALIGN(16, PxVec4) bucketBoxMax[5]; { const PxBounds3 empty = PxBounds3::empty(); for(PxU32 i=0;i<5;i++) { mCounters[i] = 0; bucketBoxMin[i] = PxVec4(empty.minimum, 0.0f); bucketBoxMax[i] = PxVec4(empty.maximum, 0.0f); } } { #ifdef CAN_USE_MOVEMASK // DS: order doesn't play nice with x86 shuffles :-| PX_ALIGN(16, PxVec4) limits(-limitX, limitX, -limitYZ, limitYZ); const bool useY = yz==1; #endif // Determine in which bucket each object falls, update bucket bounds for(PxU32 i=0;i=prev); prev = current; } } } #endif } /////////////////////////////////////////////////////////////////////////////// static void processChildBuckets(PxU32 nbAllocated, BucketBox* sortedBoxesInBucket, PrunerPayload* sortedObjectsInBucket, const BucketPrunerNode& bucket, BucketPrunerNode* PX_RESTRICT childBucket, BucketBox* PX_RESTRICT baseBucketsBoxes, PrunerPayload* PX_RESTRICT baseBucketsObjects, PxU32 sortAxis) { PX_UNUSED(nbAllocated); const PxU32 yz = PxU32(sortAxis == 1 ? 
2 : 1);	// 'yz' = secondary classification axis: Z if the sort axis is Y, else Y

	// For each of the parent's 5 buckets: classify that bucket's object range into
	// the corresponding 5 grandchild buckets, then copy the re-ordered range back
	// in place over the original range.
	for(PxU32 i=0;i<5;i++)
	{
		const PxU32 nbInBucket = bucket.mCounters[i];
		if(!nbInBucket)
		{
			// Empty bucket: just reset the child's counters/offsets.
			childBucket[i].initCounters();
			continue;
		}
		// Start of this bucket's sub-range within the base arrays.
		BucketBox* bucketsBoxes = baseBucketsBoxes + bucket.mOffsets[i];
		PrunerPayload* bucketsObjects = baseBucketsObjects + bucket.mOffsets[i];
		PX_ASSERT(nbInBucket<=nbAllocated);

		// Split around the center of the parent bucket's box; bucket 4 is the
		// center/cross bucket.
		const float limitX = bucket.mBucketBox[i].mCenter.x;
		const float limitYZ = bucket.mBucketBox[i].mCenter[yz];
		const bool isCrossBucket = i==4;
		childBucket[i].classifyBoxes(limitX, limitYZ, nbInBucket, bucketsBoxes, bucketsObjects, sortedBoxesInBucket, sortedObjectsInBucket, isCrossBucket, sortAxis);

		// Copy the bucketed temp arrays back over the original sub-range.
		PxMemCopy(bucketsBoxes, sortedBoxesInBucket, sizeof(BucketBox)*nbInBucket);
		PxMemCopy(bucketsObjects, sortedObjectsInBucket, sizeof(PrunerPayload)*nbInBucket);
	}
}

///////////////////////////////////////////////////////////////////////////////

// Maps the IEEE-754 bit pattern of a float to an unsigned integer whose natural
// (unsigned) ordering matches the ordering of the original floats, so boxes can
// be compared/sorted with integer comparisons.
static PX_FORCE_INLINE PxU32 encodeFloat(PxU32 newPos)
{
	//we may need to check on -0 and 0
	//But it should make no practical difference.
	if(newPos & PX_SIGN_BITMASK) //negative?
return ~newPos;//reverse sequence of negative numbers else return newPos | PX_SIGN_BITMASK; // flip sign } static PX_FORCE_INLINE void computeRayLimits(float& rayMin, float& rayMax, const PxVec3& rayOrig, const PxVec3& rayDir, float maxDist, PxU32 sortAxis) { const float rayOrigValue = rayOrig[sortAxis]; const float rayDirValue = rayDir[sortAxis] * maxDist; rayMin = PxMin(rayOrigValue, rayOrigValue + rayDirValue); rayMax = PxMax(rayOrigValue, rayOrigValue + rayDirValue); } static PX_FORCE_INLINE void computeRayLimits(float& rayMin, float& rayMax, const PxVec3& rayOrig, const PxVec3& rayDir, float maxDist, const PxVec3& inflate, PxU32 sortAxis) { const float inflateValue = inflate[sortAxis]; const float rayOrigValue = rayOrig[sortAxis]; const float rayDirValue = rayDir[sortAxis] * maxDist; rayMin = PxMin(rayOrigValue, rayOrigValue + rayDirValue) - inflateValue; rayMax = PxMax(rayOrigValue, rayOrigValue + rayDirValue) + inflateValue; } static PX_FORCE_INLINE void encodeBoxMinMax(BucketBox& box, const PxU32 axis) { const float min = box.mCenter[axis] - box.mExtents[axis]; const float max = box.mCenter[axis] + box.mExtents[axis]; const PxU32* binaryMin = reinterpret_cast(&min); const PxU32* binaryMax = reinterpret_cast(&max); box.mData0 = encodeFloat(binaryMin[0]); box.mData1 = encodeFloat(binaryMax[0]); } /////////////////////////////////////////////////////////////////////////////// BucketPrunerCore::BucketPrunerCore(bool externalMemory) : mCoreNbObjects (0), mCoreCapacity (0), mCoreBoxes (NULL), mCoreObjects (NULL), mCoreRemap (NULL), mSortedWorldBoxes (NULL), mSortedObjects (NULL), mNbFree (0), mSortedNb (0), mSortedCapacity (0), mSortAxis (0), mDirty (true), mOwnMemory (!externalMemory) { mGlobalBox.setEmpty(); mLevel1.initCounters(); for(PxU32 i=0;i<5;i++) mLevel2[i].initCounters(); for(PxU32 j=0;j<5;j++) for(PxU32 i=0;i<5;i++) mLevel3[j][i].initCounters(); } BucketPrunerCore::~BucketPrunerCore() { release(); } void BucketPrunerCore::release() { mDirty = true; 
mCoreNbObjects = 0; mCoreCapacity = 0; if(mOwnMemory) { PX_FREE_AND_RESET(mCoreBoxes); PX_FREE_AND_RESET(mCoreObjects); PX_FREE_AND_RESET(mCoreRemap); } PX_FREE_AND_RESET(mSortedWorldBoxes); PX_FREE_AND_RESET(mSortedObjects); mSortedNb = 0; mSortedCapacity = 0; mNbFree = 0; #ifdef USE_REGULAR_HASH_MAP mMap.clear(); #else mMap.purge(); #endif } void BucketPrunerCore::setExternalMemory(PxU32 nbObjects, PxBounds3* boxes, PrunerPayload* objects) { PX_ASSERT(!mOwnMemory); mCoreNbObjects = nbObjects; mCoreBoxes = boxes; mCoreObjects = objects; mCoreRemap = NULL; } void BucketPrunerCore::allocateSortedMemory(PxU32 nb) { mSortedNb = nb; if(nb<=mSortedCapacity && (nb>=mSortedCapacity/2)) return; const PxU32 capacity = Ps::nextPowerOfTwo(nb); mSortedCapacity = capacity; PxU32 bytesNeededForBoxes = capacity*sizeof(BucketBox); bytesNeededForBoxes = ALIGN16(bytesNeededForBoxes); PxU32 bytesNeededForObjects = capacity*sizeof(PrunerPayload); bytesNeededForObjects = ALIGN16(bytesNeededForObjects); PX_FREE(mSortedObjects); PX_FREE(mSortedWorldBoxes); mSortedWorldBoxes = reinterpret_cast(PX_ALLOC(bytesNeededForBoxes, "BucketPruner")); mSortedObjects = reinterpret_cast(PX_ALLOC(bytesNeededForObjects, "BucketPruner")); PX_ASSERT(!(size_t(mSortedWorldBoxes)&15)); PX_ASSERT(!(size_t(mSortedObjects)&15)); } /////////////////////////////////////////////////////////////////////////////// void BucketPrunerCore::resizeCore() { const PxU32 capacity = mCoreCapacity ? 
mCoreCapacity*2 : 32; mCoreCapacity = capacity; const PxU32 bytesNeededForBoxes = capacity*sizeof(PxBounds3); const PxU32 bytesNeededForObjects = capacity*sizeof(PrunerPayload); const PxU32 bytesNeededForRemap = capacity*sizeof(PxU32); PxBounds3* newCoreBoxes = reinterpret_cast(PX_ALLOC(bytesNeededForBoxes, "BucketPruner")); PrunerPayload* newCoreObjects = reinterpret_cast(PX_ALLOC(bytesNeededForObjects, "BucketPruner")); PxU32* newCoreRemap = reinterpret_cast(PX_ALLOC(bytesNeededForRemap, "BucketPruner")); if(mCoreBoxes) { PxMemCopy(newCoreBoxes, mCoreBoxes, mCoreNbObjects*sizeof(PxBounds3)); PX_FREE(mCoreBoxes); } if(mCoreObjects) { PxMemCopy(newCoreObjects, mCoreObjects, mCoreNbObjects*sizeof(PrunerPayload)); PX_FREE(mCoreObjects); } if(mCoreRemap) { PxMemCopy(newCoreRemap, mCoreRemap, mCoreNbObjects*sizeof(PxU32)); PX_FREE(mCoreRemap); } mCoreBoxes = newCoreBoxes; mCoreObjects = newCoreObjects; mCoreRemap = newCoreRemap; } PX_FORCE_INLINE void BucketPrunerCore::addObjectInternal(const PrunerPayload& object, const PxBounds3& worldAABB, PxU32 timeStamp) { if(mCoreNbObjects==mCoreCapacity) resizeCore(); const PxU32 index = mCoreNbObjects++; mCoreObjects[index] = object; mCoreBoxes[index] = worldAABB; // PT: TODO: check assembly here mCoreRemap[index] = 0xffffffff; // Objects are only inserted into the map once they're part of the main/core arrays. #ifdef USE_REGULAR_HASH_MAP bool ok = mMap.insert(object, BucketPrunerPair(index, timeStamp)); #else BucketPrunerPair* ok = mMap.addPair(object, index, timeStamp); #endif PX_UNUSED(ok); PX_ASSERT(ok); } bool BucketPrunerCore::addObject(const PrunerPayload& object, const PxBounds3& worldAABB, PxU32 timeStamp) { /* We should probably use a bigger Payload struct here, which would also contains the external handle. (EDIT: we can't even do that, because of the setExternalMemory function) When asked to update/remove an object it would be O(n) to find the proper object in the mSortedObjects array. 
- For removing it we can simply empty the corresponding box, and the object will never be returned from queries. Maybe this isn't even true, since boxes are sorted along one axis. So marking a box as empty could break the code relying on a sorted order. An alternative is to mark the external handle as invalid, and ignore the object when a hit is found. (EDIT: the sorting is now tested via data0/data1 anyway so we could mark the box as empty without breaking this) - For updating an object we would need to keep the (sub) array sorted (not the whole thing, only the array within a bucket). We don't know the range (what part of the array maps to our bucket) but we may have the bucket ID somewhere? If we'd have this we could parse the array left/right and resort just the right boxes. If we don't have this we may be able to "quickly" find the range by traversing the tree, looking for the proper bucket. In any case I don't think there's a mapping to update within a bucket, unlike in SAP or MBP. So we should be able to shuffle a bucket without having to update anything. For example there's no mapping between the Core array and the Sorted array. It's a shame in a way because we'd need one, but it's not there - and in fact I think we can free the Core array once Sorted is created, we don't need it at all. If we don't want to re-sort the full bucket we can just mark it as dirty and ignore the sort-based early exits in the queries. Then we can incrementally resort it over N frames or something. This only works if the updated object remains in the same bucket though. If it moves to another bucket it becomes tempting to just remove the object and re-insert it. - Now for adding an object, we can first have a "free pruner" and do the 16 next entries brute-force. Rebuilding every 16 objects might give a good speedup already. Otherwise we need to do something more complicated. 
*/ PX_ASSERT(mOwnMemory); PX_ASSERT(!mDirty || !mNbFree); if(!mDirty) { // In this path the structure is marked as valid. We do not want to invalidate it for each new object... if(mNbFreesecond.mCoreIndex; timeStamp = removedEntry->second.mTimeStamp; #else PxU32 coreIndex; // This is the object's index in the core arrays. if(mMap.removePair(object, coreIndex, timeStamp)) { #endif // In this codepath, the object we want to remove exists in the core arrays. // We will need to remove it from both the core arrays & the sorted arrays. const PxU32 sortedIndex = mCoreRemap[coreIndex]; // This is the object's index in the sorted arrays. #ifdef USE_REGULAR_HASH_MAP bool status = mMap.erase(object); PX_ASSERT(status); PX_UNUSED(status); #endif // First let's deal with the core arrays mCoreNbObjects--; if(coreIndex!=mCoreNbObjects) { // If it wasn't the last object in the array, close the gaps as usual const PrunerPayload& movedObject = mCoreObjects[mCoreNbObjects]; mCoreBoxes[coreIndex] = mCoreBoxes[mCoreNbObjects]; mCoreObjects[coreIndex] = movedObject; mCoreRemap[coreIndex] = mCoreRemap[mCoreNbObjects]; // Since we just moved the last object, its index in the core arrays has changed. // We must reflect this change in the map. #ifdef USE_REGULAR_HASH_MAP BucketPrunerMap::Entry* movedEntry = const_cast(mMap.find(movedObject)); PX_ASSERT(movedEntry->second.mCoreIndex==mCoreNbObjects); movedEntry->second.mCoreIndex = coreIndex; #else BucketPrunerPair* movedEntry = const_cast(mMap.findPair(movedObject)); PX_ASSERT(movedEntry->mCoreIndex==mCoreNbObjects); movedEntry->mCoreIndex = coreIndex; #endif } // Now, let's deal with the sorted arrays. // If the structure is dirty, the sorted arrays will be rebuilt from scratch so there's no need to // update them right now. if(!mDirty) { // If the structure is valid, we want to keep it this way to avoid rebuilding sorted arrays after // each removal. We can't "close the gaps" easily here because order of objects in the arrays matters. 
// Instead we just invalidate the object by setting its bounding box as empty. // Queries against empty boxes will never return a hit, so this effectively "removes" the object // from any subsequent query results. Sorted arrays now contain a "disabled" object, until next build. // Invalidating the box does not invalidate the sorting, since it's now captured in mData0/mData1. // That is, mData0/mData1 keep their previous integer-encoded values, as if the box/object was still here. // PxBounds3 empty; // empty.setEmpty(); mSortedWorldBoxes[sortedIndex].mCenter = PxVec3(0.0f); mSortedWorldBoxes[sortedIndex].mExtents = PxVec3(-2.0f*PxSqrt(0.25f * 1e33f)); // PT: TODO: refactor value with similar one in SqAABBTree.cpp // Note that we don't touch mSortedObjects here. We could, but this is not necessary. } return true; } // Here, the object we want to remove exists in the free array. So we just parse it. for(PxU32 i=0;i(mMap.find(movedObject)); PX_ASSERT(movedEntry->second.mCoreIndex==coreNbObjects); movedEntry->second.mCoreIndex = coreIndex; #else BucketPrunerPair* movedEntry = const_cast(mMap.findPair(movedObject)); PX_ASSERT(movedEntry->mCoreIndex==coreNbObjects); movedEntry->mCoreIndex = coreIndex; #endif } nbRemoved++; #ifdef USE_REGULAR_HASH_MAP bool status = mMap.erase(p.first); PX_ASSERT(status); PX_UNUSED(status); #else const PxU32 hashValue = hash(p.mPayload) & mMap.mMask; mMap.removePairInternal(p.mPayload, hashValue, i); #endif nbActivePairs--; } else i++; } mCoreNbObjects = coreNbObjects; #ifdef USE_REGULAR_HASH_MAP #else mMap.shrinkMemory(); #endif } // PT: ...then we look in the 'free' array PxU32 i=0; while(i0); Vec4V mergedMinV = V4LoadU(&boxes[nb-1].minimum.x); Vec4V mergedMaxV = Vec4V_From_Vec3V(V3LoadU(&boxes[nb-1].maximum.x)); for(PxU32 i=0;i(sortedObjects); for(PxU32 i=0;i PX_CUDA_CALLABLE PX_FORCE_INLINE void tswap(T& x, T& y) { T tmp = x; x = y; y = tmp; } /* PX_FORCE_INLINE __m128 DotV(const __m128 a, const __m128 b) { const __m128 dot1 = 
_mm_mul_ps(a, b); const __m128 shuf1 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(dot1), _MM_SHUFFLE(0,0,0,0))); const __m128 shuf2 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(dot1), _MM_SHUFFLE(1,1,1,1))); const __m128 shuf3 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(dot1), _MM_SHUFFLE(2,2,2,2))); return _mm_add_ps(_mm_add_ps(shuf1, shuf2), shuf3); }*/ // PT: hmmm, by construction, isn't the order always the same for all bucket pruners? // => maybe not because the bucket boxes are still around the merged aabbs, not around the bucket // Still we could do something here static /*PX_FORCE_INLINE*/ PxU32 sort(const BucketPrunerNode& parent, const PxVec3& rayDir) { const PxU32 totalCount = parent.mCounters[0]+parent.mCounters[1]+parent.mCounters[2]+parent.mCounters[3]+parent.mCounters[4]; if(totalCount(dp); const PxU32* values = PxUnionCast(dp); PxU32 value0 = values[0]; PxU32 value1 = values[1]; PxU32 value2 = values[2]; PxU32 value3 = values[3]; PxU32 value4 = values[4]; for(PxU32 j=0;j<5-1;j++) { if(value1(dp); // const PxU32 mask = ~7U; const PxU32 mask = 0x7ffffff8; PxU32 value0 = (values[0]&mask); PxU32 value1 = (values[1]&mask)|1; PxU32 value2 = (values[2]&mask)|2; PxU32 value3 = (values[3]&mask)|3; PxU32 value4 = (values[4]&mask)|4; #define SORT_BLOCK \ if(value1(PX_ALLOC(nb*sizeof(size_t), "")); for(PxU32 i=0;iLOCAL_SIZE) { tempObjects = reinterpret_cast(PX_ALLOC(sizeof(PrunerPayload)*nb, "BucketPruner")); tempBoxes = reinterpret_cast(PX_ALLOC(nb*sizeof(BucketBox), "BucketPruner")); } else { tempObjects = localTempObjects; tempBoxes = localTempBoxes; } mSortAxis = sortBoxes(nb, mCoreBoxes, mCoreObjects, mGlobalBox, tempBoxes, tempObjects); PX_ASSERT(mSortAxis); allocateSortedMemory(nb); BucketBox* sortedBoxes = mSortedWorldBoxes; PrunerPayload* sortedObjects = mSortedObjects; const PxU32 yz = PxU32(mSortAxis == 1 ? 
2 : 1); const float limitX = mGlobalBox.mCenter.x; const float limitYZ = mGlobalBox.mCenter[yz]; mLevel1.classifyBoxes(limitX, limitYZ, nb, tempBoxes, tempObjects, sortedBoxes, sortedObjects, false, mSortAxis); processChildBuckets(nb, tempBoxes, tempObjects, mLevel1, mLevel2, mSortedWorldBoxes, mSortedObjects, mSortAxis); for(PxU32 j=0;j<5;j++) processChildBuckets(nb, tempBoxes, tempObjects, mLevel2[j], mLevel3[j], mSortedWorldBoxes + mLevel1.mOffsets[j], mSortedObjects + mLevel1.mOffsets[j], mSortAxis); { for(PxU32 i=0;iLOCAL_SIZE) { PX_FREE(tempBoxes); PX_FREE(tempObjects); } for(PxU32 i=0;i(&MaskI)), DataV); _mm_store_ps(&rayParams->mData.x, DataV); _mm_store_ps(&rayParams->mData2.x, Data2V); _mm_store_ps(&rayParams->mFDir.x, FDirV); #else const PxVec3 data = 0.5f * rayDir * maxDist; rayParams->mData = data; rayParams->mData2 = rayOrig + data; rayParams->mFDir.x = PxAbs(data.x); rayParams->mFDir.y = PxAbs(data.y); rayParams->mFDir.z = PxAbs(data.z); #endif } template static PX_FORCE_INLINE IntBool _segmentAABB(const BucketBox& box, const RayParams* PX_RESTRICT params) { #ifdef USE_SIMD const PxU32 maskI = 0x7fffffff; const __m128 fdirV = _mm_load_ps(¶ms->mFDir.x); // #ifdef _DEBUG const __m128 extentsV = inflateT ? _mm_add_ps(_mm_loadu_ps(&box.mExtents.x), _mm_load_ps(¶ms->mInflate.x)) : _mm_loadu_ps(&box.mExtents.x); const __m128 DV = _mm_sub_ps(_mm_load_ps(¶ms->mData2.x), _mm_loadu_ps(&box.mCenter.x)); /* #else const __m128 extentsV = inflateT ? 
_mm_add_ps(_mm_load_ps(&box.mExtents.x), _mm_load_ps(¶ms->mInflate.x)) : _mm_load_ps(&box.mExtents.x); const __m128 DV = _mm_sub_ps(_mm_load_ps(¶ms->mData2.x), _mm_load_ps(&box.mCenter.x)); #endif*/ __m128 absDV = _mm_and_ps(DV, _mm_load1_ps(reinterpret_cast(&maskI))); absDV = _mm_cmpgt_ps(absDV, _mm_add_ps(extentsV, fdirV)); const PxU32 test = PxU32(_mm_movemask_ps(absDV)); if(test&7) return 0; const __m128 dataZYX_V = _mm_load_ps(¶ms->mData.x); const __m128 dataXZY_V = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(dataZYX_V), _MM_SHUFFLE(3,0,2,1))); const __m128 DXZY_V = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(DV), _MM_SHUFFLE(3,0,2,1))); const __m128 fV = _mm_sub_ps(_mm_mul_ps(dataZYX_V, DXZY_V), _mm_mul_ps(dataXZY_V, DV)); const __m128 fdirZYX_V = _mm_load_ps(¶ms->mFDir.x); const __m128 fdirXZY_V = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(fdirZYX_V), _MM_SHUFFLE(3,0,2,1))); const __m128 extentsXZY_V = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(extentsV), _MM_SHUFFLE(3,0,2,1))); const __m128 fg = _mm_add_ps(_mm_mul_ps(extentsV, fdirXZY_V), _mm_mul_ps(extentsXZY_V, fdirZYX_V)); __m128 absfV = _mm_and_ps(fV, _mm_load1_ps(reinterpret_cast(&maskI))); absfV = _mm_cmpgt_ps(absfV, fg); const PxU32 test2 = PxU32(_mm_movemask_ps(absfV)); if(test2&7) return 0; return 1; #else const float boxExtentsx = inflateT ? box.mExtents.x + params->mInflate.x : box.mExtents.x; const float Dx = params->mData2.x - box.mCenter.x; if(fabsf(Dx) > boxExtentsx + params->mFDir.x) return IntFalse; const float boxExtentsz = inflateT ? box.mExtents.z + params->mInflate.z : box.mExtents.z; const float Dz = params->mData2.z - box.mCenter.z; if(fabsf(Dz) > boxExtentsz + params->mFDir.z) return IntFalse; const float boxExtentsy = inflateT ? 
box.mExtents.y + params->mInflate.y : box.mExtents.y; const float Dy = params->mData2.y - box.mCenter.y; if(fabsf(Dy) > boxExtentsy + params->mFDir.y) return IntFalse; float f; f = params->mData.y * Dz - params->mData.z * Dy; if(fabsf(f) > boxExtentsy*params->mFDir.z + boxExtentsz*params->mFDir.y) return IntFalse; f = params->mData.z * Dx - params->mData.x * Dz; if(fabsf(f) > boxExtentsx*params->mFDir.z + boxExtentsz*params->mFDir.x) return IntFalse; f = params->mData.x * Dy - params->mData.y * Dx; if(fabsf(f) > boxExtentsx*params->mFDir.y + boxExtentsy*params->mFDir.x) return IntFalse; return IntTrue; #endif } #else #include "SqPrunerTestsSIMD.h" typedef RayAABBTest BPRayAABBTest; template static PX_FORCE_INLINE IntBool _segmentAABB(const BucketBox& box, const BPRayAABBTest& test) { return static_cast(test.check(V3LoadU(box.mCenter), V3LoadU(box.mExtents))); } /*static PX_FORCE_INLINE IntBool _segmentAABB(const BucketBox& box, const BPRayAABBTest& test, PxU32 rayMinLimitX, PxU32 rayMaxLimitX) { if(rayMinLimitX>box.mData1) return 0; if(rayMaxLimitX static PxAgain processBucket( PxU32 nb, const BucketBox* PX_RESTRICT baseBoxes, PrunerPayload* PX_RESTRICT baseObjects, PxU32 offset, PxU32 totalAllocated, const PxVec3& rayOrig, const PxVec3& rayDir, float& maxDist, #ifdef CAN_USE_MOVEMASK RayParams* PX_RESTRICT rayParams, #else BPRayAABBTest& test, const PxVec3& inflate, #endif PrunerCallback& pcb, PxU32& _rayMinLimitInt, PxU32& _rayMaxLimitInt, PxU32 sortAxis) { PX_UNUSED(totalAllocated); const BucketBox* PX_RESTRICT _boxes = baseBoxes + offset; PrunerPayload* PX_RESTRICT _objects = baseObjects + offset; PxU32 rayMinLimitInt = _rayMinLimitInt; PxU32 rayMaxLimitInt = _rayMaxLimitInt; const BucketBox* last = _boxes + nb; while(_boxes!=last) { const BucketBox& currentBox = *_boxes++; PrunerPayload* currentObject = _objects++; if(currentBox.mData1rayMaxLimitInt) goto Exit; #ifdef CAN_USE_MOVEMASK if(!_segmentAABB(currentBox, rayParams)) continue; #else 
if(!_segmentAABB(currentBox, test)) continue; #endif const float MaxDist = maxDist; const PxAgain again = pcb.invoke(maxDist, *currentObject); if(!again) return false; if(maxDist < MaxDist) { float rayMinLimit, rayMaxLimit; #ifdef CAN_USE_MOVEMASK if(inflateT) computeRayLimits(rayMinLimit, rayMaxLimit, rayOrig, rayDir, maxDist, rayParams->mInflate, sortAxis); else computeRayLimits(rayMinLimit, rayMaxLimit, rayOrig, rayDir, maxDist, sortAxis); precomputeRayData(rayParams, rayOrig, rayDir, maxDist); #else if(inflateT) computeRayLimits(rayMinLimit, rayMaxLimit, rayOrig, rayDir, maxDist, inflate, sortAxis); else computeRayLimits(rayMinLimit, rayMaxLimit, rayOrig, rayDir, maxDist, sortAxis); test.setDistance(maxDist); #endif const PxU32* binaryMinLimit = reinterpret_cast(&rayMinLimit); const PxU32* binaryMaxLimit = reinterpret_cast(&rayMaxLimit); rayMinLimitInt = encodeFloat(binaryMinLimit[0]); rayMaxLimitInt = encodeFloat(binaryMaxLimit[0]); } } Exit: _rayMinLimitInt = rayMinLimitInt; _rayMaxLimitInt = rayMaxLimitInt; return true; } #ifdef NODE_SORT static PxU32 computeDirMask(const PxVec3& dir) { const PxU32* binary = reinterpret_cast(&dir.x); const PxU32 X = (binary[0])>>31; const PxU32 Y = (binary[1])>>31; const PxU32 Z = (binary[2])>>31; return Z|(Y<<1)|(X<<2); } #endif template static PxAgain stab(const BucketPrunerCore& core, PrunerCallback& pcb, const PxVec3& rayOrig, const PxVec3& rayDir, float& maxDist, const PxVec3 inflate) { const PxU32 nb = core.mSortedNb; if(!nb && !core.mNbFree) return true; if(maxDist==PX_MAX_F32) { /*const*/ PxVec3 boxMin = core.mGlobalBox.getMin() - inflate; /*const*/ PxVec3 boxMax = core.mGlobalBox.getMax() + inflate; if(core.mNbFree) { // TODO: optimize this PxBounds3 freeGlobalBounds; freeGlobalBounds.setEmpty(); for(PxU32 i=0;i(tmp, &rayParams)) #else if(_segmentAABB(tmp, test)) #endif { if(!pcb.invoke(maxDist, core.mFreeObjects[i])) return false; } } if(!nb) return true; #ifdef CAN_USE_MOVEMASK if(!_segmentAABB(core.mGlobalBox, 
&rayParams)) return true; #else if(!_segmentAABB(core.mGlobalBox, test)) return true; #endif const PxU32 sortAxis = core.mSortAxis; float rayMinLimit, rayMaxLimit; if(inflateT) computeRayLimits(rayMinLimit, rayMaxLimit, rayOrig, rayDir, maxDist, inflate, sortAxis); else computeRayLimits(rayMinLimit, rayMaxLimit, rayOrig, rayDir, maxDist, sortAxis); const PxU32* binaryMinLimit = reinterpret_cast(&rayMinLimit); const PxU32* binaryMaxLimit = reinterpret_cast(&rayMaxLimit); PxU32 rayMinLimitInt = encodeFloat(binaryMinLimit[0]); PxU32 rayMaxLimitInt = encodeFloat(binaryMaxLimit[0]); /* float rayMinLimitX, rayMaxLimitX; if(inflateT) computeRayLimits(rayMinLimitX, rayMaxLimitX, rayOrig, rayDir, maxDist, inflate, 0); else computeRayLimits(rayMinLimitX, rayMaxLimitX, rayOrig, rayDir, maxDist, 0); PxU32 rayMinLimitIntX = encodeFloat(PX_IR(rayMinLimitX)); PxU32 rayMaxLimitIntX = encodeFloat(PX_IR(rayMaxLimitX)); */ float currentDist = maxDist; #ifdef NODE_SORT const PxU32 dirIndex = computeDirMask(rayDir); PxU32 orderi = core.mLevel1.mOrder[dirIndex]; // PxU32 orderi = sort(core.mLevel1, rayDir); for(PxU32 i_=0;i_<5;i_++) { const PxU32 i = orderi&7; orderi>>=3; #else for(PxU32 i=0;i<5;i++) { #endif #ifdef CAN_USE_MOVEMASK if(core.mLevel1.mCounters[i] && _segmentAABB(core.mLevel1.mBucketBox[i], &rayParams)) #else if(core.mLevel1.mCounters[i] && _segmentAABB(core.mLevel1.mBucketBox[i], test)) // if(core.mLevel1.mCounters[i] && _segmentAABB(core.mLevel1.mBucketBox[i], test, rayMinLimitIntX, rayMaxLimitIntX)) #endif { #ifdef NODE_SORT PxU32 orderj = core.mLevel2[i].mOrder[dirIndex]; // PxU32 orderj = sort(core.mLevel2[i], rayDir); for(PxU32 j_=0;j_<5;j_++) { const PxU32 j = orderj&7; orderj>>=3; #else for(PxU32 j=0;j<5;j++) { #endif #ifdef CAN_USE_MOVEMASK if(core.mLevel2[i].mCounters[j] && _segmentAABB(core.mLevel2[i].mBucketBox[j], &rayParams)) #else if(core.mLevel2[i].mCounters[j] && _segmentAABB(core.mLevel2[i].mBucketBox[j], test)) // if(core.mLevel2[i].mCounters[j] && 
_segmentAABB(core.mLevel2[i].mBucketBox[j], test, rayMinLimitIntX, rayMaxLimitIntX)) #endif { const BucketPrunerNode& parent = core.mLevel3[i][j]; const PxU32 parentOffset = core.mLevel1.mOffsets[i] + core.mLevel2[i].mOffsets[j]; #ifdef NODE_SORT PxU32 orderk = parent.mOrder[dirIndex]; // PxU32 orderk = sort(parent, rayDir); for(PxU32 k_=0;k_<5;k_++) { const PxU32 k = orderk&7; orderk>>=3; #else for(PxU32 k=0;k<5;k++) { #endif const PxU32 nbInBucket = parent.mCounters[k]; #ifdef CAN_USE_MOVEMASK if(nbInBucket && _segmentAABB(parent.mBucketBox[k], &rayParams)) #else if(nbInBucket && _segmentAABB(parent.mBucketBox[k], test)) // if(nbInBucket && _segmentAABB(parent.mBucketBox[k], test, rayMinLimitIntX, rayMaxLimitIntX)) #endif { const PxU32 offset = parentOffset + parent.mOffsets[k]; const PxAgain again = processBucket( nbInBucket, core.mSortedWorldBoxes, core.mSortedObjects, offset, core.mSortedNb, rayOrig, rayDir, currentDist, #ifdef CAN_USE_MOVEMASK &rayParams, #else test, inflate, #endif pcb, rayMinLimitInt, rayMaxLimitInt, sortAxis); if(!again) return false; } } } } } } maxDist = currentDist; return true; } PxAgain BucketPrunerCore::raycast(const PxVec3& origin, const PxVec3& unitDir, PxReal& inOutDistance, PrunerCallback& pcb) const { return ::stab<0>(*this, pcb, origin, unitDir, inOutDistance, PxVec3(0.0f)); } PxAgain BucketPrunerCore::sweep(const ShapeData& queryVolume, const PxVec3& unitDir, PxReal& inOutDistance, PrunerCallback& pcb) const { const PxVec3 extents = queryVolume.getPrunerInflatedWorldAABB().getExtents(); return ::stab<1>(*this, pcb, queryVolume.getPrunerInflatedWorldAABB().getCenter(), unitDir, inOutDistance, extents); } /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// template static PX_FORCE_INLINE bool processBucket( PxU32 nb, const BucketBox* PX_RESTRICT baseBoxes, PrunerPayload* PX_RESTRICT 
											baseObjects,
											PxU32 offset, PxU32 totalAllocated,
											const Test& test, PrunerCallback& pcb,
											PxU32 minLimitInt, PxU32 maxLimitInt)
{
	PX_UNUSED(totalAllocated);
	// The bucket's boxes/payloads live in one contiguous slice of the sorted arrays.
	const BucketBox* PX_RESTRICT boxes = baseBoxes + offset;
	PrunerPayload* PX_RESTRICT objects = baseObjects + offset;
	while(nb--)
	{
		const BucketBox& currentBox = *boxes++;
		PrunerPayload* currentObject = objects++;
		// NOTE(review): garbled by the text mangling — a span between '<' and '>' was deleted.
		// This was most likely the sort-axis culling test, e.g.
		// "if(currentBox.mData1<minLimitInt || currentBox.mData0>maxLimitInt)". Restore from
		// the original file before compiling; reproduced verbatim:
		if(currentBox.mData1maxLimitInt)
		{
			if(doAssert)
				PX_ASSERT(!test(currentBox));
			return true;
		}
		if(test(currentBox))
		{
			PxReal dist = -1.0f; // no distance for overlaps
			// Callback returning false aborts the whole query.
			if(!pcb.invoke(dist, *currentObject))
				return false;
		}
	}
	return true;
}

// Generic overlap traversal over the 5x5x5 bucket hierarchy, parameterized by the
// volume-vs-AABB test functor.
// NOTE(review): template parameter list lost to the text mangling — presumably <class Test>
// (possibly with additional non-type parameters); confirm against the original file.
template
class BucketPrunerOverlapTraversal
{
public:
	PX_FORCE_INLINE BucketPrunerOverlapTraversal() {}

	// Returns false if the query was aborted by the callback, true otherwise.
	/*PX_FORCE_INLINE*/ bool operator()(const BucketPrunerCore& core, const Test& test, PrunerCallback& pcb, const PxBounds3& cullBox) const
	{
		// NOTE(review): a large span was deleted here by the text mangling. It apparently fused
		// what was probably a loop over the "free" (not yet classified) objects with the
		// declaration of binaryMinLimit and its reinterpret_cast<const PxU32*> — note that
		// binaryMinLimit is used below but never declared in the surviving text. Restore from
		// the original file; reproduced verbatim:
		for(PxU32 i=0;i(&boxMinLimit);
		// NOTE(review): cast target type lost — presumably reinterpret_cast<const PxU32*>.
		const PxU32* binaryMaxLimit = reinterpret_cast(&boxMaxLimit);
		// Encode the cull-box limits along the sort axis the same way the sorted boxes'
		// mData0/mData1 were encoded, so processBucket can early-out on integer compares.
		const PxU32 rayMinLimitInt = encodeFloat(binaryMinLimit[0]);
		const PxU32 rayMaxLimitInt = encodeFloat(binaryMaxLimit[0]);

		// Level 1 -> level 2 -> level 3: descend only into buckets that contain objects and
		// whose bounds pass the volume test.
		for(PxU32 i=0;i<5;i++)
		{
			if(core.mLevel1.mCounters[i] && test(core.mLevel1.mBucketBox[i]))
			{
				for(PxU32 j=0;j<5;j++)
				{
					if(core.mLevel2[i].mCounters[j] && test(core.mLevel2[i].mBucketBox[j]))
					{
						for(PxU32 k=0;k<5;k++)
						{
							const PxU32 nbInBucket = core.mLevel3[i][j].mCounters[k];
							if(nbInBucket && test(core.mLevel3[i][j].mBucketBox[k]))
							{
								const PxU32 offset = core.mLevel1.mOffsets[i] + core.mLevel2[i].mOffsets[j] + core.mLevel3[i][j].mOffsets[k];
								if(!processBucket(nbInBucket, core.mSortedWorldBoxes, core.mSortedObjects, offset, core.mSortedNb, test, pcb, rayMinLimitInt, rayMaxLimitInt))
									return false;
							}
						}
					}
				}
			}
		}
		return true;
	}
};

///////////////////////////////////////////////////////////////////////////////

#ifdef CAN_USE_MOVEMASK
// True iff the lower three lanes (x,y,z) of the SIMD boolean are all set; the w lane is ignored.
PX_FORCE_INLINE PxU32 BAllTrue3_R(const BoolV a)
{
	const PxI32 moveMask = _mm_movemask_ps(a);
	return PxU32((moveMask & 0x7) == (0x7));
}
#endif

#ifdef USE_SIMD
// SIMD sphere-vs-AABB overlap test (squared-distance based, Arvo-style).
struct SphereAABBTest_SIMD
{
	// Precompute the sphere center and squared radius once per query.
	PX_FORCE_INLINE SphereAABBTest_SIMD(const Gu::Sphere& sphere) :
#ifdef CAN_USE_MOVEMASK
		mCenter	(V4LoadU(&sphere.center.x)),
#else
		mCenter	(V3LoadU(sphere.center)),
#endif
		mRadius2(FLoad(sphere.radius * sphere.radius))
	{}

	// Overlap test: clamp the center-to-box offset to the box extents to find the closest
	// point, then compare the squared distance against the squared radius.
	PX_FORCE_INLINE Ps::IntBool operator()(const BucketBox& box) const
	{
#ifdef CAN_USE_MOVEMASK
		const Vec4V boxCenter = AlignedLoad(&box.mCenter.x);
		const Vec4V boxExtents = AlignedLoad(&box.mExtents.x);
		// NOTE(review): a stray "//" preceded this statement in the mangled single-line text;
		// in the original it must have ended at a (now lost) line break — the statement has to
		// be live since 'offset' is used below.
		const Vec4V offset = V4Sub(mCenter, boxCenter);
		const Vec4V closest = V4Clamp(offset, V4Neg(boxExtents), boxExtents);
		const Vec4V d = V4Sub(offset, closest);
		const FloatV dot = V4Dot3(d,d);
		return Ps::IntBool(BAllTrue3_R(FIsGrtrOrEq(mRadius2, dot)));
#else
		const Vec3V boxCenter = V3LoadU(box.mCenter);
		const Vec3V boxExtents = V3LoadU(box.mExtents);
		// NOTE(review): same stray "//" as above in the mangled text; statement kept live.
		const Vec3V offset = V3Sub(mCenter, boxCenter);
		const Vec3V closest = V3Clamp(offset, V3Neg(boxExtents), boxExtents);
		const Vec3V d = V3Sub(offset, closest);
		return Ps::IntBool(BAllEqTTTT(FIsGrtrOrEq(mRadius2, V3Dot(d, d))));
#endif
	}

	// Convenience overload for plain PxBounds3 (converted to center/extents form).
	PX_FORCE_INLINE Ps::IntBool operator()(const PxBounds3& bounds) const
	{
		BucketBox tmp;
		tmp.mCenter = bounds.getCenter();
		tmp.mExtents = bounds.getExtents();
		return (*this)(tmp);
	}

private:
	SphereAABBTest_SIMD& operator=(const SphereAABBTest_SIMD&);
#ifdef CAN_USE_MOVEMASK
	const Vec4V		mCenter;	// sphere center (w lane unused)
#else
	const Vec3V		mCenter;	// sphere center
#endif
	const FloatV	mRadius2;	// squared sphere radius
};
#else
// Scalar sphere-vs-AABB overlap test, used when SIMD is disabled.
struct SphereAABBTest_Scalar
{
	PX_FORCE_INLINE SphereAABBTest_Scalar(const Gu::Sphere& sphere) :
		mCenter	(sphere.center),
		mRadius2(sphere.radius * sphere.radius)
	{}

	PX_FORCE_INLINE Ps::IntBool operator()(const BucketBox& box) const
	{
		const PxVec3 minimum = box.getMin();
		const PxVec3 maximum = box.getMax();

		float d = 0.0f;

		//find the square of the distance
		//from the sphere to the box
		for(PxU32 i=0;i<3;i++)
		{
			// NOTE(review): garbled by the text mangling — a span between '<' and '>' was
			// deleted. The original is almost certainly the classic per-axis distance test:
			//   if(mCenter[i]<minimum[i]) { const float s = mCenter[i]-minimum[i]; d += s*s; }
			//   else if(mCenter[i]>maximum[i]) { ... }
			// Restore from the original file; reproduced verbatim:
			if(mCenter[i]maximum[i])
			{
				const float s = mCenter[i] - maximum[i];
				d += s*s;
			}
		}
		return d <= mRadius2;
	}

private:
	SphereAABBTest_Scalar& operator=(const SphereAABBTest_Scalar&);
	const PxVec3	mCenter;	// sphere center
	float
					mRadius2;	// squared sphere radius
};
#endif

#ifdef USE_SIMD
typedef SphereAABBTest_SIMD		BucketPrunerSphereAABBTest;
#else
typedef SphereAABBTest_Scalar	BucketPrunerSphereAABBTest;
#endif

///////////////////////////////////////////////////////////////////////////////

// AABB-vs-AABB overlap test used by the overlap traversal for axis-aligned query boxes.
struct BucketPrunerAABBAABBTest
{
	PX_FORCE_INLINE BucketPrunerAABBAABBTest(const PxBounds3& queryBox) : mBox(queryBox)	{}

	PX_FORCE_INLINE Ps::IntBool operator()(const BucketBox& box) const
	{
		// PT: we don't use PxBounds3::intersects() because isValid() asserts on our empty boxes!
		const PxVec3 bucketMin = box.getMin();
		const PxVec3 bucketMax = box.getMax();
		return !(mBox.minimum.x > bucketMax.x || bucketMin.x > mBox.maximum.x ||
				 mBox.minimum.y > bucketMax.y || bucketMin.y > mBox.maximum.y ||
				 mBox.minimum.z > bucketMax.z || bucketMin.z > mBox.maximum.z);
	}

	PX_FORCE_INLINE Ps::IntBool operator()(const PxBounds3& bounds) const
	{
		// PT: we don't use PxBounds3::intersects() because isValid() asserts on our empty boxes!
		const PxVec3& bucketMin = bounds.minimum;
		const PxVec3& bucketMax = bounds.maximum;
		return !(mBox.minimum.x > bucketMax.x || bucketMin.x > mBox.maximum.x ||
				 mBox.minimum.y > bucketMax.y || bucketMin.y > mBox.maximum.y ||
				 mBox.minimum.z > bucketMax.z || bucketMin.z > mBox.maximum.z);
	}

private:
	BucketPrunerAABBAABBTest& operator=(const BucketPrunerAABBAABBTest&);
	const PxBounds3	mBox;	// query box, world space
};

// Historical SIMD variant, kept disabled for reference.
/*struct BucketPrunerAABBAABBTest_SIMD
{
	PX_FORCE_INLINE BucketPrunerAABBAABBTest_SIMD(const PxBounds3& b)
	: mCenter(V3LoadU(b.getCenter()))
	, mExtents(V3LoadU(b.getExtents()))
	{}

	PX_FORCE_INLINE Ps::IntBool operator()(const BucketBox& box) const
	{
		return V3AllGrtrOrEq(V3Add(mExtents, AlignedLoad(&box.mExtents.x)), V3Abs(V3Sub(AlignedLoad(&box.mCenter.x), mCenter)));
	}
private:
	BucketPrunerAABBAABBTest_SIMD& operator=(const BucketPrunerAABBAABBTest_SIMD&);
	const Vec3V mCenter, mExtents;
};*/

///////////////////////////////////////////////////////////////////////////////

#ifdef USE_SIMD
// SIMD OBB-vs-AABB separating-axis test; only class I (AABB axes) and class II (OBB axes)
// are active — the 9 edge-cross axes are compiled out (see the disabled code below).
struct OBBAABBTest_SIMD
{
	OBBAABBTest_SIMD(const
					 PxMat33& rotation, const PxVec3& translation, const PxVec3& extents)
	{
		const Vec3V eps = V3Load(1e-6f);
		mT = V3LoadU(translation);
		mExtents = V3LoadU(extents);

		// storing the transpose matrices yields a simpler SIMD test
		mRT = Mat33V_From_PxMat33(rotation.getTranspose());
		// Absolute rotation padded by epsilon, guarding against numerical issues when the OBB
		// is nearly axis-aligned.
		mART = Mat33V(V3Add(V3Abs(mRT.col0), eps), V3Add(V3Abs(mRT.col1), eps), V3Add(V3Abs(mRT.col2), eps));
		// Projection of the OBB extents onto the coordinate axes.
		mBB_xyz = M33TrnspsMulV3(mART, mExtents);

/*		if(fullTest)
		{
			const Vec3V eYZX = V3PermYZX(mExtents), eZXY = V3PermZXY(mExtents);
			mBB_123 = V3MulAdd(eYZX, V3PermZXY(mART.col0), V3Mul(eZXY, V3PermYZX(mART.col0)));
			mBB_456 = V3MulAdd(eYZX, V3PermZXY(mART.col1), V3Mul(eZXY, V3PermYZX(mART.col1)));
			mBB_789 = V3MulAdd(eYZX, V3PermZXY(mART.col2), V3Mul(eZXY, V3PermYZX(mART.col2)));
		}*/
	}

	// Separating-axis overlap test against an axis-aligned BucketBox.
	PX_FORCE_INLINE Ps::IntBool operator()(const BucketBox& box) const
	{
		const Vec3V extentsV = V3LoadU(box.mExtents);
		const Vec3V t = V3Sub(mT, V3LoadU(box.mCenter));

		// class I - axes of AABB
		if(V3OutOfBounds(t, V3Add(extentsV, mBB_xyz)))
			return Ps::IntFalse;

		const Vec3V rX = mRT.col0, rY = mRT.col1, rZ = mRT.col2;
		const Vec3V arX = mART.col0, arY = mART.col1, arZ = mART.col2;

		const FloatV eX = V3GetX(extentsV), eY = V3GetY(extentsV), eZ = V3GetZ(extentsV);
		const FloatV tX = V3GetX(t), tY = V3GetY(t), tZ = V3GetZ(t);

		// class II - axes of OBB
		{
			const Vec3V v = V3ScaleAdd(rZ, tZ, V3ScaleAdd(rY, tY, V3Scale(rX, tX)));
			const Vec3V v2 = V3ScaleAdd(arZ, eZ, V3ScaleAdd(arY, eY, V3ScaleAdd(arX, eX, mExtents)));
			if(V3OutOfBounds(v, v2))
				return Ps::IntFalse;
		}

		// NOTE(review): in the mangled single-line text the "//" below would swallow the
		// return; the only consistent reading is that the comment covered just the
		// "if(!fullTest)" and the return statement is live.
//		if(!fullTest)
			return Ps::IntTrue;

/*		// class III - edge cross products. Almost all OBB tests early-out with type I or type II,
		// so early-outs here probably aren't useful (TODO: profile)
		const Vec3V va = V3NegScaleSub(rZ, tY, V3Scale(rY, tZ));
		const Vec3V va2 = V3ScaleAdd(arY, eZ, V3ScaleAdd(arZ, eY, mBB_123));
		const BoolV ba = BOr(V3IsGrtr(va, va2), V3IsGrtr(V3Neg(va2), va));

		const Vec3V vb = V3NegScaleSub(rX, tZ, V3Scale(rZ, tX));
		const Vec3V vb2 = V3ScaleAdd(arX, eZ, V3ScaleAdd(arZ, eX, mBB_456));
		const BoolV bb = BOr(V3IsGrtr(vb, vb2), V3IsGrtr(V3Neg(vb2), vb));

		const Vec3V vc = V3NegScaleSub(rY, tX, V3Scale(rX, tY));
		const Vec3V vc2 = V3ScaleAdd(arX, eY, V3ScaleAdd(arY, eX, mBB_789));
		const BoolV bc = BOr(V3IsGrtr(vc, vc2), V3IsGrtr(V3Neg(vc2), vc));

		return BAllEq(BOr(ba, BOr(bb,bc)), BFFFF());*/
	}

	// Convenience overload for plain PxBounds3.
	PX_FORCE_INLINE Ps::IntBool operator()(const PxBounds3& bounds) const
	{
		BucketBox tmp;
		tmp.mCenter = bounds.getCenter();
		tmp.mExtents = bounds.getExtents();
		return (*this)(tmp);
	}

	Vec3V	mExtents;	// extents of OBB
	Vec3V	mT;			// translation of OBB
	Mat33V	mRT;		// transpose of rotation matrix of OBB
	Mat33V	mART;		// transpose of mRT, padded by epsilon
	Vec3V	mBB_xyz;	// extents of OBB along coordinate axes
/*	Vec3V	mBB_123;	// projections of extents onto edge-cross axes
	Vec3V	mBB_456;
	Vec3V	mBB_789;*/
};
#else
// Scalar OBB-vs-AABB separating-axis test, used when SIMD is disabled.
struct OBBAABBTest_Scalar
{
	OBBAABBTest_Scalar(const PxMat33& rotation, const PxVec3& translation, const PxVec3& extents)
	{
		mR = rotation;
		mT = translation;
		mExtents = extents;
		const PxVec3 eps(1e-6f);
		mAR = PxMat33(mR[0].abs() + eps, mR[1].abs() + eps, mR[2].abs() + eps);	// Epsilon prevents floating-point inaccuracies (strategy borrowed from RAPID)
		mBB_xyz = mAR.transform(mExtents);

		// Precompute box-box data - Courtesy of Erwin de Vries
/*		PxReal ex = mExtents.x, ey = mExtents.y, ez = mExtents.z;
		mBB_1 = ey*mAR[2].x + ez*mAR[1].x;
		mBB_2 = ez*mAR[0].x + ex*mAR[2].x;
		mBB_3 = ex*mAR[1].x + ey*mAR[0].x;
		mBB_4 = ey*mAR[2].y + ez*mAR[1].y;
		mBB_5 = ez*mAR[0].y + ex*mAR[2].y;
		mBB_6 = ex*mAR[1].y + ey*mAR[0].y;
		mBB_7 = ey*mAR[2].z +
					ez*mAR[1].z;
		mBB_8 = ez*mAR[0].z + ex*mAR[2].z;
		mBB_9 = ex*mAR[1].z + ey*mAR[0].z;*/
	}

	// Separating-axis overlap test. Class III (edge cross products) is disabled via if(0),
	// mirroring the SIMD version above.
	PX_FORCE_INLINE Ps::IntBool operator()(const BucketBox& box) const
	{
		const PxVec3& c = box.mCenter;
		const PxVec3& e = box.mExtents;

		const PxVec3 T = mT - c;

		// Class I : A's basis vectors
		if(PxAbs(T.x) > e.x + mBB_xyz.x)	return Ps::IntFalse;
		if(PxAbs(T.y) > e.y + mBB_xyz.y)	return Ps::IntFalse;
		if(PxAbs(T.z) > e.z + mBB_xyz.z)	return Ps::IntFalse;

		// Class II : B's basis vectors
		if(PxAbs(T.dot(mR[0])) > e.dot(mAR[0]) + mExtents.x)	return Ps::IntFalse;
		if(PxAbs(T.dot(mR[1])) > e.dot(mAR[1]) + mExtents.y)	return Ps::IntFalse;
		if(PxAbs(T.dot(mR[2])) > e.dot(mAR[2]) + mExtents.z)	return Ps::IntFalse;

		// Class III : 9 cross products (disabled; kept for reference)
		if(0)
		{
			if(PxAbs(T.z*mR[0].y - T.y*mR[0].z) > e.y*mAR[0].z + e.z*mAR[0].y + mBB_1)	return Ps::IntFalse;	// L = A0 x B0
			if(PxAbs(T.z*mR[1].y - T.y*mR[1].z) > e.y*mAR[1].z + e.z*mAR[1].y + mBB_2)	return Ps::IntFalse;	// L = A0 x B1
			if(PxAbs(T.z*mR[2].y - T.y*mR[2].z) > e.y*mAR[2].z + e.z*mAR[2].y + mBB_3)	return Ps::IntFalse;	// L = A0 x B2
			if(PxAbs(T.x*mR[0].z - T.z*mR[0].x) > e.x*mAR[0].z + e.z*mAR[0].x + mBB_4)	return Ps::IntFalse;	// L = A1 x B0
			if(PxAbs(T.x*mR[1].z - T.z*mR[1].x) > e.x*mAR[1].z + e.z*mAR[1].x + mBB_5)	return Ps::IntFalse;	// L = A1 x B1
			if(PxAbs(T.x*mR[2].z - T.z*mR[2].x) > e.x*mAR[2].z + e.z*mAR[2].x + mBB_6)	return Ps::IntFalse;	// L = A1 x B2
			if(PxAbs(T.y*mR[0].x - T.x*mR[0].y) > e.x*mAR[0].y + e.y*mAR[0].x + mBB_7)	return Ps::IntFalse;	// L = A2 x B0
			if(PxAbs(T.y*mR[1].x - T.x*mR[1].y) > e.x*mAR[1].y + e.y*mAR[1].x + mBB_8)	return Ps::IntFalse;	// L = A2 x B1
			if(PxAbs(T.y*mR[2].x - T.x*mR[2].y) > e.x*mAR[2].y + e.y*mAR[2].x + mBB_9)	return Ps::IntFalse;	// L = A2 x B2
		}
		return Ps::IntTrue;
	}

private:
	PxMat33	mR;					// rotation matrix
	PxMat33	mAR;				// absolute rotation matrix
	PxVec3	mT;					// translation from obb space to model space
	PxVec3	mExtents;
	PxVec3	mBB_xyz;			// extents of OBB projected onto AABB axes
	float	mBB_1, mBB_2, mBB_3;	// edge-cross-axis projections (used only by disabled class III)
	float	mBB_4, mBB_5, mBB_6;
	float	mBB_7, mBB_8, mBB_9;
};
#endif

#ifdef USE_SIMD
typedef OBBAABBTest_SIMD	BucketPrunerOBBAABBTest;
#else
typedef OBBAABBTest_Scalar	BucketPrunerOBBAABBTest;
#endif

///////////////////////////////////////////////////////////////////////////////

// Overlap query entry point: selects the appropriate volume-vs-AABB test from the query
// geometry type and runs the bucket-hierarchy traversal. Returns false (no "again") when the
// callback aborted the query.
PxAgain BucketPrunerCore::overlap(const ShapeData& queryVolume, PrunerCallback& pcb) const
{
	PX_ASSERT(!mDirty);	// the structure must have been built/sorted before querying

	PxAgain again = true;
	const PxBounds3& cullBox = queryVolume.getPrunerInflatedWorldAABB();
	switch(queryVolume.getType())
	{
		case PxGeometryType::eBOX:
		{
			if(queryVolume.isOBB())
			{
				// NOTE(review): the template argument lists on BucketPrunerOverlapTraversal
				// were lost to the text mangling throughout this function — presumably e.g.
				// BucketPrunerOverlapTraversal<BucketPrunerOBBAABBTest>. Restore from the
				// original file.
				const BucketPrunerOverlapTraversal overlap;
				again = overlap(*this, BucketPrunerOBBAABBTest(	queryVolume.getPrunerWorldRot33(), queryVolume.getPrunerWorldPos(),
																queryVolume.getPrunerBoxGeomExtentsInflated()), pcb, cullBox);
			}
			else
			{
				const BucketPrunerOverlapTraversal overlap;
				again = overlap(*this, BucketPrunerAABBAABBTest(cullBox), pcb, cullBox);
			}
		}
		break;

		case PxGeometryType::eCAPSULE:
		{
			// Capsules are conservatively tested via the OBB around the capsule.
			const BucketPrunerOverlapTraversal overlap;
			again = overlap(*this, BucketPrunerOBBAABBTest(	queryVolume.getPrunerWorldRot33(), queryVolume.getPrunerWorldPos(),
															queryVolume.getPrunerBoxGeomExtentsInflated()), pcb, cullBox);
		}
		break;

		case PxGeometryType::eSPHERE:
		{
			const Sphere& sphere = queryVolume.getGuSphere();
			// NOTE(review): sphereExtents is unused in this (mangled) text — it may have been
			// used to build the cull box in the original; confirm.
			const PxVec3 sphereExtents(sphere.radius);
			const BucketPrunerOverlapTraversal overlap;
			again = overlap(*this, BucketPrunerSphereAABBTest(sphere), pcb, cullBox);
		}
		break;

		case PxGeometryType::eCONVEXMESH:
		{
			// Convexes are conservatively tested via their world-space OBB.
			const BucketPrunerOverlapTraversal overlap;
			again = overlap(*this, BucketPrunerOBBAABBTest(	queryVolume.getPrunerWorldRot33(), queryVolume.getPrunerWorldPos(),
															queryVolume.getPrunerBoxGeomExtentsInflated()), pcb, cullBox);
		}
		break;

		case PxGeometryType::ePLANE:
		case PxGeometryType::eTRIANGLEMESH:
		case PxGeometryType::eHEIGHTFIELD:
		case PxGeometryType::eGEOMETRY_COUNT:
		case PxGeometryType::eINVALID:
			PX_ALWAYS_ASSERT_MESSAGE("unsupported overlap query volume geometry type");
	}
	return again;
}
/////////////////////////////////////////////////////////////////////////////// void BucketPrunerCore::shiftOrigin(const PxVec3& shift) { for(PxU32 i=0;i the pair is persistent return &activePairs[offset]; } // Internal version saving hash computation PX_FORCE_INLINE BucketPrunerPair* BucketPrunerMap::findPair(const PrunerPayload& payload, PxU32 hashValue) const { if(!mHashTable) return NULL; // Nothing has been allocated yet BucketPrunerPair* PX_RESTRICT activePairs = mActivePairs; const PxU32* PX_RESTRICT next = mNext; // Look for it in the table PxU32 offset = mHashTable[hashValue]; while(offset!=INVALID_ID && differentPair(activePairs[offset], payload)) { offset = next[offset]; // Better to have a separate array for this } if(offset==INVALID_ID) return NULL; PX_ASSERT(offset the pair is persistent return &activePairs[offset]; } /////////////////////////////////////////////////////////////////////////////// BucketPrunerPair* BucketPrunerMap::addPair(const PrunerPayload& payload, PxU32 coreIndex, PxU32 timeStamp) { PxU32 hashValue = hash(payload) & mMask; { BucketPrunerPair* PX_RESTRICT p = findPair(payload, hashValue); if(p) { PX_ASSERT(p->mCoreIndex==coreIndex); PX_ASSERT(p->mTimeStamp==timeStamp); return p; // Persistent pair } } // This is a new pair if(mNbActivePairs >= mHashSize) { // Get more entries mHashSize = Ps::nextPowerOfTwo(mNbActivePairs+1); mMask = mHashSize-1; reallocPairs(); // Recompute hash value with new hash size hashValue = hash(payload) & mMask; // ### redundant hash computation here? 
	}

	// Append the new pair to the dense array and link it at the head of its bucket's chain.
	BucketPrunerPair* PX_RESTRICT p = &mActivePairs[mNbActivePairs];
	p->mPayload		= payload;
	p->mCoreIndex	= coreIndex;
	p->mTimeStamp	= timeStamp;
	mNext[mNbActivePairs] = mHashTable[hashValue];
	mHashTable[hashValue] = mNbActivePairs++;
	return p;
}

///////////////////////////////////////////////////////////////////////////////

// Unlinks pairIndex from its hash chain, then fills the hole in mActivePairs by moving the
// last pair into the freed slot so the array stays dense.
void BucketPrunerMap::removePairInternal(const PrunerPayload& /*payload*/, PxU32 hashValue, PxU32 pairIndex)
{
	// Walk the hash table to fix mNext
	{
		PxU32 offset = mHashTable[hashValue];
		PX_ASSERT(offset!=INVALID_ID);

		PxU32 previous=INVALID_ID;
		while(offset!=pairIndex)
		{
			previous = offset;
			offset = mNext[offset];
		}

		// Let us go/jump us
		if(previous!=INVALID_ID)
		{
			PX_ASSERT(mNext[previous]==pairIndex);
			mNext[previous] = mNext[pairIndex];
		}
		// else we were the first
		else mHashTable[hashValue] = mNext[pairIndex];

		// we're now free to reuse mNext[pairIndex] without breaking the list
	}
#if PX_DEBUG
	mNext[pairIndex]=INVALID_ID;
#endif
	// Invalidate entry

	// Fill holes
	if(1)
	{
		// 1) Remove last pair
		const PxU32 lastPairIndex = mNbActivePairs-1;
		if(lastPairIndex==pairIndex)
		{
			mNbActivePairs--;
		}
		else
		{
			const BucketPrunerPair* last = &mActivePairs[lastPairIndex];
			const PxU32 lastHashValue = hash(last->mPayload) & mMask;

			// Walk the hash table to fix mNext
			PxU32 offset = mHashTable[lastHashValue];
			PX_ASSERT(offset!=INVALID_ID);

			PxU32 previous=INVALID_ID;
			while(offset!=lastPairIndex)
			{
				previous = offset;
				offset = mNext[offset];
			}

			// Let us go/jump us
			if(previous!=INVALID_ID)
			{
				PX_ASSERT(mNext[previous]==lastPairIndex);
				mNext[previous] = mNext[lastPairIndex];
			}
			// else we were the first
			else mHashTable[lastHashValue] = mNext[lastPairIndex];

			// we're now free to reuse mNext[lastPairIndex] without breaking the list
#if PX_DEBUG
			mNext[lastPairIndex]=INVALID_ID;
#endif
			// Don't invalidate entry since we're going to shrink the array

			// 2) Re-insert in free slot
			mActivePairs[pairIndex] = mActivePairs[lastPairIndex];
#if PX_DEBUG
			PX_ASSERT(mNext[pairIndex]==INVALID_ID);
#endif
			// Relink the moved pair's hash-chain entry to point at its new (dense) slot.
			mNext[pairIndex] = mHashTable[lastHashValue];
			mHashTable[lastHashValue] = pairIndex;

			mNbActivePairs--;
		}
	}
}

///////////////////////////////////////////////////////////////////////////////

// Removes the pair for 'payload', returning its coreIndex/timeStamp through the out-params.
// Returns false if the payload was not present in the map.
bool BucketPrunerMap::removePair(const PrunerPayload& payload, PxU32& coreIndex, PxU32& timeStamp)
{
	const PxU32 hashValue = hash(payload) & mMask;
	const BucketPrunerPair* p = findPair(payload, hashValue);
	if(!p)
		return false;
	PX_ASSERT(p->mPayload==payload);

	coreIndex = p->mCoreIndex;
	timeStamp = p->mTimeStamp;

	removePairInternal(payload, hashValue, getPairIndex(p));

	shrinkMemory();
	return true;
}

///////////////////////////////////////////////////////////////////////////////

// Shrinks the hash table back to the smallest power of two that holds the active pairs,
// unless a larger capacity was explicitly reserved via mReservedMemory.
void BucketPrunerMap::shrinkMemory()
{
	// Check correct memory against actually used memory
	const PxU32 correctHashSize = Ps::nextPowerOfTwo(mNbActivePairs);

	if(mHashSize==correctHashSize)
		return;

	if(mReservedMemory && correctHashSize < mReservedMemory)
		return;

	// Reduce memory used
	mHashSize = correctHashSize;
	mMask = mHashSize-1;

	reallocPairs();
}

///////////////////////////////////////////////////////////////////////////////

// Fills nb dwords at dest with value (used to reset the hash table to INVALID_ID).
static PX_FORCE_INLINE void storeDwords(PxU32* dest, PxU32 nb, PxU32 value)
{
	while(nb--)
		*dest++ = value;
}

// Reallocates the hash table, pair array and next array for the current mHashSize, then
// re-hashes the existing pairs.
// NOTE(review): (1) the cast target types below were lost to the text mangling — presumably
// reinterpret_cast<PxU32*> and reinterpret_cast<BucketPrunerPair*>; (2) this function is
// truncated at the end of this chunk — the re-hash loop's body lies beyond the visible text.
void BucketPrunerMap::reallocPairs()
{
	MBP_FREE(mHashTable);
	mHashTable = reinterpret_cast(MBP_ALLOC(mHashSize*sizeof(PxU32)));
	storeDwords(mHashTable, mHashSize, INVALID_ID);

	// Get some bytes for new entries
	BucketPrunerPair* newPairs = reinterpret_cast(MBP_ALLOC(mHashSize * sizeof(BucketPrunerPair)));
	PX_ASSERT(newPairs);
	PxU32* newNext = reinterpret_cast(MBP_ALLOC(mHashSize * sizeof(PxU32)));
	PX_ASSERT(newNext);

	// Copy old data if needed
	if(mNbActivePairs)
		PxMemCopy(newPairs, mActivePairs, mNbActivePairs*sizeof(BucketPrunerPair));

	// ### check it's actually needed... probably only for pairs whose hash value was cut by the and
	// yeah, since hash(id0, id1) is a constant
	// However it might not be needed to recompute them => only less efficient but still ok
	for(PxU32 i=0;i