From 3dfe2108cfab31ba3ee5527e217d0d8e99a51162 Mon Sep 17 00:00:00 2001 From: git perforce import user Date: Tue, 25 Oct 2016 12:29:14 -0600 Subject: Initial commit: PhysX 3.4.0 Update @ 21294896 APEX 1.4.0 Update @ 21275617 [CL 21300167] --- PhysX_3.4/Source/SceneQuery/src/SqBucketPruner.cpp | 2601 ++++++++++++++++++++ 1 file changed, 2601 insertions(+) create mode 100644 PhysX_3.4/Source/SceneQuery/src/SqBucketPruner.cpp (limited to 'PhysX_3.4/Source/SceneQuery/src/SqBucketPruner.cpp') diff --git a/PhysX_3.4/Source/SceneQuery/src/SqBucketPruner.cpp b/PhysX_3.4/Source/SceneQuery/src/SqBucketPruner.cpp new file mode 100644 index 00000000..35a5ca13 --- /dev/null +++ b/PhysX_3.4/Source/SceneQuery/src/SqBucketPruner.cpp @@ -0,0 +1,2601 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. 
No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#include "foundation/PxMemory.h" +#include "SqBucketPruner.h" +#include "GuIntersectionBoxBox.h" +#include "GuInternal.h" +#include "PsVecMath.h" +#include "foundation/PxUnionCast.h" +#include "CmRadixSortBuffered.h" +#include "CmRenderOutput.h" +#include "PsFPU.h" +#include "PsBitUtils.h" +#include "PsIntrinsics.h" +#include "GuBounds.h" + +using namespace physx::shdfnd::aos; + +using namespace physx; +using namespace Sq; +using namespace Gu; +using namespace Ps; + +#define INVALID_HANDLE 0xffffffff + +/* +TODO: +- if Core is always available, mSortedObjects could be replaced with just indices to mCoreObjects => less memory. +- UTS: + - test that queries against empty boxes all return false +- invalidate after 16 removes +- check shiftOrigin stuff (esp what happens to emptied boxes) + - isn't there a very hard-to-find bug waiting to happen in there, + when the shift touches the empty box and overrides mdata0/mdata1 with "wrong" values that break the sort? +- revisit updateObject/removeObject +- optimize/cache computation of free global bounds before clipRay + +- remove temp memory buffers (sorted arrays) +- take care of code duplication +- better code to generate SIMD 0x7fffffff +- refactor SIMD tests +- optimize: + - better split values + - optimize update (bitmap, less data copy, etc) + - use ray limits in traversal code too? 
+ - the SIMD XBOX code operates on Min/Max rather than C/E. Change format? + - or just try the alternative ray-box code (as on PC) ==> pretty much exactly the same speed +*/ + +//#define VERIFY_SORT +//#define BRUTE_FORCE_LIMIT 32 +#define LOCAL_SIZE 256 // Size of various local arrays. Dynamic allocations occur if exceeded. +#define USE_SIMD // Use SIMD code or not (sanity performance check) +#define NODE_SORT // Enable/disable node sorting +#define NODE_SORT_MIN_COUNT 16 // Limit above which node sorting is performed +#if PX_INTEL_FAMILY + #if COMPILE_VECTOR_INTRINSICS + #define CAN_USE_MOVEMASK + #endif +#endif + +#define ALIGN16(size) ((unsigned(size)+15) & unsigned(~15)) + +#ifdef _DEBUG + #define AlignedLoad V4LoadU + #define AlignedStore V4StoreU +#else + #define AlignedLoad V4LoadA + #define AlignedStore V4StoreA +#endif + +// SAT-based ray-box overlap test has accuracy issues for long rays, so we clip them against the global AABB to limit these issues. +static void clipRay(const PxVec3& rayOrig, const PxVec3& rayDir, float& maxDist, const PxVec3& boxMin, const PxVec3& boxMax) +{ + const PxVec3 boxCenter = (boxMax + boxMin)*0.5f; + const PxVec3 boxExtents = (boxMax - boxMin)*0.5f; + const float dpc = boxCenter.dot(rayDir); + const float extentsMagnitude = boxExtents.magnitude(); + const float dpMin = dpc - extentsMagnitude; + const float dpMax = dpc + extentsMagnitude; + const float dpO = rayOrig.dot(rayDir); + const float boxLength = extentsMagnitude * 2.0f; + const float distToBox = PxMin(PxAbs(dpMin - dpO), PxAbs(dpMax - dpO)); + maxDist = distToBox + boxLength * 2.0f; +} + +BucketPrunerNode::BucketPrunerNode() +{ + for(PxU32 i=0;i<5;i++) + mBucketBox[i].setEmpty(); +} + +static const PxU8 gCodes[] = { 4, 4, 4, 4, 4, 3, 2, 2, + 4, 1, 0, 0, 4, 1, 0, 0, + 4, 1, 0, 0, 2, 1, 0, 0, + 3, 1, 0, 0, 2, 1, 0, 0}; + +#ifdef CAN_USE_MOVEMASK +/*static PX_FORCE_INLINE PxU32 classifyBox_x86(const BucketBox& box, const PxVec4& limits, const bool useY, const bool 
isCrossBucket) +{ + const Vec4V extents = AlignedLoad(&box.mExtents.x); + const Vec4V center = AlignedLoad(&box.mCenter.x); + const Vec4V plus = V4Add(extents, center); + const Vec4V minus = V4Sub(extents, center); + + Vec4V tmp; + if(useY) // PT: this is a constant so branch prediction works here + tmp = _mm_shuffle_ps(plus, minus, _MM_SHUFFLE(0,1,0,1)); + else + tmp = _mm_shuffle_ps(plus, minus, _MM_SHUFFLE(0,2,0,2)); + + const Vec4V comp = _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(0,2,1,3)); // oh well, nm + + const PxU32 Code = (PxU32)_mm_movemask_ps(V4IsGrtr(V4LoadA(&limits.x), comp)); + return gCodes[Code | PxU32(isCrossBucket)<<4]; +}*/ + +static PX_FORCE_INLINE PxU32 classifyBox_x86(const Vec4V boxMin, const Vec4V boxMax, const PxVec4& limits, const bool useY, const bool isCrossBucket) +{ + const Vec4V plus = boxMax; + const Vec4V minus = V4Neg(boxMin); + + Vec4V tmp; + if(useY) // PT: this is a constant so branch prediction works here + tmp = _mm_shuffle_ps(plus, minus, _MM_SHUFFLE(0,1,0,1)); + else + tmp = _mm_shuffle_ps(plus, minus, _MM_SHUFFLE(0,2,0,2)); + + const Vec4V comp = _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(0,2,1,3)); // oh well, nm + + const PxU32 Code = PxU32(_mm_movemask_ps(V4IsGrtr(V4LoadA(&limits.x), comp))); + return gCodes[Code | PxU32(isCrossBucket)<<4]; +} +#endif + +#ifdef CAN_USE_MOVEMASK + #if PX_DEBUG + #define USE_CLASSIFY_BOX + #endif +#else + #define USE_CLASSIFY_BOX +#endif + +#ifdef USE_CLASSIFY_BOX +static PX_FORCE_INLINE PxU32 classifyBox(const BucketBox& box, const float limitX, const float limitYZ, const PxU32 yz, const bool isCrossBucket) +{ + const bool upperPart = (box.mCenter[yz] + box.mExtents[yz])limitYZ; + const bool leftPart = (box.mCenter.x + box.mExtents.x)limitX; + + // Table-based box classification avoids many branches + const PxU32 Code = PxU32(rightPart)|(PxU32(leftPart)<<1)|(PxU32(lowerPart)<<2)|(PxU32(upperPart)<<3); + return gCodes[Code + (isCrossBucket ? 
16 : 0)]; +} +#endif + +void BucketPrunerNode::classifyBoxes( float limitX, float limitYZ, + PxU32 nb, BucketBox* PX_RESTRICT boxes, const PrunerPayload* PX_RESTRICT objects, + BucketBox* PX_RESTRICT sortedBoxes, PrunerPayload* PX_RESTRICT sortedObjects, + bool isCrossBucket, PxU32 sortAxis) +{ + const PxU32 yz = PxU32(sortAxis == 1 ? 2 : 1); + + #ifdef _DEBUG + { + float prev = boxes[0].mDebugMin; + for(PxU32 i=1;i=prev); + prev = current; + } + } + #endif + + // Local (stack-based) min/max bucket bounds + PX_ALIGN(16, PxVec4) bucketBoxMin[5]; + PX_ALIGN(16, PxVec4) bucketBoxMax[5]; + { + const PxBounds3 empty = PxBounds3::empty(); + for(PxU32 i=0;i<5;i++) + { + mCounters[i] = 0; + bucketBoxMin[i] = PxVec4(empty.minimum, 0.0f); + bucketBoxMax[i] = PxVec4(empty.maximum, 0.0f); + } + } + + { +#ifdef CAN_USE_MOVEMASK + // DS: order doesn't play nice with x86 shuffles :-| + PX_ALIGN(16, PxVec4) limits(-limitX, limitX, -limitYZ, limitYZ); + const bool useY = yz==1; +#endif + // Determine in which bucket each object falls, update bucket bounds + for(PxU32 i=0;i=prev); + prev = current; + } + } + } + #endif +} + +/////////////////////////////////////////////////////////////////////////////// + +static void processChildBuckets(PxU32 nbAllocated, + BucketBox* sortedBoxesInBucket, PrunerPayload* sortedObjectsInBucket, + const BucketPrunerNode& bucket, BucketPrunerNode* PX_RESTRICT childBucket, + BucketBox* PX_RESTRICT baseBucketsBoxes, PrunerPayload* PX_RESTRICT baseBucketsObjects, + PxU32 sortAxis) +{ + PX_UNUSED(nbAllocated); + + const PxU32 yz = PxU32(sortAxis == 1 ? 
2 : 1); + for(PxU32 i=0;i<5;i++) + { + const PxU32 nbInBucket = bucket.mCounters[i]; + if(!nbInBucket) + { + childBucket[i].initCounters(); + continue; + } + BucketBox* bucketsBoxes = baseBucketsBoxes + bucket.mOffsets[i]; + PrunerPayload* bucketsObjects = baseBucketsObjects + bucket.mOffsets[i]; + PX_ASSERT(nbInBucket<=nbAllocated); + + const float limitX = bucket.mBucketBox[i].mCenter.x; + const float limitYZ = bucket.mBucketBox[i].mCenter[yz]; + const bool isCrossBucket = i==4; + childBucket[i].classifyBoxes(limitX, limitYZ, nbInBucket, bucketsBoxes, bucketsObjects, + sortedBoxesInBucket, sortedObjectsInBucket, + isCrossBucket, sortAxis); + + PxMemCopy(bucketsBoxes, sortedBoxesInBucket, sizeof(BucketBox)*nbInBucket); + PxMemCopy(bucketsObjects, sortedObjectsInBucket, sizeof(PrunerPayload)*nbInBucket); + } +} + +/////////////////////////////////////////////////////////////////////////////// + +static PX_FORCE_INLINE PxU32 encodeFloat(PxU32 newPos) +{ + //we may need to check on -0 and 0 + //But it should make no practical difference. + if(newPos & PX_SIGN_BITMASK) //negative? 
+ return ~newPos;//reverse sequence of negative numbers + else + return newPos | PX_SIGN_BITMASK; // flip sign +} + +static PX_FORCE_INLINE void computeRayLimits(float& rayMin, float& rayMax, const PxVec3& rayOrig, const PxVec3& rayDir, float maxDist, PxU32 sortAxis) +{ + const float rayOrigValue = rayOrig[sortAxis]; + const float rayDirValue = rayDir[sortAxis] * maxDist; + rayMin = PxMin(rayOrigValue, rayOrigValue + rayDirValue); + rayMax = PxMax(rayOrigValue, rayOrigValue + rayDirValue); +} + +static PX_FORCE_INLINE void computeRayLimits(float& rayMin, float& rayMax, const PxVec3& rayOrig, const PxVec3& rayDir, float maxDist, const PxVec3& inflate, PxU32 sortAxis) +{ + const float inflateValue = inflate[sortAxis]; + const float rayOrigValue = rayOrig[sortAxis]; + const float rayDirValue = rayDir[sortAxis] * maxDist; + rayMin = PxMin(rayOrigValue, rayOrigValue + rayDirValue) - inflateValue; + rayMax = PxMax(rayOrigValue, rayOrigValue + rayDirValue) + inflateValue; +} + +static PX_FORCE_INLINE void encodeBoxMinMax(BucketBox& box, const PxU32 axis) +{ + const float min = box.mCenter[axis] - box.mExtents[axis]; + const float max = box.mCenter[axis] + box.mExtents[axis]; + + const PxU32* binaryMin = reinterpret_cast(&min); + const PxU32* binaryMax = reinterpret_cast(&max); + box.mData0 = encodeFloat(binaryMin[0]); + box.mData1 = encodeFloat(binaryMax[0]); +} + +/////////////////////////////////////////////////////////////////////////////// + +BucketPrunerCore::BucketPrunerCore(bool externalMemory) : + mCoreNbObjects (0), + mCoreCapacity (0), + mCoreBoxes (NULL), + mCoreObjects (NULL), + mCoreRemap (NULL), + mSortedWorldBoxes (NULL), + mSortedObjects (NULL), + mNbFree (0), + mSortedNb (0), + mSortedCapacity (0), + mSortAxis (0), + mDirty (true), + mOwnMemory (!externalMemory) +{ + mGlobalBox.setEmpty(); + + mLevel1.initCounters(); + + for(PxU32 i=0;i<5;i++) + mLevel2[i].initCounters(); + for(PxU32 j=0;j<5;j++) + for(PxU32 i=0;i<5;i++) + mLevel3[j][i].initCounters(); +} 
+ +BucketPrunerCore::~BucketPrunerCore() +{ + release(); +} + +void BucketPrunerCore::release() +{ + mDirty = true; + mCoreNbObjects = 0; + + mCoreCapacity = 0; + if(mOwnMemory) + { + PX_FREE_AND_RESET(mCoreBoxes); + PX_FREE_AND_RESET(mCoreObjects); + PX_FREE_AND_RESET(mCoreRemap); + } + + PX_FREE_AND_RESET(mSortedWorldBoxes); + PX_FREE_AND_RESET(mSortedObjects); + mSortedNb = 0; + mSortedCapacity = 0; + + mNbFree = 0; +#ifdef USE_REGULAR_HASH_MAP + mMap.clear(); +#else + mMap.purge(); +#endif +} + +void BucketPrunerCore::setExternalMemory(PxU32 nbObjects, PxBounds3* boxes, PrunerPayload* objects) +{ + PX_ASSERT(!mOwnMemory); + mCoreNbObjects = nbObjects; + mCoreBoxes = boxes; + mCoreObjects = objects; + mCoreRemap = NULL; +} + +void BucketPrunerCore::allocateSortedMemory(PxU32 nb) +{ + mSortedNb = nb; + if(nb<=mSortedCapacity && (nb>=mSortedCapacity/2)) + return; + + const PxU32 capacity = Ps::nextPowerOfTwo(nb); + mSortedCapacity = capacity; + + PxU32 bytesNeededForBoxes = capacity*sizeof(BucketBox); + bytesNeededForBoxes = ALIGN16(bytesNeededForBoxes); + + PxU32 bytesNeededForObjects = capacity*sizeof(PrunerPayload); + bytesNeededForObjects = ALIGN16(bytesNeededForObjects); + + PX_FREE(mSortedObjects); + PX_FREE(mSortedWorldBoxes); + mSortedWorldBoxes = reinterpret_cast(PX_ALLOC(bytesNeededForBoxes, "BucketPruner")); + mSortedObjects = reinterpret_cast(PX_ALLOC(bytesNeededForObjects, "BucketPruner")); + PX_ASSERT(!(size_t(mSortedWorldBoxes)&15)); + PX_ASSERT(!(size_t(mSortedObjects)&15)); +} + +/////////////////////////////////////////////////////////////////////////////// + +void BucketPrunerCore::resizeCore() +{ + const PxU32 capacity = mCoreCapacity ? 
mCoreCapacity*2 : 32; + mCoreCapacity = capacity; + + const PxU32 bytesNeededForBoxes = capacity*sizeof(PxBounds3); + const PxU32 bytesNeededForObjects = capacity*sizeof(PrunerPayload); + const PxU32 bytesNeededForRemap = capacity*sizeof(PxU32); + + PxBounds3* newCoreBoxes = reinterpret_cast(PX_ALLOC(bytesNeededForBoxes, "BucketPruner")); + PrunerPayload* newCoreObjects = reinterpret_cast(PX_ALLOC(bytesNeededForObjects, "BucketPruner")); + PxU32* newCoreRemap = reinterpret_cast(PX_ALLOC(bytesNeededForRemap, "BucketPruner")); + if(mCoreBoxes) + { + PxMemCopy(newCoreBoxes, mCoreBoxes, mCoreNbObjects*sizeof(PxBounds3)); + PX_FREE(mCoreBoxes); + } + if(mCoreObjects) + { + PxMemCopy(newCoreObjects, mCoreObjects, mCoreNbObjects*sizeof(PrunerPayload)); + PX_FREE(mCoreObjects); + } + if(mCoreRemap) + { + PxMemCopy(newCoreRemap, mCoreRemap, mCoreNbObjects*sizeof(PxU32)); + PX_FREE(mCoreRemap); + } + mCoreBoxes = newCoreBoxes; + mCoreObjects = newCoreObjects; + mCoreRemap = newCoreRemap; +} + +PX_FORCE_INLINE void BucketPrunerCore::addObjectInternal(const PrunerPayload& object, const PxBounds3& worldAABB, PxU32 timeStamp) +{ + if(mCoreNbObjects==mCoreCapacity) + resizeCore(); + + const PxU32 index = mCoreNbObjects++; + mCoreObjects[index] = object; + mCoreBoxes[index] = worldAABB; // PT: TODO: check assembly here + mCoreRemap[index] = 0xffffffff; + + // Objects are only inserted into the map once they're part of the main/core arrays. +#ifdef USE_REGULAR_HASH_MAP + bool ok = mMap.insert(object, BucketPrunerPair(index, timeStamp)); +#else + BucketPrunerPair* ok = mMap.addPair(object, index, timeStamp); +#endif + PX_UNUSED(ok); + PX_ASSERT(ok); +} + +bool BucketPrunerCore::addObject(const PrunerPayload& object, const PxBounds3& worldAABB, PxU32 timeStamp) +{ +/* + We should probably use a bigger Payload struct here, which would also contains the external handle. 
+ (EDIT: we can't even do that, because of the setExternalMemory function) + When asked to update/remove an object it would be O(n) to find the proper object in the mSortedObjects array. + + - + + For removing it we can simply empty the corresponding box, and the object will never be returned from queries. + Maybe this isn't even true, since boxes are sorted along one axis. So marking a box as empty could break the code relying on a sorted order. + An alternative is to mark the external handle as invalid, and ignore the object when a hit is found. + + (EDIT: the sorting is now tested via data0/data1 anyway so we could mark the box as empty without breaking this) + + - + + For updating an object we would need to keep the (sub) array sorted (not the whole thing, only the array within a bucket). + We don't know the range (what part of the array maps to our bucket) but we may have the bucket ID somewhere? If we'd have this + we could parse the array left/right and resort just the right boxes. If we don't have this we may be able to "quickly" find the + range by traversing the tree, looking for the proper bucket. In any case I don't think there's a mapping to update within a bucket, + unlike in SAP or MBP. So we should be able to shuffle a bucket without having to update anything. For example there's no mapping + between the Core array and the Sorted array. It's a shame in a way because we'd need one, but it's not there - and in fact I think + we can free the Core array once Sorted is created, we don't need it at all. + + If we don't want to re-sort the full bucket we can just mark it as dirty and ignore the sort-based early exits in the queries. Then we + can incrementally resort it over N frames or something. + + This only works if the updated object remains in the same bucket though. If it moves to another bucket it becomes tempting to just remove + the object and re-insert it. 
+ + - + + Now for adding an object, we can first have a "free pruner" and do the 16 next entries brute-force. Rebuilding every 16 objects might + give a good speedup already. Otherwise we need to do something more complicated. +*/ + + PX_ASSERT(mOwnMemory); + PX_ASSERT(!mDirty || !mNbFree); + if(!mDirty) + { + // In this path the structure is marked as valid. We do not want to invalidate it for each new object... + if(mNbFreesecond.mCoreIndex; + timeStamp = removedEntry->second.mTimeStamp; +#else + PxU32 coreIndex; // This is the object's index in the core arrays. + if(mMap.removePair(object, coreIndex, timeStamp)) + { +#endif + // In this codepath, the object we want to remove exists in the core arrays. + + // We will need to remove it from both the core arrays & the sorted arrays. + const PxU32 sortedIndex = mCoreRemap[coreIndex]; // This is the object's index in the sorted arrays. + +#ifdef USE_REGULAR_HASH_MAP + bool status = mMap.erase(object); + PX_ASSERT(status); + PX_UNUSED(status); +#endif + + // First let's deal with the core arrays + mCoreNbObjects--; + if(coreIndex!=mCoreNbObjects) + { + // If it wasn't the last object in the array, close the gaps as usual + const PrunerPayload& movedObject = mCoreObjects[mCoreNbObjects]; + mCoreBoxes[coreIndex] = mCoreBoxes[mCoreNbObjects]; + mCoreObjects[coreIndex] = movedObject; + mCoreRemap[coreIndex] = mCoreRemap[mCoreNbObjects]; + + // Since we just moved the last object, its index in the core arrays has changed. + // We must reflect this change in the map. +#ifdef USE_REGULAR_HASH_MAP + BucketPrunerMap::Entry* movedEntry = const_cast(mMap.find(movedObject)); + PX_ASSERT(movedEntry->second.mCoreIndex==mCoreNbObjects); + movedEntry->second.mCoreIndex = coreIndex; +#else + BucketPrunerPair* movedEntry = const_cast(mMap.findPair(movedObject)); + PX_ASSERT(movedEntry->mCoreIndex==mCoreNbObjects); + movedEntry->mCoreIndex = coreIndex; +#endif + } + + // Now, let's deal with the sorted arrays. 
+ // If the structure is dirty, the sorted arrays will be rebuilt from scratch so there's no need to + // update them right now. + if(!mDirty) + { + // If the structure is valid, we want to keep it this way to avoid rebuilding sorted arrays after + // each removal. We can't "close the gaps" easily here because order of objects in the arrays matters. + + // Instead we just invalidate the object by setting its bounding box as empty. + // Queries against empty boxes will never return a hit, so this effectively "removes" the object + // from any subsequent query results. Sorted arrays now contain a "disabled" object, until next build. + + // Invalidating the box does not invalidate the sorting, since it's now captured in mData0/mData1. + // That is, mData0/mData1 keep their previous integer-encoded values, as if the box/object was still here. + PxBounds3 empty; + empty.setEmpty(); + mSortedWorldBoxes[sortedIndex].mCenter = empty.getCenter(); + mSortedWorldBoxes[sortedIndex].mExtents = empty.getExtents(); + // Note that we don't touch mSortedObjects here. We could, but this is not necessary. + } + return true; + } + + // Here, the object we want to remove exists in the free array. So we just parse it. 
+ for(PxU32 i=0;i(mMap.find(movedObject)); + PX_ASSERT(movedEntry->second.mCoreIndex==coreNbObjects); + movedEntry->second.mCoreIndex = coreIndex; +#else + BucketPrunerPair* movedEntry = const_cast(mMap.findPair(movedObject)); + PX_ASSERT(movedEntry->mCoreIndex==coreNbObjects); + movedEntry->mCoreIndex = coreIndex; +#endif + } + + nbRemoved++; +#ifdef USE_REGULAR_HASH_MAP + bool status = mMap.erase(p.first); + PX_ASSERT(status); + PX_UNUSED(status); +#else + const PxU32 hashValue = hash(p.mPayload) & mMap.mMask; + mMap.removePairInternal(p.mPayload, hashValue, i); +#endif + nbActivePairs--; + } + else i++; + } + mCoreNbObjects = coreNbObjects; + +#ifdef USE_REGULAR_HASH_MAP +#else + mMap.shrinkMemory(); +#endif + } + + // PT: ...then we look in the 'free' array + PxU32 i=0; + while(i0); + Vec4V mergedMinV = V4LoadU(&boxes[nb-1].minimum.x); + Vec4V mergedMaxV = Vec4V_From_Vec3V(V3LoadU(&boxes[nb-1].maximum.x)); + for(PxU32 i=0;i(sortedObjects); + for(PxU32 i=0;i + PX_CUDA_CALLABLE PX_FORCE_INLINE void tswap(T& x, T& y) + { + T tmp = x; + x = y; + y = tmp; + } + +/* PX_FORCE_INLINE __m128 DotV(const __m128 a, const __m128 b) + { + const __m128 dot1 = _mm_mul_ps(a, b); + const __m128 shuf1 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(dot1), _MM_SHUFFLE(0,0,0,0))); + const __m128 shuf2 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(dot1), _MM_SHUFFLE(1,1,1,1))); + const __m128 shuf3 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(dot1), _MM_SHUFFLE(2,2,2,2))); + return _mm_add_ps(_mm_add_ps(shuf1, shuf2), shuf3); + }*/ + +// PT: hmmm, by construction, isn't the order always the same for all bucket pruners? 
+// => maybe not because the bucket boxes are still around the merged aabbs, not around the bucket +// Still we could do something here +static /*PX_FORCE_INLINE*/ PxU32 sort(const BucketPrunerNode& parent, const PxVec3& rayDir) +{ + const PxU32 totalCount = parent.mCounters[0]+parent.mCounters[1]+parent.mCounters[2]+parent.mCounters[3]+parent.mCounters[4]; + if(totalCount(dp); + const PxU32* values = PxUnionCast(dp); + + PxU32 value0 = values[0]; + PxU32 value1 = values[1]; + PxU32 value2 = values[2]; + PxU32 value3 = values[3]; + PxU32 value4 = values[4]; + + for(PxU32 j=0;j<5-1;j++) + { + if(value1(dp); + +// const PxU32 mask = ~7U; + const PxU32 mask = 0x7ffffff8; + PxU32 value0 = (values[0]&mask); + PxU32 value1 = (values[1]&mask)|1; + PxU32 value2 = (values[2]&mask)|2; + PxU32 value3 = (values[3]&mask)|3; + PxU32 value4 = (values[4]&mask)|4; + +#define SORT_BLOCK \ + if(value1(PX_ALLOC(nb*sizeof(size_t), "")); +for(PxU32 i=0;iLOCAL_SIZE) + { + tempObjects = reinterpret_cast(PX_ALLOC(sizeof(PrunerPayload)*nb, "BucketPruner")); + tempBoxes = reinterpret_cast(PX_ALLOC(nb*sizeof(BucketBox), "BucketPruner")); + } + else + { + tempObjects = localTempObjects; + tempBoxes = localTempBoxes; + } + + mSortAxis = sortBoxes(nb, mCoreBoxes, mCoreObjects, mGlobalBox, tempBoxes, tempObjects); + + PX_ASSERT(mSortAxis); + + allocateSortedMemory(nb); + BucketBox* sortedBoxes = mSortedWorldBoxes; + PrunerPayload* sortedObjects = mSortedObjects; + + const PxU32 yz = PxU32(mSortAxis == 1 ? 
2 : 1); + const float limitX = mGlobalBox.mCenter.x; + const float limitYZ = mGlobalBox.mCenter[yz]; + mLevel1.classifyBoxes(limitX, limitYZ, nb, tempBoxes, tempObjects, + sortedBoxes, sortedObjects, + false, mSortAxis); + + processChildBuckets(nb, tempBoxes, tempObjects, + mLevel1, mLevel2, mSortedWorldBoxes, mSortedObjects, + mSortAxis); + + for(PxU32 j=0;j<5;j++) + processChildBuckets(nb, tempBoxes, tempObjects, + mLevel2[j], mLevel3[j], mSortedWorldBoxes + mLevel1.mOffsets[j], mSortedObjects + mLevel1.mOffsets[j], + mSortAxis); + + { + for(PxU32 i=0;iLOCAL_SIZE) + { + PX_FREE(tempBoxes); + PX_FREE(tempObjects); + } + +for(PxU32 i=0;i(&MaskI)), DataV); + _mm_store_ps(&rayParams->mData.x, DataV); + _mm_store_ps(&rayParams->mData2.x, Data2V); + _mm_store_ps(&rayParams->mFDir.x, FDirV); + #else + const PxVec3 data = 0.5f * rayDir * maxDist; + rayParams->mData = data; + rayParams->mData2 = rayOrig + data; + rayParams->mFDir.x = PxAbs(data.x); + rayParams->mFDir.y = PxAbs(data.y); + rayParams->mFDir.z = PxAbs(data.z); + #endif + } + + template + static PX_FORCE_INLINE IntBool _segmentAABB(const BucketBox& box, const RayParams* PX_RESTRICT params) + { + #ifdef USE_SIMD + const PxU32 maskI = 0x7fffffff; + const __m128 fdirV = _mm_load_ps(¶ms->mFDir.x); +// #ifdef _DEBUG + const __m128 extentsV = inflateT ? _mm_add_ps(_mm_loadu_ps(&box.mExtents.x), _mm_load_ps(¶ms->mInflate.x)) : _mm_loadu_ps(&box.mExtents.x); + const __m128 DV = _mm_sub_ps(_mm_load_ps(¶ms->mData2.x), _mm_loadu_ps(&box.mCenter.x)); +/* #else + const __m128 extentsV = inflateT ? 
_mm_add_ps(_mm_load_ps(&box.mExtents.x), _mm_load_ps(¶ms->mInflate.x)) : _mm_load_ps(&box.mExtents.x); + const __m128 DV = _mm_sub_ps(_mm_load_ps(¶ms->mData2.x), _mm_load_ps(&box.mCenter.x)); + #endif*/ + __m128 absDV = _mm_and_ps(DV, _mm_load1_ps(reinterpret_cast(&maskI))); + absDV = _mm_cmpgt_ps(absDV, _mm_add_ps(extentsV, fdirV)); + const PxU32 test = PxU32(_mm_movemask_ps(absDV)); + if(test&7) + return 0; + + const __m128 dataZYX_V = _mm_load_ps(¶ms->mData.x); + const __m128 dataXZY_V = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(dataZYX_V), _MM_SHUFFLE(3,0,2,1))); + const __m128 DXZY_V = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(DV), _MM_SHUFFLE(3,0,2,1))); + const __m128 fV = _mm_sub_ps(_mm_mul_ps(dataZYX_V, DXZY_V), _mm_mul_ps(dataXZY_V, DV)); + + const __m128 fdirZYX_V = _mm_load_ps(¶ms->mFDir.x); + const __m128 fdirXZY_V = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(fdirZYX_V), _MM_SHUFFLE(3,0,2,1))); + const __m128 extentsXZY_V = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(extentsV), _MM_SHUFFLE(3,0,2,1))); + const __m128 fg = _mm_add_ps(_mm_mul_ps(extentsV, fdirXZY_V), _mm_mul_ps(extentsXZY_V, fdirZYX_V)); + + __m128 absfV = _mm_and_ps(fV, _mm_load1_ps(reinterpret_cast(&maskI))); + absfV = _mm_cmpgt_ps(absfV, fg); + const PxU32 test2 = PxU32(_mm_movemask_ps(absfV)); + if(test2&7) + return 0; + return 1; + #else + const float boxExtentsx = inflateT ? box.mExtents.x + params->mInflate.x : box.mExtents.x; + const float Dx = params->mData2.x - box.mCenter.x; if(fabsf(Dx) > boxExtentsx + params->mFDir.x) return IntFalse; + + const float boxExtentsz = inflateT ? box.mExtents.z + params->mInflate.z : box.mExtents.z; + const float Dz = params->mData2.z - box.mCenter.z; if(fabsf(Dz) > boxExtentsz + params->mFDir.z) return IntFalse; + + const float boxExtentsy = inflateT ? 
box.mExtents.y + params->mInflate.y : box.mExtents.y; + const float Dy = params->mData2.y - box.mCenter.y; if(fabsf(Dy) > boxExtentsy + params->mFDir.y) return IntFalse; + + float f; + f = params->mData.y * Dz - params->mData.z * Dy; if(fabsf(f) > boxExtentsy*params->mFDir.z + boxExtentsz*params->mFDir.y) return IntFalse; + f = params->mData.z * Dx - params->mData.x * Dz; if(fabsf(f) > boxExtentsx*params->mFDir.z + boxExtentsz*params->mFDir.x) return IntFalse; + f = params->mData.x * Dy - params->mData.y * Dx; if(fabsf(f) > boxExtentsx*params->mFDir.y + boxExtentsy*params->mFDir.x) return IntFalse; + return IntTrue; + #endif + } +#else + #include "SqPrunerTestsSIMD.h" + + typedef RayAABBTest BPRayAABBTest; + +template +static PX_FORCE_INLINE IntBool _segmentAABB(const BucketBox& box, const BPRayAABBTest& test) +{ + return static_cast(test.check(V3LoadU(box.mCenter), V3LoadU(box.mExtents))); +} + +/*static PX_FORCE_INLINE IntBool _segmentAABB(const BucketBox& box, const BPRayAABBTest& test, PxU32 rayMinLimitX, PxU32 rayMaxLimitX) +{ + if(rayMinLimitX>box.mData1) + return 0; + if(rayMaxLimitX +static PxAgain processBucket( + PxU32 nb, const BucketBox* PX_RESTRICT baseBoxes, PrunerPayload* PX_RESTRICT baseObjects, + PxU32 offset, PxU32 totalAllocated, + const PxVec3& rayOrig, const PxVec3& rayDir, float& maxDist, +#ifdef CAN_USE_MOVEMASK + RayParams* PX_RESTRICT rayParams, +#else + BPRayAABBTest& test, const PxVec3& inflate, +#endif + PrunerCallback& pcb, PxU32& _rayMinLimitInt, PxU32& _rayMaxLimitInt, PxU32 sortAxis) +{ + PX_UNUSED(totalAllocated); + + const BucketBox* PX_RESTRICT _boxes = baseBoxes + offset; + PrunerPayload* PX_RESTRICT _objects = baseObjects + offset; + + PxU32 rayMinLimitInt = _rayMinLimitInt; + PxU32 rayMaxLimitInt = _rayMaxLimitInt; + + const BucketBox* last = _boxes + nb; + + while(_boxes!=last) + { + const BucketBox& currentBox = *_boxes++; + PrunerPayload* currentObject = _objects++; + + if(currentBox.mData1rayMaxLimitInt) + goto Exit; + 
+#ifdef CAN_USE_MOVEMASK + if(!_segmentAABB(currentBox, rayParams)) + continue; +#else + if(!_segmentAABB(currentBox, test)) + continue; +#endif + + const float MaxDist = maxDist; + const PxAgain again = pcb.invoke(maxDist, *currentObject); + if(!again) + return false; + if(maxDist < MaxDist) + { + float rayMinLimit, rayMaxLimit; +#ifdef CAN_USE_MOVEMASK + if(inflateT) + computeRayLimits(rayMinLimit, rayMaxLimit, rayOrig, rayDir, maxDist, rayParams->mInflate, sortAxis); + else + computeRayLimits(rayMinLimit, rayMaxLimit, rayOrig, rayDir, maxDist, sortAxis); + + precomputeRayData(rayParams, rayOrig, rayDir, maxDist); +#else + if(inflateT) + computeRayLimits(rayMinLimit, rayMaxLimit, rayOrig, rayDir, maxDist, inflate, sortAxis); + else + computeRayLimits(rayMinLimit, rayMaxLimit, rayOrig, rayDir, maxDist, sortAxis); + + test.setDistance(maxDist); +#endif + const PxU32* binaryMinLimit = reinterpret_cast(&rayMinLimit); + const PxU32* binaryMaxLimit = reinterpret_cast(&rayMaxLimit); + rayMinLimitInt = encodeFloat(binaryMinLimit[0]); + rayMaxLimitInt = encodeFloat(binaryMaxLimit[0]); + } + } +Exit: + + _rayMinLimitInt = rayMinLimitInt; + _rayMaxLimitInt = rayMaxLimitInt; + return true; +} + +#ifdef NODE_SORT +static PxU32 computeDirMask(const PxVec3& dir) +{ + const PxU32* binary = reinterpret_cast(&dir.x); + const PxU32 X = (binary[0])>>31; + const PxU32 Y = (binary[1])>>31; + const PxU32 Z = (binary[2])>>31; + return Z|(Y<<1)|(X<<2); +} +#endif + +template +static PxAgain stab(const BucketPrunerCore& core, PrunerCallback& pcb, const PxVec3& rayOrig, const PxVec3& rayDir, float& maxDist, const PxVec3 inflate) +{ + const PxU32 nb = core.mSortedNb; + if(!nb && !core.mNbFree) + return true; + + if(maxDist==PX_MAX_F32) + { + /*const*/ PxVec3 boxMin = core.mGlobalBox.getMin() - inflate; + /*const*/ PxVec3 boxMax = core.mGlobalBox.getMax() + inflate; + + if(core.mNbFree) + { + // TODO: optimize this + PxBounds3 freeGlobalBounds; + freeGlobalBounds.setEmpty(); + for(PxU32 
i=0;i(tmp, &rayParams)) +#else + if(_segmentAABB(tmp, test)) +#endif + { + if(!pcb.invoke(maxDist, core.mFreeObjects[i])) + return false; + } + } + + if(!nb) + return true; + +#ifdef CAN_USE_MOVEMASK + if(!_segmentAABB(core.mGlobalBox, &rayParams)) + return true; +#else + if(!_segmentAABB(core.mGlobalBox, test)) + return true; +#endif + + const PxU32 sortAxis = core.mSortAxis; + float rayMinLimit, rayMaxLimit; + if(inflateT) + computeRayLimits(rayMinLimit, rayMaxLimit, rayOrig, rayDir, maxDist, inflate, sortAxis); + else + computeRayLimits(rayMinLimit, rayMaxLimit, rayOrig, rayDir, maxDist, sortAxis); + + const PxU32* binaryMinLimit = reinterpret_cast(&rayMinLimit); + const PxU32* binaryMaxLimit = reinterpret_cast(&rayMaxLimit); + PxU32 rayMinLimitInt = encodeFloat(binaryMinLimit[0]); + PxU32 rayMaxLimitInt = encodeFloat(binaryMaxLimit[0]); +/* +float rayMinLimitX, rayMaxLimitX; +if(inflateT) + computeRayLimits(rayMinLimitX, rayMaxLimitX, rayOrig, rayDir, maxDist, inflate, 0); +else + computeRayLimits(rayMinLimitX, rayMaxLimitX, rayOrig, rayDir, maxDist, 0); + +PxU32 rayMinLimitIntX = encodeFloat(PX_IR(rayMinLimitX)); +PxU32 rayMaxLimitIntX = encodeFloat(PX_IR(rayMaxLimitX)); +*/ + + float currentDist = maxDist; + +#ifdef NODE_SORT + const PxU32 dirIndex = computeDirMask(rayDir); + PxU32 orderi = core.mLevel1.mOrder[dirIndex]; +// PxU32 orderi = sort(core.mLevel1, rayDir); + + for(PxU32 i_=0;i_<5;i_++) + { + const PxU32 i = orderi&7; orderi>>=3; +#else + for(PxU32 i=0;i<5;i++) + { +#endif + +#ifdef CAN_USE_MOVEMASK + if(core.mLevel1.mCounters[i] && _segmentAABB(core.mLevel1.mBucketBox[i], &rayParams)) +#else + if(core.mLevel1.mCounters[i] && _segmentAABB(core.mLevel1.mBucketBox[i], test)) +// if(core.mLevel1.mCounters[i] && _segmentAABB(core.mLevel1.mBucketBox[i], test, rayMinLimitIntX, rayMaxLimitIntX)) +#endif + { + +#ifdef NODE_SORT + PxU32 orderj = core.mLevel2[i].mOrder[dirIndex]; +// PxU32 orderj = sort(core.mLevel2[i], rayDir); + + for(PxU32 j_=0;j_<5;j_++) 
// NOTE(review): continuation of the 'stab' traversal — level-2 and level-3 bucket loops,
// culminating in processBucket() on each surviving leaf bucket; shrinks currentDist as
// hits are found and writes it back to maxDist before returning. The raycast() wrapper
// (stab<0>, no inflation) is complete here; sweep() starts at the end of this chunk and
// its 'return' statement is split across the chunk boundary. Diff '+' prefixes are fused
// into the text; '::stab<0>' survived because its angle brackets enclose a literal.
+ { + const PxU32 j = orderj&7; orderj>>=3; +#else + for(PxU32 j=0;j<5;j++) + { +#endif + +#ifdef CAN_USE_MOVEMASK + if(core.mLevel2[i].mCounters[j] && _segmentAABB(core.mLevel2[i].mBucketBox[j], &rayParams)) +#else + if(core.mLevel2[i].mCounters[j] && _segmentAABB(core.mLevel2[i].mBucketBox[j], test)) +// if(core.mLevel2[i].mCounters[j] && _segmentAABB(core.mLevel2[i].mBucketBox[j], test, rayMinLimitIntX, rayMaxLimitIntX)) +#endif + { + const BucketPrunerNode& parent = core.mLevel3[i][j]; + const PxU32 parentOffset = core.mLevel1.mOffsets[i] + core.mLevel2[i].mOffsets[j]; + +#ifdef NODE_SORT + PxU32 orderk = parent.mOrder[dirIndex]; +// PxU32 orderk = sort(parent, rayDir); + + for(PxU32 k_=0;k_<5;k_++) + { + const PxU32 k = orderk&7; orderk>>=3; +#else + for(PxU32 k=0;k<5;k++) + { +#endif + const PxU32 nbInBucket = parent.mCounters[k]; +#ifdef CAN_USE_MOVEMASK + if(nbInBucket && _segmentAABB(parent.mBucketBox[k], &rayParams)) +#else + if(nbInBucket && _segmentAABB(parent.mBucketBox[k], test)) +// if(nbInBucket && _segmentAABB(parent.mBucketBox[k], test, rayMinLimitIntX, rayMaxLimitIntX)) +#endif + { + const PxU32 offset = parentOffset + parent.mOffsets[k]; + const PxAgain again = processBucket( nbInBucket, core.mSortedWorldBoxes, core.mSortedObjects, + offset, core.mSortedNb, + rayOrig, rayDir, currentDist, +#ifdef CAN_USE_MOVEMASK + &rayParams, +#else + test, inflate, +#endif + pcb, + rayMinLimitInt, rayMaxLimitInt, + sortAxis); + if(!again) + return false; + } + } + } + } + } + } + + maxDist = currentDist; + return true; +} + +PxAgain BucketPrunerCore::raycast(const PxVec3& origin, const PxVec3& unitDir, PxReal& inOutDistance, PrunerCallback& pcb) const +{ + return ::stab<0>(*this, pcb, origin, unitDir, inOutDistance, PxVec3(0.0f)); +} + +PxAgain BucketPrunerCore::sweep(const ShapeData& queryVolume, const PxVec3& unitDir, PxReal& inOutDistance, PrunerCallback& pcb) const +{ + const PxVec3 extents = queryVolume.getPrunerInflatedWorldAABB().getExtents(); + return
// NOTE(review): this chunk completes sweep() (stab<1>, inflated AABB), then defines the
// overlap-flavoured processBucket() and the start of BucketPrunerOverlapTraversal.
// Garbling to restore before compiling: both 'template' headers lost their parameter
// lists (originally template<int doAssert, class Test> and template<class Test, int isPrecise>),
// 'const BucketBox* PX_RESTRICT' / 'PrunerPayload* PX_RESTRICT' casts kept their text but
// the early-out test 'currentBox.mData1maxLimitInt' was originally two comparisons
// (mData1 < minLimitInt continue / mData0 > maxLimitInt return) whose operators were eaten,
// and the reinterpret_cast calls lost <const PxU32*>. The per-object test invokes the
// PrunerCallback with dist = -1 (overlaps carry no distance).
::stab<1>(*this, pcb, queryVolume.getPrunerInflatedWorldAABB().getCenter(), unitDir, inOutDistance, extents); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +template +static PX_FORCE_INLINE bool processBucket( PxU32 nb, const BucketBox* PX_RESTRICT baseBoxes, PrunerPayload* PX_RESTRICT baseObjects, + PxU32 offset, PxU32 totalAllocated, + const Test& test, PrunerCallback& pcb, + PxU32 minLimitInt, PxU32 maxLimitInt) +{ + PX_UNUSED(totalAllocated); + + const BucketBox* PX_RESTRICT boxes = baseBoxes + offset; + PrunerPayload* PX_RESTRICT objects = baseObjects + offset; + + while(nb--) + { + const BucketBox& currentBox = *boxes++; + PrunerPayload* currentObject = objects++; + + if(currentBox.mData1maxLimitInt) + { + if(doAssert) + PX_ASSERT(!test(currentBox)); + return true; + } + + if(test(currentBox)) + { + PxReal dist = -1.0f; // no distance for overlaps + if(!pcb.invoke(dist, *currentObject)) + return false; + } + } + return true; +} + +template +class BucketPrunerOverlapTraversal +{ +public: + PX_FORCE_INLINE BucketPrunerOverlapTraversal() {} + + /*PX_FORCE_INLINE*/ bool operator()(const BucketPrunerCore& core, const Test& test, PrunerCallback& pcb, const PxBounds3& cullBox) const + { + for(PxU32 i=0;i(&boxMinLimit); + const PxU32* binaryMaxLimit = reinterpret_cast(&boxMaxLimit); + const PxU32 rayMinLimitInt = encodeFloat(binaryMinLimit[0]); + const PxU32 rayMaxLimitInt = encodeFloat(binaryMaxLimit[0]); + + for(PxU32 i=0;i<5;i++) + { + if(core.mLevel1.mCounters[i] && test(core.mLevel1.mBucketBox[i])) + { + for(PxU32 j=0;j<5;j++) + { + if(core.mLevel2[i].mCounters[j] && test(core.mLevel2[i].mBucketBox[j])) + { + for(PxU32 k=0;k<5;k++) + { + const PxU32 nbInBucket = core.mLevel3[i][j].mCounters[k]; + if(nbInBucket && test(core.mLevel3[i][j].mBucketBox[k])) + { + const PxU32 offset = core.mLevel1.mOffsets[i]
// NOTE(review): closes the BucketPrunerOverlapTraversal 5x5x5 loops (processBucket on each
// surviving leaf), then BAllTrue3_R (movemask helper: true iff the low three SIMD lanes are
// all-set) and the SIMD sphere-vs-AABB test (closest-point-on-box, squared-distance vs
// squared-radius). Garbling: 'processBucket(nbInBucket, ...' lost its explicit template
// arguments, and the chunk is cut mid-declaration at both edges ('const ' dangles at the end,
// completed by 'Vec3V mCenter;' in the next chunk). Diff '+' prefixes fused throughout.
+ core.mLevel2[i].mOffsets[j] + core.mLevel3[i][j].mOffsets[k]; + if(!processBucket(nbInBucket, core.mSortedWorldBoxes, core.mSortedObjects, + offset, core.mSortedNb, test, pcb, rayMinLimitInt, rayMaxLimitInt)) + return false; + } + } + } + } + } + } + return true; + } +}; + +/////////////////////////////////////////////////////////////////////////////// + +#ifdef CAN_USE_MOVEMASK +PX_FORCE_INLINE PxU32 BAllTrue3_R(const BoolV a) +{ + const PxI32 moveMask = _mm_movemask_ps(a); + return PxU32((moveMask & 0x7) == (0x7)); +} +#endif + +#ifdef USE_SIMD +struct SphereAABBTest_SIMD +{ + PX_FORCE_INLINE SphereAABBTest_SIMD(const Gu::Sphere& sphere) : + #ifdef CAN_USE_MOVEMASK + mCenter (V4LoadU(&sphere.center.x)), + #else + mCenter (V3LoadU(sphere.center)), + #endif + mRadius2(FLoad(sphere.radius * sphere.radius)) + {} + + PX_FORCE_INLINE Ps::IntBool operator()(const BucketBox& box) const + { + #ifdef CAN_USE_MOVEMASK + const Vec4V boxCenter = AlignedLoad(&box.mCenter.x); + const Vec4V boxExtents = AlignedLoad(&box.mExtents.x); + // + const Vec4V offset = V4Sub(mCenter, boxCenter); + const Vec4V closest = V4Clamp(offset, V4Neg(boxExtents), boxExtents); + const Vec4V d = V4Sub(offset, closest); + + const FloatV dot = V4Dot3(d,d); + return Ps::IntBool(BAllTrue3_R(FIsGrtrOrEq(mRadius2, dot))); + #else + const Vec3V boxCenter = V3LoadU(box.mCenter); + const Vec3V boxExtents = V3LoadU(box.mExtents); + // + const Vec3V offset = V3Sub(mCenter, boxCenter); + const Vec3V closest = V3Clamp(offset, V3Neg(boxExtents), boxExtents); + const Vec3V d = V3Sub(offset, closest); + return Ps::IntBool(BAllEqTTTT(FIsGrtrOrEq(mRadius2, V3Dot(d, d)))); + #endif + } + + PX_FORCE_INLINE Ps::IntBool operator()(const PxBounds3& bounds) const + { + BucketBox tmp; + tmp.mCenter = bounds.getCenter(); + tmp.mExtents = bounds.getExtents(); + return (*this)(tmp); + } + +private: + SphereAABBTest_SIMD& operator=(const SphereAABBTest_SIMD&); + #ifdef CAN_USE_MOVEMASK + const Vec4V mCenter; + #else + const
// NOTE(review): completes SphereAABBTest_SIMD's member list, then the scalar fallback
// sphere-vs-AABB test, the USE_SIMD typedef selection, and the start of the AABB-AABB test.
// Garbling to restore: in the scalar per-axis loop, 'if(mCenter[i]maximum[i])' originally
// read 'if(mCenter[i] < minimum[i]) { s = mCenter[i]-minimum[i]; d += s*s; } else
// if(mCenter[i] > maximum[i])' — the text between the two eaten angle brackets is missing.
// The AABB test deliberately avoids PxBounds3::intersects() because isValid() asserts on
// the pruner's empty (inside-out) boxes, per the surviving comment.
Vec3V mCenter; + #endif + const FloatV mRadius2; +}; +#else +struct SphereAABBTest_Scalar +{ + PX_FORCE_INLINE SphereAABBTest_Scalar(const Gu::Sphere& sphere) : + mCenter (sphere.center), + mRadius2(sphere.radius * sphere.radius) + {} + + PX_FORCE_INLINE Ps::IntBool operator()(const BucketBox& box) const + { + const PxVec3 minimum = box.getMin(); + const PxVec3 maximum = box.getMax(); + + float d = 0.0f; + + //find the square of the distance + //from the sphere to the box + for(PxU32 i=0;i<3;i++) + { + if(mCenter[i]maximum[i]) + { + const float s = mCenter[i] - maximum[i]; + d += s*s; + } + } + return d <= mRadius2; + } + +private: + SphereAABBTest_Scalar& operator=(const SphereAABBTest_Scalar&); + const PxVec3 mCenter; + float mRadius2; +}; +#endif + +#ifdef USE_SIMD +typedef SphereAABBTest_SIMD BucketPrunerSphereAABBTest; +#else +typedef SphereAABBTest_Scalar BucketPrunerSphereAABBTest; +#endif + +/////////////////////////////////////////////////////////////////////////////// + +struct BucketPrunerAABBAABBTest +{ + PX_FORCE_INLINE BucketPrunerAABBAABBTest(const PxBounds3& queryBox) : mBox(queryBox) {} + + PX_FORCE_INLINE Ps::IntBool operator()(const BucketBox& box) const + { + // PT: we don't use PxBounds3::intersects() because isValid() asserts on our empty boxes! + const PxVec3 bucketMin = box.getMin(); + const PxVec3 bucketMax = box.getMax(); + return !(mBox.minimum.x > bucketMax.x || bucketMin.x > mBox.maximum.x || + mBox.minimum.y > bucketMax.y || bucketMin.y > mBox.maximum.y || + mBox.minimum.z > bucketMax.z || bucketMin.z > mBox.maximum.z); + } + + PX_FORCE_INLINE Ps::IntBool operator()(const PxBounds3& bounds) const + { + // PT: we don't use PxBounds3::intersects() because isValid() asserts on our empty boxes!
// NOTE(review): completes BucketPrunerAABBAABBTest's PxBounds3 overload, keeps a
// commented-out SIMD AABB-AABB variant, then begins the SIMD OBB-vs-AABB separating-axis
// test: the constructor stores the transposed rotation (mRT), its epsilon-padded absolute
// value (mART — epsilon guards near-parallel axes, the classic RAPID trick), and the OBB
// extents projected onto the coordinate axes (mBB_xyz). The class-III edge-cross setup is
// intentionally commented out in the original. Chunk is cut mid-statement at the end
// ('const Vec3V extentsV = ' completes in the next chunk). Diff '+' prefixes fused throughout.
+ const PxVec3& bucketMin = bounds.minimum; + const PxVec3& bucketMax = bounds.maximum; + return !(mBox.minimum.x > bucketMax.x || bucketMin.x > mBox.maximum.x || + mBox.minimum.y > bucketMax.y || bucketMin.y > mBox.maximum.y || + mBox.minimum.z > bucketMax.z || bucketMin.z > mBox.maximum.z); + } +private: + BucketPrunerAABBAABBTest& operator=(const BucketPrunerAABBAABBTest&); + const PxBounds3 mBox; +}; + +/*struct BucketPrunerAABBAABBTest_SIMD +{ + PX_FORCE_INLINE BucketPrunerAABBAABBTest_SIMD(const PxBounds3& b) + : mCenter(V3LoadU(b.getCenter())) + , mExtents(V3LoadU(b.getExtents())) + {} + + PX_FORCE_INLINE Ps::IntBool operator()(const BucketBox& box) const + { + return V3AllGrtrOrEq(V3Add(mExtents, AlignedLoad(&box.mExtents.x)), V3Abs(V3Sub(AlignedLoad(&box.mCenter.x), mCenter))); + } +private: + BucketPrunerAABBAABBTest_SIMD& operator=(const BucketPrunerAABBAABBTest_SIMD&); + const Vec3V mCenter, mExtents; +};*/ + +/////////////////////////////////////////////////////////////////////////////// + +#ifdef USE_SIMD +struct OBBAABBTest_SIMD +{ + OBBAABBTest_SIMD(const PxMat33& rotation, const PxVec3& translation, const PxVec3& extents) + { + const Vec3V eps = V3Load(1e-6f); + + mT = V3LoadU(translation); + mExtents = V3LoadU(extents); + + // storing the transpose matrices yields a simpler SIMD test + mRT = Mat33V_From_PxMat33(rotation.getTranspose()); + mART = Mat33V(V3Add(V3Abs(mRT.col0), eps), V3Add(V3Abs(mRT.col1), eps), V3Add(V3Abs(mRT.col2), eps)); + mBB_xyz = M33TrnspsMulV3(mART, mExtents); + +/* if(fullTest) + { + const Vec3V eYZX = V3PermYZX(mExtents), eZXY = V3PermZXY(mExtents); + + mBB_123 = V3MulAdd(eYZX, V3PermZXY(mART.col0), V3Mul(eZXY, V3PermYZX(mART.col0))); + mBB_456 = V3MulAdd(eYZX, V3PermZXY(mART.col1), V3Mul(eZXY, V3PermYZX(mART.col1))); + mBB_789 = V3MulAdd(eYZX, V3PermZXY(mART.col2), V3Mul(eZXY, V3PermYZX(mART.col2))); + }*/ + } + + PX_FORCE_INLINE Ps::IntBool operator()(const BucketBox& box) const + { + const Vec3V extentsV =
// NOTE(review): body of the SIMD OBB-vs-AABB operator(): class-I axes (AABB axes) then
// class-II axes (OBB axes) of the separating-axis test; class III (9 edge-cross axes) is
// commented out in the original because class I/II early-outs cover almost all cases
// (per the surviving comment, profiling TODO). Ends with the live member declarations;
// the commented mBB_123/456/789 members continue in the next chunk. Content preserved
// byte-for-byte, including fused diff '+' prefixes.
V3LoadU(box.mExtents); + + const Vec3V t = V3Sub(mT, V3LoadU(box.mCenter)); + + // class I - axes of AABB + if(V3OutOfBounds(t, V3Add(extentsV, mBB_xyz))) + return Ps::IntFalse; + + const Vec3V rX = mRT.col0, rY = mRT.col1, rZ = mRT.col2; + const Vec3V arX = mART.col0, arY = mART.col1, arZ = mART.col2; + + const FloatV eX = V3GetX(extentsV), eY = V3GetY(extentsV), eZ = V3GetZ(extentsV); + const FloatV tX = V3GetX(t), tY = V3GetY(t), tZ = V3GetZ(t); + + // class II - axes of OBB + { + const Vec3V v = V3ScaleAdd(rZ, tZ, V3ScaleAdd(rY, tY, V3Scale(rX, tX))); + const Vec3V v2 = V3ScaleAdd(arZ, eZ, V3ScaleAdd(arY, eY, V3ScaleAdd(arX, eX, mExtents))); + if(V3OutOfBounds(v, v2)) + return Ps::IntFalse; + } + +// if(!fullTest) + return Ps::IntTrue; + +/* // class III - edge cross products. Almost all OBB tests early-out with type I or type II, + // so early-outs here probably aren't useful (TODO: profile) + + const Vec3V va = V3NegScaleSub(rZ, tY, V3Scale(rY, tZ)); + const Vec3V va2 = V3ScaleAdd(arY, eZ, V3ScaleAdd(arZ, eY, mBB_123)); + const BoolV ba = BOr(V3IsGrtr(va, va2), V3IsGrtr(V3Neg(va2), va)); + + const Vec3V vb = V3NegScaleSub(rX, tZ, V3Scale(rZ, tX)); + const Vec3V vb2 = V3ScaleAdd(arX, eZ, V3ScaleAdd(arZ, eX, mBB_456)); + const BoolV bb = BOr(V3IsGrtr(vb, vb2), V3IsGrtr(V3Neg(vb2), vb)); + + const Vec3V vc = V3NegScaleSub(rY, tX, V3Scale(rX, tY)); + const Vec3V vc2 = V3ScaleAdd(arX, eY, V3ScaleAdd(arY, eX, mBB_789)); + const BoolV bc = BOr(V3IsGrtr(vc, vc2), V3IsGrtr(V3Neg(vc2), vc)); + + return BAllEq(BOr(ba, BOr(bb,bc)), BFFFF());*/ + } + + PX_FORCE_INLINE Ps::IntBool operator()(const PxBounds3& bounds) const + { + BucketBox tmp; + tmp.mCenter = bounds.getCenter(); + tmp.mExtents = bounds.getExtents(); + return (*this)(tmp); + } + + Vec3V mExtents; // extents of OBB + Vec3V mT; // translation of OBB + Mat33V mRT; // transpose of rotation matrix of OBB + Mat33V mART; // transpose of mRT, padded by epsilon + Vec3V mBB_xyz; // extents of OBB along coordinate axes
// NOTE(review): closes OBBAABBTest_SIMD (commented class-III members), then the scalar
// OBB-vs-AABB separating-axis test: constructor precomputes the epsilon-padded absolute
// rotation (RAPID-style, guards near-parallel axes) and mBB_xyz (Erwin de Vries'
// precomputation, per original comments); operator() runs class I (AABB axes), class II
// (OBB axes), and a dead-coded 'if(0)' class III (9 edge-cross axes, first three shown
// here, rest in next chunk). Content preserved byte-for-byte, including fused diff '+'
// prefixes; cut mid-class at the end.
+ +/* Vec3V mBB_123; // projections of extents onto edge-cross axes + Vec3V mBB_456; + Vec3V mBB_789;*/ +}; +#else +struct OBBAABBTest_Scalar +{ + OBBAABBTest_Scalar(const PxMat33& rotation, const PxVec3& translation, const PxVec3& extents) + { + mR = rotation; + mT = translation; + mExtents = extents; + + const PxVec3 eps(1e-6f); + mAR = PxMat33(mR[0].abs() + eps, mR[1].abs() + eps, mR[2].abs() + eps); // Epsilon prevents floating-point inaccuracies (strategy borrowed from RAPID) + mBB_xyz = mAR.transform(mExtents); // Precompute box-box data - Courtesy of Erwin de Vries + +/* PxReal ex = mExtents.x, ey = mExtents.y, ez = mExtents.z; + mBB_1 = ey*mAR[2].x + ez*mAR[1].x; mBB_2 = ez*mAR[0].x + ex*mAR[2].x; mBB_3 = ex*mAR[1].x + ey*mAR[0].x; + mBB_4 = ey*mAR[2].y + ez*mAR[1].y; mBB_5 = ez*mAR[0].y + ex*mAR[2].y; mBB_6 = ex*mAR[1].y + ey*mAR[0].y; + mBB_7 = ey*mAR[2].z + ez*mAR[1].z; mBB_8 = ez*mAR[0].z + ex*mAR[2].z; mBB_9 = ex*mAR[1].z + ey*mAR[0].z;*/ + } + + PX_FORCE_INLINE Ps::IntBool operator()(const BucketBox& box) const + { + const PxVec3& c = box.mCenter; + const PxVec3& e = box.mExtents; + + const PxVec3 T = mT - c; + // Class I : A's basis vectors + if(PxAbs(T.x) > e.x + mBB_xyz.x) return Ps::IntFalse; + if(PxAbs(T.y) > e.y + mBB_xyz.y) return Ps::IntFalse; + if(PxAbs(T.z) > e.z + mBB_xyz.z) return Ps::IntFalse; + + // Class II : B's basis vectors + if(PxAbs(T.dot(mR[0])) > e.dot(mAR[0]) + mExtents.x) return Ps::IntFalse; + if(PxAbs(T.dot(mR[1])) > e.dot(mAR[1]) + mExtents.y) return Ps::IntFalse; + if(PxAbs(T.dot(mR[2])) > e.dot(mAR[2]) + mExtents.z) return Ps::IntFalse; + + // Class III : 9 cross products + if(0) + { + if(PxAbs(T.z*mR[0].y - T.y*mR[0].z) > e.y*mAR[0].z + e.z*mAR[0].y + mBB_1) return Ps::IntFalse; // L = A0 x B0 + if(PxAbs(T.z*mR[1].y - T.y*mR[1].z) > e.y*mAR[1].z + e.z*mAR[1].y + mBB_2) return Ps::IntFalse; // L = A0 x B1 + if(PxAbs(T.z*mR[2].y - T.y*mR[2].z) > e.y*mAR[2].z + e.z*mAR[2].y + mBB_3) return Ps::IntFalse; // L = A0 x B2 + +
// NOTE(review): finishes OBBAABBTest_Scalar (remaining 6 edge-cross axes, still inside
// the dead 'if(0)'), selects the USE_SIMD typedef, then begins BucketPrunerCore::overlap():
// dispatch on query geometry type — OBB vs axis-aligned box path shown here, capsule path
// starts at the chunk edge. Garbling to restore: every
// 'const BucketPrunerOverlapTraversal overlap;' lost its template argument list
// (originally BucketPrunerOverlapTraversal<BucketPrunerOBBAABBTest> etc.) to the
// angle-bracket stripping. PX_ASSERT(!mDirty) confirms overlap requires a committed
// (non-dirty) core. Diff '+' prefixes fused throughout.
if(PxAbs(T.x*mR[0].z - T.z*mR[0].x) > e.x*mAR[0].z + e.z*mAR[0].x + mBB_4) return Ps::IntFalse; // L = A1 x B0 + if(PxAbs(T.x*mR[1].z - T.z*mR[1].x) > e.x*mAR[1].z + e.z*mAR[1].x + mBB_5) return Ps::IntFalse; // L = A1 x B1 + if(PxAbs(T.x*mR[2].z - T.z*mR[2].x) > e.x*mAR[2].z + e.z*mAR[2].x + mBB_6) return Ps::IntFalse; // L = A1 x B2 + + if(PxAbs(T.y*mR[0].x - T.x*mR[0].y) > e.x*mAR[0].y + e.y*mAR[0].x + mBB_7) return Ps::IntFalse; // L = A2 x B0 + if(PxAbs(T.y*mR[1].x - T.x*mR[1].y) > e.x*mAR[1].y + e.y*mAR[1].x + mBB_8) return Ps::IntFalse; // L = A2 x B1 + if(PxAbs(T.y*mR[2].x - T.x*mR[2].y) > e.x*mAR[2].y + e.y*mAR[2].x + mBB_9) return Ps::IntFalse; // L = A2 x B2 + } + return Ps::IntTrue; + } + +private: + PxMat33 mR; // rotation matrix + PxMat33 mAR; // absolute rotation matrix + PxVec3 mT; // translation from obb space to model space + PxVec3 mExtents; + + PxVec3 mBB_xyz; + + float mBB_1, mBB_2, mBB_3; + float mBB_4, mBB_5, mBB_6; + float mBB_7, mBB_8, mBB_9; +}; +#endif + +#ifdef USE_SIMD +typedef OBBAABBTest_SIMD BucketPrunerOBBAABBTest; +#else +typedef OBBAABBTest_Scalar BucketPrunerOBBAABBTest; +#endif + +/////////////////////////////////////////////////////////////////////////////// + +PxAgain BucketPrunerCore::overlap(const ShapeData& queryVolume, PrunerCallback& pcb) const +{ + PX_ASSERT(!mDirty); + PxAgain again = true; + + const PxBounds3& cullBox = queryVolume.getPrunerInflatedWorldAABB(); + + switch(queryVolume.getType()) + { + case PxGeometryType::eBOX: + { + if(queryVolume.isOBB()) + { + const BucketPrunerOverlapTraversal overlap; + again = overlap(*this, + BucketPrunerOBBAABBTest( + queryVolume.getPrunerWorldRot33(), queryVolume.getPrunerWorldPos(), + queryVolume.getPrunerBoxGeomExtentsInflated()), + pcb, cullBox); + } + else + { + const BucketPrunerOverlapTraversal overlap; + again = overlap(*this, BucketPrunerAABBAABBTest(cullBox), pcb, cullBox); + } + } + break; + + case PxGeometryType::eCAPSULE: + { + const BucketPrunerOverlapTraversal
// NOTE(review): the most heavily garbled chunk. It completes overlap() (capsule/sphere/
// convex cases, unsupported types assert), then shiftOrigin() — whose loop bodies were
// almost entirely eaten by the angle-bracket stripping ('for(PxU32 i=0;i the pair is
// persistent' fuses the shift loops straight into the tail of the hash-map findPair) —
// and the internal findPair(payload, hashValue) overload, which walks the mHashTable/mNext
// chains and returns NULL when the payload is absent. The BucketPrunerOverlapTraversal
// instantiations also lost their template arguments. Do not attempt to compile this chunk
// as-is; restore shiftOrigin and the public findPair from the pristine file.
overlap; + again = overlap(*this, + BucketPrunerOBBAABBTest( + queryVolume.getPrunerWorldRot33(), queryVolume.getPrunerWorldPos(), + queryVolume.getPrunerBoxGeomExtentsInflated()), + pcb, cullBox); + } + break; + + case PxGeometryType::eSPHERE: + { + const Sphere& sphere = queryVolume.getGuSphere(); + const PxVec3 sphereExtents(sphere.radius); + const BucketPrunerOverlapTraversal overlap; + again = overlap(*this, BucketPrunerSphereAABBTest(sphere), pcb, cullBox); + } + break; + + case PxGeometryType::eCONVEXMESH: + { + const BucketPrunerOverlapTraversal overlap; + again = overlap(*this, + BucketPrunerOBBAABBTest( + queryVolume.getPrunerWorldRot33(), queryVolume.getPrunerWorldPos(), + queryVolume.getPrunerBoxGeomExtentsInflated()), + pcb, cullBox); + } + break; + + case PxGeometryType::ePLANE: + case PxGeometryType::eTRIANGLEMESH: + case PxGeometryType::eHEIGHTFIELD: + case PxGeometryType::eGEOMETRY_COUNT: + case PxGeometryType::eINVALID: + PX_ALWAYS_ASSERT_MESSAGE("unsupported overlap query volume geometry type"); + } + return again; +} + +/////////////////////////////////////////////////////////////////////////////// + +void BucketPrunerCore::shiftOrigin(const PxVec3& shift) +{ + for(PxU32 i=0;i the pair is persistent + return &activePairs[offset]; +} + +// Internal version saving hash computation +PX_FORCE_INLINE BucketPrunerPair* BucketPrunerMap::findPair(const PrunerPayload& payload, PxU32 hashValue) const +{ + if(!mHashTable) + return NULL; // Nothing has been allocated yet + + BucketPrunerPair* PX_RESTRICT activePairs = mActivePairs; + const PxU32* PX_RESTRICT next = mNext; + + // Look for it in the table + PxU32 offset = mHashTable[hashValue]; + while(offset!=INVALID_ID && differentPair(activePairs[offset], payload)) + { + offset = next[offset]; // Better to have a separate array for this + } + if(offset==INVALID_ID) + return NULL; + PX_ASSERT(offset the pair is persistent + return &activePairs[offset]; +} +
// NOTE(review): BucketPrunerMap::addPair — open-addressed-by-chaining hash map insert:
// returns the existing pair if the payload is already present (asserting the stored
// coreIndex/timeStamp match), otherwise grows the table to the next power of two when
// full (recomputing the hash against the new mask — the original flags this as possibly
// redundant), appends the pair, and links it at the head of its bucket chain via mNext.
// Then removePairInternal begins: unlinks pairIndex from its hash chain (fix predecessor's
// mNext or the bucket head), poisons the slot under PX_DEBUG, and starts the move-last-
// pair-into-hole compaction that continues in the next chunk. 'reinterpret_cast' template
// arguments elsewhere were stripped; this chunk's tokens survive intact aside from fused
// diff '+' prefixes and the truncated 'const BucketPrunerPair* ' at the cut.
+/////////////////////////////////////////////////////////////////////////////// + +BucketPrunerPair* BucketPrunerMap::addPair(const PrunerPayload& payload, PxU32 coreIndex, PxU32 timeStamp) +{ + PxU32 hashValue = hash(payload) & mMask; + + { + BucketPrunerPair* PX_RESTRICT p = findPair(payload, hashValue); + if(p) + { + PX_ASSERT(p->mCoreIndex==coreIndex); + PX_ASSERT(p->mTimeStamp==timeStamp); + return p; // Persistent pair + } + } + + // This is a new pair + if(mNbActivePairs >= mHashSize) + { + // Get more entries + mHashSize = Ps::nextPowerOfTwo(mNbActivePairs+1); + mMask = mHashSize-1; + + reallocPairs(); + + // Recompute hash value with new hash size + hashValue = hash(payload) & mMask; // ### redundant hash computation here? + } + + BucketPrunerPair* PX_RESTRICT p = &mActivePairs[mNbActivePairs]; + p->mPayload = payload; + p->mCoreIndex = coreIndex; + p->mTimeStamp = timeStamp; + mNext[mNbActivePairs] = mHashTable[hashValue]; + mHashTable[hashValue] = mNbActivePairs++; + return p; +} + +/////////////////////////////////////////////////////////////////////////////// + +void BucketPrunerMap::removePairInternal(const PrunerPayload& /*payload*/, PxU32 hashValue, PxU32 pairIndex) +{ + // Walk the hash table to fix mNext + { + PxU32 offset = mHashTable[hashValue]; + PX_ASSERT(offset!=INVALID_ID); + + PxU32 previous=INVALID_ID; + while(offset!=pairIndex) + { + previous = offset; + offset = mNext[offset]; + } + + // Let us go/jump us + if(previous!=INVALID_ID) + { + PX_ASSERT(mNext[previous]==pairIndex); + mNext[previous] = mNext[pairIndex]; + } + // else we were the first + else mHashTable[hashValue] = mNext[pairIndex]; + // we're now free to reuse mNext[pairIndex] without breaking the list + } +#if PX_DEBUG + mNext[pairIndex]=INVALID_ID; +#endif + // Invalidate entry + + // Fill holes + if(1) + { + // 1) Remove last pair + const PxU32 lastPairIndex = mNbActivePairs-1; + if(lastPairIndex==pairIndex) + { + mNbActivePairs--; + } + else + { + const BucketPrunerPair*
// NOTE(review): completes removePairInternal's hole-filling — the last active pair is
// unlinked from its own hash chain, copied into the freed slot, and re-linked at the
// head of its chain, keeping mActivePairs dense. Then removePair(): look up by payload,
// report coreIndex/timeStamp to the caller, remove, and opportunistically shrinkMemory(),
// which resizes the table down to nextPowerOfTwo(mNbActivePairs) unless a reservation
// (mReservedMemory) keeps it large. Cut mid-assignment at the end ('mMask = ' completes
// in the next chunk). Tokens preserved byte-for-byte aside from fused diff '+' prefixes.
last = &mActivePairs[lastPairIndex]; + const PxU32 lastHashValue = hash(last->mPayload) & mMask; + + // Walk the hash table to fix mNext + PxU32 offset = mHashTable[lastHashValue]; + PX_ASSERT(offset!=INVALID_ID); + + PxU32 previous=INVALID_ID; + while(offset!=lastPairIndex) + { + previous = offset; + offset = mNext[offset]; + } + + // Let us go/jump us + if(previous!=INVALID_ID) + { + PX_ASSERT(mNext[previous]==lastPairIndex); + mNext[previous] = mNext[lastPairIndex]; + } + // else we were the first + else mHashTable[lastHashValue] = mNext[lastPairIndex]; + // we're now free to reuse mNext[lastPairIndex] without breaking the list + +#if PX_DEBUG + mNext[lastPairIndex]=INVALID_ID; +#endif + + // Don't invalidate entry since we're going to shrink the array + + // 2) Re-insert in free slot + mActivePairs[pairIndex] = mActivePairs[lastPairIndex]; +#if PX_DEBUG + PX_ASSERT(mNext[pairIndex]==INVALID_ID); +#endif + mNext[pairIndex] = mHashTable[lastHashValue]; + mHashTable[lastHashValue] = pairIndex; + + mNbActivePairs--; + } + } +} + +/////////////////////////////////////////////////////////////////////////////// + +bool BucketPrunerMap::removePair(const PrunerPayload& payload, PxU32& coreIndex, PxU32& timeStamp) +{ + const PxU32 hashValue = hash(payload) & mMask; + const BucketPrunerPair* p = findPair(payload, hashValue); + if(!p) + return false; + PX_ASSERT(p->mPayload==payload); + + coreIndex = p->mCoreIndex; + timeStamp = p->mTimeStamp; + + removePairInternal(payload, hashValue, getPairIndex(p)); + + shrinkMemory(); + return true; +} + +/////////////////////////////////////////////////////////////////////////////// + +void BucketPrunerMap::shrinkMemory() +{ + // Check correct memory against actually used memory + const PxU32 correctHashSize = Ps::nextPowerOfTwo(mNbActivePairs); + if(mHashSize==correctHashSize) + return; + + if(mReservedMemory && correctHashSize < mReservedMemory) + return; + + // Reduce memory used + mHashSize = correctHashSize; + mMask =
// NOTE(review): closes shrinkMemory() (mMask = mHashSize-1; reallocPairs()), defines the
// storeDwords fill helper, then reallocPairs(): frees and reallocates the hash table,
// fills it with INVALID_ID, allocates new pair/next arrays sized to mHashSize, and copies
// the old active pairs before rebuilding the chains. Garbling to restore: every
// 'reinterpret_cast(MBP_ALLOC(...))' lost its <PxU32*> / <BucketPrunerPair*> template
// argument, and the chunk is truncated mid-rebuild loop ('for(PxU32 i=0;i' — the rehash
// loop, the frees of the old arrays, and the pointer swaps lie past the end of this view).
mHashSize-1; + + reallocPairs(); +} + +/////////////////////////////////////////////////////////////////////////////// + + static PX_FORCE_INLINE void storeDwords(PxU32* dest, PxU32 nb, PxU32 value) + { + while(nb--) + *dest++ = value; + } + +void BucketPrunerMap::reallocPairs() +{ + MBP_FREE(mHashTable); + mHashTable = reinterpret_cast(MBP_ALLOC(mHashSize*sizeof(PxU32))); + storeDwords(mHashTable, mHashSize, INVALID_ID); + + // Get some bytes for new entries + BucketPrunerPair* newPairs = reinterpret_cast(MBP_ALLOC(mHashSize * sizeof(BucketPrunerPair))); + PX_ASSERT(newPairs); + + PxU32* newNext = reinterpret_cast(MBP_ALLOC(mHashSize * sizeof(PxU32))); + PX_ASSERT(newNext); + + // Copy old data if needed + if(mNbActivePairs) + PxMemCopy(newPairs, mActivePairs, mNbActivePairs*sizeof(BucketPrunerPair)); + // ### check it's actually needed... probably only for pairs whose hash value was cut by the and + // yeah, since hash(id0, id1) is a constant + // However it might not be needed to recompute them => only less efficient but still ok + for(PxU32 i=0;i