diff options
| author | git perforce import user <a@b> | 2016-10-25 12:29:14 -0600 |
|---|---|---|
| committer | Sheikh Dawood Abdul Ajees <Sheikh Dawood Abdul Ajees> | 2016-10-25 18:56:37 -0500 |
| commit | 3dfe2108cfab31ba3ee5527e217d0d8e99a51162 (patch) | |
| tree | fa6485c169e50d7415a651bf838f5bcd0fd3bfbd /PhysX_3.4/Source/GeomUtils/src/mesh/GuRTreeQueries.cpp | |
| download | physx-3.4-3dfe2108cfab31ba3ee5527e217d0d8e99a51162.tar.xz physx-3.4-3dfe2108cfab31ba3ee5527e217d0d8e99a51162.zip | |
Initial commit:
PhysX 3.4.0 Update @ 21294896
APEX 1.4.0 Update @ 21275617
[CL 21300167]
Diffstat (limited to 'PhysX_3.4/Source/GeomUtils/src/mesh/GuRTreeQueries.cpp')
| -rw-r--r-- | PhysX_3.4/Source/GeomUtils/src/mesh/GuRTreeQueries.cpp | 581 |
1 files changed, 581 insertions, 0 deletions
diff --git a/PhysX_3.4/Source/GeomUtils/src/mesh/GuRTreeQueries.cpp b/PhysX_3.4/Source/GeomUtils/src/mesh/GuRTreeQueries.cpp new file mode 100644 index 00000000..9d7bd57a --- /dev/null +++ b/PhysX_3.4/Source/GeomUtils/src/mesh/GuRTreeQueries.cpp @@ -0,0 +1,581 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. 
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.

/*
General notes:

	rtree depth-first traversal looks like this:
	push top level page onto stack

	pop page from stack
	for each node in page
		if node overlaps with testrect
			push node's subpage

	we want to efficiently keep track of current stack level to know if the current page is a leaf or not
	(since we don't store a flag with the page due to no space, we can't determine it just by looking at current page)
	since we traverse depth first, the levels for nodes on the stack look like this:
	l0 l0 l1 l2 l2 l3 l3 l3 l4

	we can encode this as an array of 4 bits per level count into a 32-bit integer
	to simplify the code->level computation we also keep track of current level by incrementing the level whenever any subpages
	from current test page are pushed onto the stack
	when we pop a page off the stack we use this encoding to determine if we should decrement the stack level
*/

#include "foundation/PxBounds3.h"
#include "GuRTree.h"
#include "PsIntrinsics.h"
#include "GuBox.h"
#include "PsVecMath.h"
#include "PxQueryReport.h" // for PxAgain
#include "PsBitUtils.h"

//#define VERIFY_RTREE
#ifdef VERIFY_RTREE
#include "GuIntersectionRayBox.h"
#include "GuIntersectionBoxBox.h"
#include "stdio.h"
#endif

using namespace physx;
using namespace physx::shdfnd;
using namespace Ps::aos;

namespace physx
{
namespace Gu {

using namespace Ps::aos;

#define v_absm(a) V4Andc(a, signMask)
#define V4FromF32A(x) V4LoadA(x)
#define PxF32FV(x) FStore(x)
#define CAST_U8(a) reinterpret_cast<PxU8*>(a)

/////////////////////////////////////////////////////////////////////////
// Depth-first traversal reporting every leaf whose page AABB overlaps the
// query box [boxMin, boxMax]. Leaves are delivered one at a time through
// callback->processResults(1, &ptr); traversal stops early if the callback
// returns false. maxResults/resultsPtr are unused in this implementation.
void RTree::traverseAABB(const PxVec3& boxMin, const PxVec3& boxMax, const PxU32 maxResults, PxU32* resultsPtr, Callback* callback) const
{
	PX_UNUSED(resultsPtr);

	PX_ASSERT(callback);
	PX_ASSERT(maxResults >= mPageSize);
	PX_UNUSED(maxResults);

	const PxU32 maxStack = 128;
	PxU32 stack1[maxStack];
	// NOTE(review): stack points one past stack1, leaving stack1[0] as an
	// unused guard slot; the pop loop below terminates on stackPtr > stack.
	// TODO confirm intent — traverseOBB uses its stack without this offset.
	PxU32* stack = stack1+1;

	PX_ASSERT(mPages);
	PX_ASSERT((uintptr_t(mPages) & 127) == 0);
	PX_ASSERT((uintptr_t(this) & 15) == 0);

	// conservatively quantize the input box
	Vec4V nqMin = Vec4V_From_PxVec3_WUndefined(boxMin);
	Vec4V nqMax = Vec4V_From_PxVec3_WUndefined(boxMax);

	Vec4V nqMinx4 = V4SplatElement<0>(nqMin);
	Vec4V nqMiny4 = V4SplatElement<1>(nqMin);
	Vec4V nqMinz4 = V4SplatElement<2>(nqMin);
	Vec4V nqMaxx4 = V4SplatElement<0>(nqMax);
	Vec4V nqMaxy4 = V4SplatElement<1>(nqMax);
	Vec4V nqMaxz4 = V4SplatElement<2>(nqMax);

	// on 64-bit platforms the dynamic rtree pointer is also relative to mPages
	PxU8* treeNodes8 = CAST_U8(mPages);
	PxU32* stackPtr = stack;

	// AP potential perf optimization - fetch the top level right away
	PX_ASSERT(RTREE_N == 4 || RTREE_N == 8);
	PX_ASSERT(Ps::isPowerOfTwo(mPageSize));

	// seed the stack with byte offsets of the root pages (offsets are relative to mPages)
	for (PxI32 j = PxI32(mNumRootPages-1); j >= 0; j --)
		*stackPtr++ = j*sizeof(RTreePage);

	// cacheTop mirrors the most recently pushed entry so the common pop can
	// avoid re-reading the stack memory (load-hit-store avoidance).
	PxU32 cacheTopValid = true;
	PxU32 cacheTop = 0;

	do {
		stackPtr--;
		PxU32 top;
		if (cacheTopValid) // branch is faster than lhs
			top = cacheTop;
		else
			top = stackPtr[0];
		PX_ASSERT(!cacheTopValid || stackPtr[0] == cacheTop);
		RTreePage* PX_RESTRICT tn = reinterpret_cast<RTreePage*>(treeNodes8 + top);
		const PxU32* ptrs = (reinterpret_cast<RTreePage*>(tn))->ptrs;

		Vec4V minx4 = V4LoadA(tn->minx);
		Vec4V miny4 = V4LoadA(tn->miny);
		Vec4V minz4 = V4LoadA(tn->minz);
		Vec4V maxx4 = V4LoadA(tn->maxx);
		Vec4V maxy4 = V4LoadA(tn->maxy);
		Vec4V maxz4 = V4LoadA(tn->maxz);

		// AABB/AABB overlap test; resx is the "separated on some axis" mask,
		// so a nonzero lane means node i does NOT overlap the query box.
		BoolV res0 = V4IsGrtr(nqMinx4, maxx4); BoolV res1 = V4IsGrtr(nqMiny4, maxy4); BoolV res2 = V4IsGrtr(nqMinz4, maxz4);
		BoolV res3 = V4IsGrtr(minx4, nqMaxx4); BoolV res4 = V4IsGrtr(miny4, nqMaxy4); BoolV res5 = V4IsGrtr(minz4, nqMaxz4);
		BoolV resx = BOr(BOr(BOr(res0, res1), BOr(res2, res3)), BOr(res4, res5));
		PX_ALIGN_PREFIX(16) PxU32 resa[RTREE_N] PX_ALIGN_SUFFIX(16);

		VecU32V res4x = VecU32V_From_BoolV(resx);
		U4StoreA(res4x, resa);

		cacheTopValid = false;
		for (PxU32 i = 0; i < RTREE_N; i++)
		{
			PxU32 ptr = ptrs[i] & ~1; // clear the isLeaf bit
			if (resa[i])
				continue;
			if (tn->isLeaf(i))
			{
				if (!callback->processResults(1, &ptr))
					return;
			}
			else
			{
				*(stackPtr++) = ptr;
				cacheTop = ptr;
				cacheTopValid = true;
			}
		}
	} while (stackPtr > stack);
}

namespace
{
	const VecU32V signMask = U4LoadXYZW((PxU32(1)<<31), (PxU32(1)<<31), (PxU32(1)<<31), (PxU32(1)<<31));
	const Vec4V epsFloat4 = V4Load(1e-9f);
	const Vec4V zeroes = V4Zero();
	const Vec4V twos = V4Load(2.0f);
	const Vec4V epsInflateFloat4 = V4Load(1e-7f);
}

/////////////////////////////////////////////////////////////////////////
// Ray (inflate==0) or swept-AABB (inflate==1, boxes fattened by *fattenAABBs)
// traversal over the RTree. Leaf hits are reported one at a time via
// callback->processResults(1, &top, newMaxT); the callback may shrink maxT
// to clip the remaining traversal.
template <int inflate>
void RTree::traverseRay(
	const PxVec3& rayOrigin, const PxVec3& rayDir,
	const PxU32 maxResults, PxU32* resultsPtr, Gu::RTree::CallbackRaycast* callback,
	const PxVec3* fattenAABBs, PxF32 maxT) const
{
	// implements Kay-Kajiya 4-way SIMD test
	PX_UNUSED(resultsPtr);
	PX_UNUSED(maxResults);

	const PxU32 maxStack = 128;
	PxU32 stack1[maxStack];
	PxU32* stack = stack1+1;

	PX_ASSERT(mPages);
	PX_ASSERT((uintptr_t(mPages) & 127) == 0);
	PX_ASSERT((uintptr_t(this) & 15) == 0);

	PxU8* treeNodes8 = CAST_U8(mPages);

	Vec4V fattenAABBsX, fattenAABBsY, fattenAABBsZ;
	PX_UNUSED(fattenAABBsX); PX_UNUSED(fattenAABBsY); PX_UNUSED(fattenAABBsZ);
	if (inflate)
	{
		Vec4V fattenAABBs4 = Vec4V_From_PxVec3_WUndefined(*fattenAABBs);
		fattenAABBs4 = V4Add(fattenAABBs4, epsInflateFloat4); // US2385 - shapes are "closed" meaning exactly touching shapes should report overlap
		fattenAABBsX = V4SplatElement<0>(fattenAABBs4);
		fattenAABBsY = V4SplatElement<1>(fattenAABBs4);
		fattenAABBsZ = V4SplatElement<2>(fattenAABBs4);
	}

	Vec4V maxT4;
	maxT4 = V4Load(maxT);
	Vec4V rayP = Vec4V_From_PxVec3_WUndefined(rayOrigin);
	Vec4V rayD = Vec4V_From_PxVec3_WUndefined(rayDir);
	VecU32V raySign = V4U32and(VecU32V_ReinterpretFrom_Vec4V(rayD), signMask);
	Vec4V rayDAbs = V4Abs(rayD); // abs value of rayD
	Vec4V rayInvD = Vec4V_ReinterpretFrom_VecU32V(V4U32or(raySign, VecU32V_ReinterpretFrom_Vec4V(V4Max(rayDAbs, epsFloat4)))); // clamp near-zero components up to epsilon
	rayD = rayInvD;

	//rayInvD = V4Recip(rayInvD);
	// Newton-Raphson iteration for reciprocal (see wikipedia):
	// X[n+1] = X[n]*(2-original*X[n]), X[0] = V4RecipFast estimate
	//rayInvD = rayInvD*(twos-rayD*rayInvD);
	rayInvD = V4RecipFast(rayInvD); // initial estimate, not accurate enough
	rayInvD = V4Mul(rayInvD, V4NegMulSub(rayD, rayInvD, twos));

	// P+tD=a; t=(a-P)/D
	// t=(a - p.x)*1/d.x = a/d.x +(- p.x/d.x)
	Vec4V rayPinvD = V4NegMulSub(rayInvD, rayP, zeroes);
	Vec4V rayInvDsplatX = V4SplatElement<0>(rayInvD);
	Vec4V rayInvDsplatY = V4SplatElement<1>(rayInvD);
	Vec4V rayInvDsplatZ = V4SplatElement<2>(rayInvD);
	Vec4V rayPinvDsplatX = V4SplatElement<0>(rayPinvD);
	Vec4V rayPinvDsplatY = V4SplatElement<1>(rayPinvD);
	Vec4V rayPinvDsplatZ = V4SplatElement<2>(rayPinvD);

	PX_ASSERT(RTREE_N == 4 || RTREE_N == 8);
	PX_ASSERT(mNumRootPages > 0);

	PxU32 stackPtr = 0;
	for (PxI32 j = PxI32(mNumRootPages-1); j >= 0; j --)
		stack[stackPtr++] = j*sizeof(RTreePage);

	PX_ALIGN_PREFIX(16) PxU32 resa[4] PX_ALIGN_SUFFIX(16);

	while (stackPtr)
	{
		PxU32 top = stack[--stackPtr];
		// NOTE(review): leaf entries carry the isLeaf tag in bit 0; the bit
		// is cleared (top--) before handing the index to the callback.
		if (top&1) // isLeaf test
		{
			top--;
			PxF32 newMaxT = maxT;
			if (!callback->processResults(1, &top, newMaxT))
				return;
			/* shrink the ray if newMaxT is reduced compared to the original maxT */
			if (maxT != newMaxT)
			{
				PX_ASSERT(newMaxT < maxT);
				maxT = newMaxT;
				maxT4 = V4Load(newMaxT);
			}
			continue;
		}

		RTreePage* PX_RESTRICT tn = reinterpret_cast<RTreePage*>(treeNodes8 + top);

		// 6i load
		Vec4V minx4a = V4LoadA(tn->minx), miny4a = V4LoadA(tn->miny), minz4a = V4LoadA(tn->minz);
		Vec4V maxx4a = V4LoadA(tn->maxx), maxy4a = V4LoadA(tn->maxy), maxz4a = V4LoadA(tn->maxz);

		// 1i disabled test
		// AP scaffold - optimization opportunity - can save 2 instructions here
		VecU32V ignore4a = V4IsGrtrV32u(minx4a, maxx4a); // 1 if degenerate box (empty slot in the page)

		if (inflate)
		{
			// 6i
			maxx4a = V4Add(maxx4a, fattenAABBsX); maxy4a = V4Add(maxy4a, fattenAABBsY); maxz4a = V4Add(maxz4a, fattenAABBsZ);
			minx4a = V4Sub(minx4a, fattenAABBsX); miny4a = V4Sub(miny4a, fattenAABBsY); minz4a = V4Sub(minz4a, fattenAABBsZ);
		}

		// P+tD=a; t=(a-P)/D
		// t=(a - p.x)*1/d.x = a/d.x +(- p.x/d.x)
		// 6i
		Vec4V tminxa0 = V4MulAdd(minx4a, rayInvDsplatX, rayPinvDsplatX);
		Vec4V tminya0 = V4MulAdd(miny4a, rayInvDsplatY, rayPinvDsplatY);
		Vec4V tminza0 = V4MulAdd(minz4a, rayInvDsplatZ, rayPinvDsplatZ);
		Vec4V tmaxxa0 = V4MulAdd(maxx4a, rayInvDsplatX, rayPinvDsplatX);
		Vec4V tmaxya0 = V4MulAdd(maxy4a, rayInvDsplatY, rayPinvDsplatY);
		Vec4V tmaxza0 = V4MulAdd(maxz4a, rayInvDsplatZ, rayPinvDsplatZ);

		// test half-spaces
		// P+tD=dN
		// t = (d(N,D)-(P,D))/(D,D) , (D,D)=1

		// compute 4x dot products (N,D) and (P,N) for each AABB in the page

		// 6i
		// now compute tnear and tfar for each pair of planes for each box
		Vec4V tminxa = V4Min(tminxa0, tmaxxa0); Vec4V tmaxxa = V4Max(tminxa0, tmaxxa0);
		Vec4V tminya = V4Min(tminya0, tmaxya0); Vec4V tmaxya = V4Max(tminya0, tmaxya0);
		Vec4V tminza = V4Min(tminza0, tmaxza0); Vec4V tmaxza = V4Max(tminza0, tmaxza0);

		// 8i
		Vec4V maxOfNeasa = V4Max(V4Max(tminxa, tminya), tminza);
		Vec4V minOfFarsa = V4Min(V4Min(tmaxxa, tmaxya), tmaxza);
		ignore4a = V4U32or(ignore4a, V4IsGrtrV32u(epsFloat4, minOfFarsa));  // if tfar is negative, ignore since its a ray, not a line
		// AP scaffold: update the build to eliminate 3 more instructions for ignore4a above
		//VecU32V ignore4a = V4IsGrtrV32u(epsFloat4, minOfFarsa);  // if tfar is negative, ignore since its a ray, not a line
		ignore4a = V4U32or(ignore4a, V4IsGrtrV32u(maxOfNeasa, maxT4));  // if tnear is over maxT, ignore this result

		// 2i
		VecU32V resa4 = V4IsGrtrV32u(maxOfNeasa, minOfFarsa); // if 1 => fail
		resa4 = V4U32or(resa4, ignore4a);

		// 1i
		V4U32StoreAligned(resa4, reinterpret_cast<VecU32V*>(resa));

		PxU32* ptrs = (reinterpret_cast<RTreePage*>(tn))->ptrs;

		// NOTE(review): resa[i] is an all-ones compare mask (0xFFFFFFFF) on
		// rejection, so (1+resa[i]) wraps to 0 and the speculatively written
		// stack entry is simply overwritten by the next store — a branch-free push.
		stack[stackPtr] = ptrs[0]; stackPtr += (1+resa[0]); // AP scaffold TODO: use VecU32add
		stack[stackPtr] = ptrs[1]; stackPtr += (1+resa[1]);
		stack[stackPtr] = ptrs[2]; stackPtr += (1+resa[2]);
		stack[stackPtr] = ptrs[3]; stackPtr += (1+resa[3]);
	}
}

template void RTree::traverseRay<0>(
	const PxVec3&, const PxVec3&, const PxU32, PxU32*, Gu::RTree::CallbackRaycast*, const PxVec3*, PxF32 maxT) const;
template void RTree::traverseRay<1>(
	const PxVec3&, const PxVec3&, const PxU32, PxU32*, Gu::RTree::CallbackRaycast*, const PxVec3*, PxF32 maxT) const;

/////////////////////////////////////////////////////////////////////////
// Traversal against an oriented box. Each page's four AABBs are tested with
// two candidate separating axes (D1, D2) derived from closest-point
// iterations between the boxes (see inline notes); a node is kept unless one
// of those axes separates it. NOTE(review): only two axes are tested, so the
// test appears conservative (near-misses may be reported) — confirm against
// callers' tolerance expectations.
void RTree::traverseOBB(
	const Gu::Box& obb, const PxU32 maxResults, PxU32* resultsPtr, Gu::RTree::Callback* callback) const
{
	PX_UNUSED(resultsPtr);
	PX_UNUSED(maxResults);

	const PxU32 maxStack = 128;
	PxU32 stack[maxStack];

	PX_ASSERT(mPages);
	PX_ASSERT((uintptr_t(mPages) & 127) == 0);
	PX_ASSERT((uintptr_t(this) & 15) == 0);

	PxU8* treeNodes8 = CAST_U8(mPages);
	PxU32* stackPtr = stack;

	Vec4V ones, halves, eps;
	ones = V4Load(1.0f);
	halves = V4Load(0.5f);
	eps = V4Load(1e-6f);

	PX_UNUSED(ones);

	Vec4V obbO = Vec4V_From_PxVec3_WUndefined(obb.center);
	Vec4V obbE = Vec4V_From_PxVec3_WUndefined(obb.extents);
	// Gu::Box::rot matrix columns are the OBB axes
	Vec4V obbX = Vec4V_From_PxVec3_WUndefined(obb.rot.column0);
	Vec4V obbY = Vec4V_From_PxVec3_WUndefined(obb.rot.column1);
	Vec4V obbZ = Vec4V_From_PxVec3_WUndefined(obb.rot.column2);

#if PX_WINDOWS || PX_XBOXONE
	// Visual Studio compiler hangs with #defines
	// On VMX platforms we use #defines in the other branch of this #ifdef to avoid register spills (LHS)
	Vec4V obbESplatX = V4SplatElement<0>(obbE);
	Vec4V obbESplatY = V4SplatElement<1>(obbE);
	Vec4V obbESplatZ = V4SplatElement<2>(obbE);
	Vec4V obbESplatNegX = V4Sub(zeroes, obbESplatX);
	Vec4V obbESplatNegY = V4Sub(zeroes, obbESplatY);
	Vec4V obbESplatNegZ = V4Sub(zeroes, obbESplatZ);
	Vec4V obbXE = V4MulAdd(obbX, obbESplatX, zeroes); // scale axii by E
	Vec4V obbYE = V4MulAdd(obbY, obbESplatY, zeroes); // scale axii by E
	Vec4V obbZE = V4MulAdd(obbZ, obbESplatZ, zeroes); // scale axii by E
	Vec4V obbOSplatX = V4SplatElement<0>(obbO);
	Vec4V obbOSplatY = V4SplatElement<1>(obbO);
	Vec4V obbOSplatZ = V4SplatElement<2>(obbO);
	Vec4V obbXSplatX = V4SplatElement<0>(obbX);
	Vec4V obbXSplatY = V4SplatElement<1>(obbX);
	Vec4V obbXSplatZ = V4SplatElement<2>(obbX);
	Vec4V obbYSplatX = V4SplatElement<0>(obbY);
	Vec4V obbYSplatY = V4SplatElement<1>(obbY);
	Vec4V obbYSplatZ = V4SplatElement<2>(obbY);
	Vec4V obbZSplatX = V4SplatElement<0>(obbZ);
	Vec4V obbZSplatY = V4SplatElement<1>(obbZ);
	Vec4V obbZSplatZ = V4SplatElement<2>(obbZ);
	Vec4V obbXESplatX = V4SplatElement<0>(obbXE);
	Vec4V obbXESplatY = V4SplatElement<1>(obbXE);
	Vec4V obbXESplatZ = V4SplatElement<2>(obbXE);
	Vec4V obbYESplatX = V4SplatElement<0>(obbYE);
	Vec4V obbYESplatY = V4SplatElement<1>(obbYE);
	Vec4V obbYESplatZ = V4SplatElement<2>(obbYE);
	Vec4V obbZESplatX = V4SplatElement<0>(obbZE);
	Vec4V obbZESplatY = V4SplatElement<1>(obbZE);
	Vec4V obbZESplatZ = V4SplatElement<2>(obbZE);
#else
	#define obbESplatX V4SplatElement<0>(obbE)
	#define obbESplatY V4SplatElement<1>(obbE)
	#define obbESplatZ V4SplatElement<2>(obbE)
	#define obbESplatNegX V4Sub(zeroes, obbESplatX)
	#define obbESplatNegY V4Sub(zeroes, obbESplatY)
	#define obbESplatNegZ V4Sub(zeroes, obbESplatZ)
	#define obbXE V4MulAdd(obbX, obbESplatX, zeroes)
	#define obbYE V4MulAdd(obbY, obbESplatY, zeroes)
	#define obbZE V4MulAdd(obbZ, obbESplatZ, zeroes)
	#define obbOSplatX V4SplatElement<0>(obbO)
	#define obbOSplatY V4SplatElement<1>(obbO)
	#define obbOSplatZ V4SplatElement<2>(obbO)
	#define obbXSplatX V4SplatElement<0>(obbX)
	#define obbXSplatY V4SplatElement<1>(obbX)
	#define obbXSplatZ V4SplatElement<2>(obbX)
	#define obbYSplatX V4SplatElement<0>(obbY)
	#define obbYSplatY V4SplatElement<1>(obbY)
	#define obbYSplatZ V4SplatElement<2>(obbY)
	#define obbZSplatX V4SplatElement<0>(obbZ)
	#define obbZSplatY V4SplatElement<1>(obbZ)
	#define obbZSplatZ V4SplatElement<2>(obbZ)
	#define obbXESplatX V4SplatElement<0>(obbXE)
	#define obbXESplatY V4SplatElement<1>(obbXE)
	#define obbXESplatZ V4SplatElement<2>(obbXE)
	#define obbYESplatX V4SplatElement<0>(obbYE)
	#define obbYESplatY V4SplatElement<1>(obbYE)
	#define obbYESplatZ V4SplatElement<2>(obbYE)
	#define obbZESplatX V4SplatElement<0>(obbZE)
	#define obbZESplatY V4SplatElement<1>(obbZE)
	#define obbZESplatZ V4SplatElement<2>(obbZE)
#endif

	PX_ASSERT(mPageSize == 4 || mPageSize == 8);
	PX_ASSERT(mNumRootPages > 0);

	for (PxI32 j = PxI32(mNumRootPages-1); j >= 0; j --)
		*stackPtr++ = j*sizeof(RTreePage);
	PxU32 cacheTopValid = true;
	PxU32 cacheTop = 0;

	PX_ALIGN_PREFIX(16) PxU32 resa_[4] PX_ALIGN_SUFFIX(16);

	do {
		stackPtr--;

		PxU32 top;
		if (cacheTopValid) // branch is faster than lhs
			top = cacheTop;
		else
			top = stackPtr[0];
		PX_ASSERT(!cacheTopValid || top == cacheTop);
		RTreePage* PX_RESTRICT tn = reinterpret_cast<RTreePage*>(treeNodes8 + top);

		const PxU32 offs = 0;
		PxU32* ptrs = (reinterpret_cast<RTreePage*>(tn))->ptrs;

		// 6i
		Vec4V minx4a = V4LoadA(tn->minx+offs);
		Vec4V miny4a = V4LoadA(tn->miny+offs);
		Vec4V minz4a = V4LoadA(tn->minz+offs);
		Vec4V maxx4a = V4LoadA(tn->maxx+offs);
		Vec4V maxy4a = V4LoadA(tn->maxy+offs);
		Vec4V maxz4a = V4LoadA(tn->maxz+offs);

		VecU32V noOverlapa;
		VecU32V resa4u;
		{
			// PRECOMPUTE FOR A BLOCK
			// 109 instr per 4 OBB/AABB
			// ABB iteration 1, start with OBB origin as other point -- 6
			Vec4V p1ABBxa = V4Max(minx4a, V4Min(maxx4a, obbOSplatX));
			Vec4V p1ABBya = V4Max(miny4a, V4Min(maxy4a, obbOSplatY));
			Vec4V p1ABBza = V4Max(minz4a, V4Min(maxz4a, obbOSplatZ));

			// OBB iteration 1, move to OBB space first -- 12
			Vec4V p1ABBOxa = V4Sub(p1ABBxa, obbOSplatX);
			Vec4V p1ABBOya = V4Sub(p1ABBya, obbOSplatY);
			Vec4V p1ABBOza = V4Sub(p1ABBza, obbOSplatZ);
			Vec4V obbPrjXa = V4MulAdd(p1ABBOxa, obbXSplatX, V4MulAdd(p1ABBOya, obbXSplatY, V4MulAdd(p1ABBOza, obbXSplatZ, zeroes)));
			Vec4V obbPrjYa = V4MulAdd(p1ABBOxa, obbYSplatX, V4MulAdd(p1ABBOya, obbYSplatY, V4MulAdd(p1ABBOza, obbYSplatZ, zeroes)));
			Vec4V obbPrjZa = V4MulAdd(p1ABBOxa, obbZSplatX, V4MulAdd(p1ABBOya, obbZSplatY, V4MulAdd(p1ABBOza, obbZSplatZ, zeroes)));
			// clamp AABB point in OBB space to OBB extents. Since we scaled the axii, the extents are [-1,1] -- 6
			Vec4V pOBBxa = V4Max(obbESplatNegX, V4Min(obbPrjXa, obbESplatX));
			Vec4V pOBBya = V4Max(obbESplatNegY, V4Min(obbPrjYa, obbESplatY));
			Vec4V pOBBza = V4Max(obbESplatNegZ, V4Min(obbPrjZa, obbESplatZ));
			// go back to AABB space. we have x,y,z in obb space, need to multiply by axii -- 9
			Vec4V p1OBBxa = V4MulAdd(pOBBxa, obbXSplatX, V4MulAdd(pOBBya, obbYSplatX, V4MulAdd(pOBBza, obbZSplatX, obbOSplatX)));
			Vec4V p1OBBya = V4MulAdd(pOBBxa, obbXSplatY, V4MulAdd(pOBBya, obbYSplatY, V4MulAdd(pOBBza, obbZSplatY, obbOSplatY)));
			Vec4V p1OBBza = V4MulAdd(pOBBxa, obbXSplatZ, V4MulAdd(pOBBya, obbYSplatZ, V4MulAdd(pOBBza, obbZSplatZ, obbOSplatZ)));

			// ABB iteration 2 -- 6 instructions
			Vec4V p2ABBxa = V4Max(minx4a, V4Min(maxx4a, p1OBBxa));
			Vec4V p2ABBya = V4Max(miny4a, V4Min(maxy4a, p1OBBya));
			Vec4V p2ABBza = V4Max(minz4a, V4Min(maxz4a, p1OBBza));
			// above blocks add up to 12+12+15=39 instr
			// END PRECOMPUTE FOR A BLOCK

			// for AABBs precompute extents and center -- 9i
			Vec4V abbCxa = V4MulAdd(V4Add(maxx4a, minx4a), halves, zeroes);
			Vec4V abbCya = V4MulAdd(V4Add(maxy4a, miny4a), halves, zeroes);
			Vec4V abbCza = V4MulAdd(V4Add(maxz4a, minz4a), halves, zeroes);
			Vec4V abbExa = V4Sub(maxx4a, abbCxa);
			Vec4V abbEya = V4Sub(maxy4a, abbCya);
			Vec4V abbEza = V4Sub(maxz4a, abbCza);

			// now test separating axes D1 = p1OBB-p1ABB and D2 = p1OBB-p2ABB -- 37 instructions per axis
			// D1 first -- 3 instructions
			Vec4V d1xa = V4Sub(p1OBBxa, p1ABBxa), d1ya = V4Sub(p1OBBya, p1ABBya), d1za = V4Sub(p1OBBza, p1ABBza);

			// for AABB compute projections of extents and center -- 6
			Vec4V abbExd1Prja = V4MulAdd(d1xa, abbExa, zeroes);
			Vec4V abbEyd1Prja = V4MulAdd(d1ya, abbEya, zeroes);
			Vec4V abbEzd1Prja = V4MulAdd(d1za, abbEza, zeroes);
			Vec4V abbCd1Prja = V4MulAdd(d1xa, abbCxa, V4MulAdd(d1ya, abbCya, V4MulAdd(d1za, abbCza, zeroes)));

			// for obb project each halfaxis and origin and add abs values of half-axis projections -- 12 instructions
			Vec4V obbXEd1Prja = V4MulAdd(d1xa, obbXESplatX, V4MulAdd(d1ya, obbXESplatY, V4MulAdd(d1za, obbXESplatZ, zeroes)));
			Vec4V obbYEd1Prja = V4MulAdd(d1xa, obbYESplatX, V4MulAdd(d1ya, obbYESplatY, V4MulAdd(d1za, obbYESplatZ, zeroes)));
			Vec4V obbZEd1Prja = V4MulAdd(d1xa, obbZESplatX, V4MulAdd(d1ya, obbZESplatY, V4MulAdd(d1za, obbZESplatZ, zeroes)));
			Vec4V obbOd1Prja = V4MulAdd(d1xa, obbOSplatX, V4MulAdd(d1ya, obbOSplatY, V4MulAdd(d1za, obbOSplatZ, zeroes)));

			// compare lengths between projected centers with sum of projected radii -- 16i
			Vec4V originDiffd1a = v_absm(V4Sub(abbCd1Prja, obbOd1Prja));
			Vec4V absABBRd1a = V4Add(V4Add(v_absm(abbExd1Prja), v_absm(abbEyd1Prja)), v_absm(abbEzd1Prja));
			Vec4V absOBBRd1a = V4Add(V4Add(v_absm(obbXEd1Prja), v_absm(obbYEd1Prja)), v_absm(obbZEd1Prja));
			VecU32V noOverlapd1a = V4IsGrtrV32u(V4Sub(originDiffd1a, eps), V4Add(absABBRd1a, absOBBRd1a));
			VecU32V epsNoOverlapd1a = V4IsGrtrV32u(originDiffd1a, eps);

			// D2 next (35 instr)
			// 3i
			Vec4V d2xa = V4Sub(p1OBBxa, p2ABBxa), d2ya = V4Sub(p1OBBya, p2ABBya), d2za = V4Sub(p1OBBza, p2ABBza);
			// for AABB compute projections of extents and center -- 6
			Vec4V abbExd2Prja = V4MulAdd(d2xa, abbExa, zeroes);
			Vec4V abbEyd2Prja = V4MulAdd(d2ya, abbEya, zeroes);
			Vec4V abbEzd2Prja = V4MulAdd(d2za, abbEza, zeroes);
			Vec4V abbCd2Prja = V4MulAdd(d2xa, abbCxa, V4MulAdd(d2ya, abbCya, V4MulAdd(d2za, abbCza, zeroes)));
			// for obb project each halfaxis and origin and add abs values of half-axis projections -- 12i
			Vec4V obbXEd2Prja = V4MulAdd(d2xa, obbXESplatX, V4MulAdd(d2ya, obbXESplatY, V4MulAdd(d2za, obbXESplatZ, zeroes)));
			Vec4V obbYEd2Prja = V4MulAdd(d2xa, obbYESplatX, V4MulAdd(d2ya, obbYESplatY, V4MulAdd(d2za, obbYESplatZ, zeroes)));
			Vec4V obbZEd2Prja = V4MulAdd(d2xa, obbZESplatX, V4MulAdd(d2ya, obbZESplatY, V4MulAdd(d2za, obbZESplatZ, zeroes)));
			Vec4V obbOd2Prja = V4MulAdd(d2xa, obbOSplatX, V4MulAdd(d2ya, obbOSplatY, V4MulAdd(d2za, obbOSplatZ, zeroes)));
			// compare lengths between projected centers with sum of projected radii -- 16i
			Vec4V originDiffd2a = v_absm(V4Sub(abbCd2Prja, obbOd2Prja));
			Vec4V absABBRd2a = V4Add(V4Add(v_absm(abbExd2Prja), v_absm(abbEyd2Prja)), v_absm(abbEzd2Prja));
			Vec4V absOBBRd2a = V4Add(V4Add(v_absm(obbXEd2Prja), v_absm(obbYEd2Prja)), v_absm(obbZEd2Prja));
			VecU32V noOverlapd2a = V4IsGrtrV32u(V4Sub(originDiffd2a, eps), V4Add(absABBRd2a, absOBBRd2a));
			VecU32V epsNoOverlapd2a = V4IsGrtrV32u(originDiffd2a, eps);

			// 8i
			noOverlapa = V4U32or(V4U32and(noOverlapd1a, epsNoOverlapd1a), V4U32and(noOverlapd2a, epsNoOverlapd2a));
			VecU32V ignore4a = V4IsGrtrV32u(minx4a, maxx4a); // 1 if degenerate box (empty slot)
			noOverlapa = V4U32or(noOverlapa, ignore4a);
			resa4u = V4U32Andc(U4Load(1), noOverlapa); // 1 & ~noOverlap
			V4U32StoreAligned(resa4u, reinterpret_cast<VecU32V*>(resa_));
			///// 8+16+12+6+3+16+12+6+3+9+6+9+6+12+6+6=136i from load to result
		}

		cacheTopValid = false;
		// NOTE(review): this loop processes exactly 4 nodes per page, while
		// the assert above allows mPageSize of 4 or 8 — verify RTREE_N == 4
		// in builds that reach this path.
		for (PxU32 i = 0; i < 4; i++)
		{
			PxU32 ptr = ptrs[i+offs] & ~1; // clear the isLeaf bit
			if (resa_[i])
			{
				if (tn->isLeaf(i))
				{
					if (!callback->processResults(1, &ptr))
						return;
				}
				else
				{
					*(stackPtr++) = ptr;
					cacheTop = ptr;
					cacheTopValid = true;
				}
			}
		}
	} while (stackPtr > stack);
}

} // namespace Gu

}