Initial commit:

PhysX 3.4.0 Update @ 21294896 APEX 1.4.0 Update @ 21275617 [CL 21300167]
author: git perforce import user <a@b> 2016-10-25 12:29:14 -0600
committer: Sheikh Dawood Abdul Ajees <Sheikh Dawood Abdul Ajees> 2016-10-25 18:56:37 -0500
commit: 3dfe2108cfab31ba3ee5527e217d0d8e99a51162 (patch)
tree: fa6485c169e50d7415a651bf838f5bcd0fd3bfbd /PhysX_3.4/Source/GeomUtils/src/mesh/GuRTreeQueries.cpp
download: physx-3.4-3dfe2108cfab31ba3ee5527e217d0d8e99a51162.tar.xz
physx-3.4-3dfe2108cfab31ba3ee5527e217d0d8e99a51162.zip
1 files changed, 581 insertions, 0 deletions
diff --git a/PhysX_3.4/Source/GeomUtils/src/mesh/GuRTreeQueries.cpp b/PhysX_3.4/Source/GeomUtils/src/mesh/GuRTreeQueries.cpp
new file mode 100644
index 00000000..9d7bd57a
--- /dev/null
+++ b/PhysX_3.4/Source/GeomUtils/src/mesh/GuRTreeQueries.cpp
@@ -0,0 +1,581 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+/*
+General notes:
+
+	rtree depth-first traversal looks like this:
+	push top level page onto stack
+
+	pop page from stack
+	for each node in page
+	  if node overlaps with testrect
+	    push node's subpage
+
+	we want to efficiently keep track of current stack level to know if the current page is a leaf or not
+	(since we don't store a flag with the page due to no space, we can't determine it just by looking at current page)
+	since we traverse depth first, the levels for nodes on the stack look like this:
+	l0 l0 l1 l2 l2 l3 l3 l3 l4
+
+	we can encode this as an array of 4 bits per level count into a 32-bit integer
+	to simplify the code->level computation we also keep track of current level by incrementing the level whenever any subpages
+	from current test page are pushed onto the stack
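+	when we pop a page off the stack we use this encoding to determine if we should decrement the stack level
+
+	NOTE: the traversals in this file do not use this level encoding; they identify leaves via bit 0 of each
+	child pointer (see tn->isLeaf(i) in traverseAABB/traverseOBB and the top&1 test in traverseRay)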
+*/
+
+#include "foundation/PxBounds3.h"
+#include "GuRTree.h"
+#include "PsIntrinsics.h"
+#include "GuBox.h"
+#include "PsVecMath.h"
+#include "PxQueryReport.h" // for PxAgain
+#include "PsBitUtils.h"
+
+//#define VERIFY_RTREE
+#ifdef VERIFY_RTREE
+#include "GuIntersectionRayBox.h"
+#include "GuIntersectionBoxBox.h"
+#include "stdio.h"
+#endif
+
+using namespace physx;
+using namespace physx::shdfnd;
+using namespace Ps::aos;
+
+namespace physx
+{
+namespace Gu {
+
+using namespace Ps::aos;
+
+#define v_absm(a) V4Andc(a, signMask) // presumably a & ~signMask, i.e. clears the sign bit (absolute value); uses the file-scope signMask constant
+#define V4FromF32A(x) V4LoadA(x) // alias for V4LoadA
+#define PxF32FV(x) FStore(x) // alias for FStore
+#define CAST_U8(a) reinterpret_cast<PxU8*>(a) // reinterpret a pointer as a byte pointer
+
+/////////////////////////////////////////////////////////////////////////
+// Reports every leaf entry whose stored bounds overlap the axis-aligned query box [boxMin, boxMax].
+//
+// The tree is walked depth-first with an explicit stack of page offsets (bytes relative to mPages).
+// For each page the query box is tested against four child boxes at once. Overlapping leaf children
+// are handed to callback->processResults(1, &ptr) one at a time (ptr is the child pointer with the
+// isLeaf bit cleared); overlapping inner children are pushed onto the stack.
+// The traversal stops as soon as the callback returns false.
+//
+// maxResults is only checked by an assert and resultsPtr is unused.
+void RTree::traverseAABB(const PxVec3& boxMin, const PxVec3& boxMax, const PxU32 maxResults, PxU32* resultsPtr, Callback* callback) const
+{
+	PX_UNUSED(resultsPtr);
+
+	PX_ASSERT(callback);
+	PX_ASSERT(maxResults >= mPageSize);
+	PX_UNUSED(maxResults);
+
+	const PxU32 maxStack = 128;
+	PxU32 stack1[maxStack];
+	PxU32* stack = stack1+1; // usable stack starts one slot in, leaving stack1[0] spare
+
+	PX_ASSERT(mPages);
+	PX_ASSERT((uintptr_t(mPages) & 127) == 0);
+	PX_ASSERT((uintptr_t(this) & 15) == 0);
+
+	// load the query box and splat each min/max component across a 4-wide vector
+	// (no quantization is performed here, the bounds are used as floats)
+	Vec4V nqMin = Vec4V_From_PxVec3_WUndefined(boxMin);
+	Vec4V nqMax = Vec4V_From_PxVec3_WUndefined(boxMax);
+
+	Vec4V nqMinx4 = V4SplatElement<0>(nqMin);
+	Vec4V nqMiny4 = V4SplatElement<1>(nqMin);
+	Vec4V nqMinz4 = V4SplatElement<2>(nqMin);
+	Vec4V nqMaxx4 = V4SplatElement<0>(nqMax);
+	Vec4V nqMaxy4 = V4SplatElement<1>(nqMax);
+	Vec4V nqMaxz4 = V4SplatElement<2>(nqMax);
+
+	// on 64-bit platforms the dynamic rtree pointer is also relative to mPages
+	PxU8* treeNodes8 = CAST_U8(mPages);
+	PxU32* stackPtr = stack;
+
+	// AP potential perf optimization - fetch the top level right away
+	// NOTE(review): only four lanes are tested per page below, so the RTREE_N == 8 case allowed by
+	// this assert does not look supported by this function - confirm
+	PX_ASSERT(RTREE_N == 4 || RTREE_N == 8);
+	PX_ASSERT(Ps::isPowerOfTwo(mPageSize));
+	// same precondition as traverseRay/traverseOBB: the do/while below pops before it tests for an
+	// empty stack, so at least one root page must have been pushed
+	PX_ASSERT(mNumRootPages > 0);
+
+	// push the root pages in reverse so that root page 0 ends up on top of the stack
+	for (PxI32 j = PxI32(mNumRootPages-1); j >= 0; j --)
+		*stackPtr++ = j*sizeof(RTreePage);
+
+	// cacheTop mirrors the most recently pushed value so the pop can avoid re-reading it from memory;
+	// root page 0 (offset 0) was pushed last, hence the initial value
+	PxU32 cacheTopValid = true;
+	PxU32 cacheTop = 0;
+
+	do {
+		stackPtr--;
+		PxU32 top;
+		if (cacheTopValid) // branch is faster than lhs
+			top = cacheTop;
+		else
+			top = stackPtr[0];
+		PX_ASSERT(!cacheTopValid || stackPtr[0] == cacheTop);
+		RTreePage* PX_RESTRICT tn = reinterpret_cast<RTreePage*>(treeNodes8 + top);
+		const PxU32* ptrs = (reinterpret_cast<RTreePage*>(tn))->ptrs;
+
+		Vec4V minx4 = V4LoadA(tn->minx);
+		Vec4V miny4 = V4LoadA(tn->miny);
+		Vec4V minz4 = V4LoadA(tn->minz);
+		Vec4V maxx4 = V4LoadA(tn->maxx);
+		Vec4V maxy4 = V4LoadA(tn->maxy);
+		Vec4V maxz4 = V4LoadA(tn->maxz);
+
+		// AABB/AABB overlap test
+		// each resN lane is set when the boxes are separated along that axis, so a set lane in resx
+		// means "no overlap" for that child
+		BoolV res0 = V4IsGrtr(nqMinx4, maxx4); BoolV res1 = V4IsGrtr(nqMiny4, maxy4); BoolV res2 = V4IsGrtr(nqMinz4, maxz4);
+		BoolV res3 = V4IsGrtr(minx4, nqMaxx4); BoolV res4 = V4IsGrtr(miny4, nqMaxy4); BoolV res5 = V4IsGrtr(minz4, nqMaxz4);
+		BoolV resx = BOr(BOr(BOr(res0, res1), BOr(res2, res3)), BOr(res4, res5));
+		PX_ALIGN_PREFIX(16) PxU32 resa[RTREE_N] PX_ALIGN_SUFFIX(16);
+
+		VecU32V res4x = VecU32V_From_BoolV(resx);
+		U4StoreA(res4x, resa);
+
+		cacheTopValid = false;
+		for (PxU32 i = 0; i < RTREE_N; i++)
+		{
+			PxU32 ptr = ptrs[i] & ~1; // clear the isLeaf bit
+			if (resa[i]) // non-zero => this child does not overlap the query box
+				continue;
+			if (tn->isLeaf(i))
+			{
+				// report the leaf; a false return aborts the whole traversal
+				if (!callback->processResults(1, &ptr))
+					return;
+			}
+			else
+			{
+				*(stackPtr++) = ptr;
+				cacheTop = ptr;
+				cacheTopValid = true;
+			}
+		}
+	} while (stackPtr > stack);
+}
+
+namespace
+{
+	// SIMD constants shared by the traversal routines in this file
+
+	// plain splatted values
+	const Vec4V zeroes = V4Zero();
+	const Vec4V twos = V4Load(2.0f);
+
+	// small thresholds
+	const Vec4V epsFloat4 = V4Load(1e-9f);
+	const Vec4V epsInflateFloat4 = V4Load(1e-7f);
+
+	// bit 31 (the IEEE float sign bit) set in every lane
+	const VecU32V signMask = U4LoadXYZW(0x80000000u, 0x80000000u, 0x80000000u, 0x80000000u);
+}
+
+/////////////////////////////////////////////////////////////////////////
+template <int inflate> // inflate != 0: grow every node box by *fattenAABBs before the ray test
+void RTree::traverseRay(
+	const PxVec3& rayOrigin, const PxVec3& rayDir,
+	const PxU32 maxResults, PxU32* resultsPtr, Gu::RTree::CallbackRaycast* callback,
+	const PxVec3* fattenAABBs, PxF32 maxT) const
+{
+	/* Implements the Kay-Kajiya 4-way SIMD test: the ray is tested against the four child boxes
+	   of a page at once.
+	   The tree is walked with an explicit stack of child pointers. An entry with bit 0 set is a
+	   leaf: the bit is cleared and the value is passed to callback->processResults(1, &top, newMaxT).
+	   The callback may lower newMaxT, which shortens the ray for the rest of the traversal, and
+	   returning false stops the traversal.
+	   fattenAABBs is only dereferenced when inflate is non-zero; in that case every box is grown
+	   by that amount (plus a small epsilon) on each side before the test.
+	   resultsPtr and maxResults are unused. */
+	PX_UNUSED(resultsPtr);
+	PX_UNUSED(maxResults);
+
+	const PxU32 maxStack = 128;
+	PxU32 stack1[maxStack];
+	PxU32* stack = stack1+1;
+
+	PX_ASSERT(mPages);
+	PX_ASSERT((uintptr_t(mPages) & 127) == 0);
+	PX_ASSERT((uintptr_t(this) & 15) == 0);
+
+	PxU8* treeNodes8 = CAST_U8(mPages); // stack entries are byte offsets from this base
+
+	Vec4V fattenAABBsX, fattenAABBsY, fattenAABBsZ;
+	PX_UNUSED(fattenAABBsX); PX_UNUSED(fattenAABBsY); PX_UNUSED(fattenAABBsZ);
+	if (inflate)
+	{
+		Vec4V fattenAABBs4 = Vec4V_From_PxVec3_WUndefined(*fattenAABBs);
+		fattenAABBs4 = V4Add(fattenAABBs4, epsInflateFloat4); // US2385 - shapes are "closed" meaning exactly touching shapes should report overlap
+		fattenAABBsX = V4SplatElement<0>(fattenAABBs4);
+		fattenAABBsY = V4SplatElement<1>(fattenAABBs4);
+		fattenAABBsZ = V4SplatElement<2>(fattenAABBs4);
+	}
+
+	Vec4V maxT4;
+	maxT4 = V4Load(maxT);
+	Vec4V rayP = Vec4V_From_PxVec3_WUndefined(rayOrigin);
+	Vec4V rayD = Vec4V_From_PxVec3_WUndefined(rayDir);
+	VecU32V raySign = V4U32and(VecU32V_ReinterpretFrom_Vec4V(rayD), signMask);
+	Vec4V rayDAbs = V4Abs(rayD); // abs value of rayD
+	Vec4V rayInvD = Vec4V_ReinterpretFrom_VecU32V(V4U32or(raySign, VecU32V_ReinterpretFrom_Vec4V(V4Max(rayDAbs, epsFloat4)))); // clamp near-zero components up to epsilon, then restore the original sign
+	rayD = rayInvD; // rayD now holds the clamped direction, used by the refinement step below
+
+	//rayInvD = V4Recip(rayInvD);
+	// Newton-Raphson iteration for reciprocal (see wikipedia):
+	// X[n+1] = X[n]*(2-original*X[n]), X[0] = V4RecipFast estimate
+	//rayInvD = rayInvD*(twos-rayD*rayInvD);
+	rayInvD = V4RecipFast(rayInvD); // initial estimate, not accurate enough
+	rayInvD = V4Mul(rayInvD, V4NegMulSub(rayD, rayInvD, twos)); // one refinement step as per the formula above
+
+	// P+tD=a; t=(a-P)/D
+	// t=(a - p.x)*1/d.x = a/d.x +(- p.x/d.x)
+	Vec4V rayPinvD = V4NegMulSub(rayInvD, rayP, zeroes); // the (-p/d) term of the formula above
+	Vec4V rayInvDsplatX = V4SplatElement<0>(rayInvD);
+	Vec4V rayInvDsplatY = V4SplatElement<1>(rayInvD);
+	Vec4V rayInvDsplatZ = V4SplatElement<2>(rayInvD);
+	Vec4V rayPinvDsplatX = V4SplatElement<0>(rayPinvD);
+	Vec4V rayPinvDsplatY = V4SplatElement<1>(rayPinvD);
+	Vec4V rayPinvDsplatZ = V4SplatElement<2>(rayPinvD);
+
+	PX_ASSERT(RTREE_N == 4 || RTREE_N == 8);
+	PX_ASSERT(mNumRootPages > 0);
+
+	PxU32 stackPtr = 0; // index of the next free stack slot
+	for (PxI32 j = PxI32(mNumRootPages-1); j >= 0; j --) // push roots in reverse so root page 0 is popped first
+		stack[stackPtr++] = j*sizeof(RTreePage);
+
+	PX_ALIGN_PREFIX(16) PxU32 resa[4] PX_ALIGN_SUFFIX(16); // per-lane reject mask of the current page; expected to be 0 (keep) or 0xFFFFFFFF (reject)
+
+	while (stackPtr)
+	{
+		PxU32 top = stack[--stackPtr];
+		if (top&1) // isLeaf test
+		{
+			top--; // strip the leaf bit before handing the value to the callback
+			PxF32 newMaxT = maxT;
+			if (!callback->processResults(1, &top, newMaxT))
+				return;
+			/* shrink the ray if newMaxT is reduced compared to the original maxT */
+			if (maxT != newMaxT)
+			{
+				PX_ASSERT(newMaxT < maxT);
+				maxT = newMaxT;
+				maxT4 = V4Load(newMaxT);
+			}
+			continue;
+		}
+
+		RTreePage* PX_RESTRICT tn = reinterpret_cast<RTreePage*>(treeNodes8 + top);
+		
+		// 6i load
+		Vec4V minx4a = V4LoadA(tn->minx), miny4a = V4LoadA(tn->miny), minz4a = V4LoadA(tn->minz);
+		Vec4V maxx4a = V4LoadA(tn->maxx), maxy4a = V4LoadA(tn->maxy), maxz4a = V4LoadA(tn->maxz);
+
+		// 1i disabled test
+		// AP scaffold - optimization opportunity - can save 2 instructions here
+		VecU32V ignore4a = V4IsGrtrV32u(minx4a, maxx4a); // 1 if degenerate box (empty slot in the page)
+
+		if (inflate)
+		{
+			// 6i
+			maxx4a = V4Add(maxx4a, fattenAABBsX); maxy4a = V4Add(maxy4a, fattenAABBsY); maxz4a = V4Add(maxz4a, fattenAABBsZ);
+			minx4a = V4Sub(minx4a, fattenAABBsX); miny4a = V4Sub(miny4a, fattenAABBsY); minz4a = V4Sub(minz4a, fattenAABBsZ);
+		}
+
+		// P+tD=a; t=(a-P)/D
+		// t=(a - p.x)*1/d.x = a/d.x +(- p.x/d.x)
+		// 6i
+		Vec4V tminxa0 = V4MulAdd(minx4a, rayInvDsplatX, rayPinvDsplatX);
+		Vec4V tminya0 = V4MulAdd(miny4a, rayInvDsplatY, rayPinvDsplatY);
+		Vec4V tminza0 = V4MulAdd(minz4a, rayInvDsplatZ, rayPinvDsplatZ);
+		Vec4V tmaxxa0 = V4MulAdd(maxx4a, rayInvDsplatX, rayPinvDsplatX);
+		Vec4V tmaxya0 = V4MulAdd(maxy4a, rayInvDsplatY, rayPinvDsplatY);
+		Vec4V tmaxza0 = V4MulAdd(maxz4a, rayInvDsplatZ, rayPinvDsplatZ);
+
+		// test half-spaces
+		// P+tD=dN
+		// t = (d(N,D)-(P,D))/(D,D) , (D,D)=1
+
+		// compute 4x dot products (N,D) and (P,N) for each AABB in the page
+
+		// 6i
+		// now compute tnear and tfar for each pair of planes for each box
+		Vec4V tminxa = V4Min(tminxa0, tmaxxa0); Vec4V tmaxxa = V4Max(tminxa0, tmaxxa0);
+		Vec4V tminya = V4Min(tminya0, tmaxya0); Vec4V tmaxya = V4Max(tminya0, tmaxya0);
+		Vec4V tminza = V4Min(tminza0, tmaxza0); Vec4V tmaxza = V4Max(tminza0, tmaxza0);
+
+		// 8i
+		Vec4V maxOfNeasa = V4Max(V4Max(tminxa, tminya), tminza); // latest entry time over the three slabs (tnear)
+		Vec4V minOfFarsa = V4Min(V4Min(tmaxxa, tmaxya), tmaxza); // earliest exit time over the three slabs (tfar)
+		ignore4a = V4U32or(ignore4a, V4IsGrtrV32u(epsFloat4, minOfFarsa));  // if tfar is negative, ignore since it's a ray, not a line
+		// AP scaffold: update the build to eliminate 3 more instructions for ignore4a above
+		//VecU32V ignore4a = V4IsGrtrV32u(epsFloat4, minOfFarsa);  // if tfar is negative, ignore since its a ray, not a line
+		ignore4a = V4U32or(ignore4a, V4IsGrtrV32u(maxOfNeasa, maxT4));  // if tnear is over maxT, ignore this result
+
+		// 2i
+		VecU32V resa4 = V4IsGrtrV32u(maxOfNeasa, minOfFarsa); // if 1 => fail
+		resa4 = V4U32or(resa4, ignore4a);
+
+		// 1i
+		V4U32StoreAligned(resa4, reinterpret_cast<VecU32V*>(resa));
+
+		PxU32* ptrs = (reinterpret_cast<RTreePage*>(tn))->ptrs;
+
+		stack[stackPtr] = ptrs[0]; stackPtr += (1+resa[0]); // AP scaffold TODO: use VecU32add
+		stack[stackPtr] = ptrs[1]; stackPtr += (1+resa[1]); // branch-free push: assuming a rejected lane is 0xFFFFFFFF, 1+resa wraps to 0 and the next write reuses the slot
+		stack[stackPtr] = ptrs[2]; stackPtr += (1+resa[2]);
+		stack[stackPtr] = ptrs[3]; stackPtr += (1+resa[3]);
+	}
+}
+
+// Explicit instantiations of both traverseRay variants (inflate = 0 and inflate = 1);
+// the template is defined in this .cpp, so these are the only versions emitted here.
+template void RTree::traverseRay<0>(
+	const PxVec3& rayOrigin, const PxVec3& rayDir,
+	const PxU32 maxResults, PxU32* resultsPtr, Gu::RTree::CallbackRaycast* callback,
+	const PxVec3* fattenAABBs, PxF32 maxT) const;
+template void RTree::traverseRay<1>(
+	const PxVec3& rayOrigin, const PxVec3& rayDir,
+	const PxU32 maxResults, PxU32* resultsPtr, Gu::RTree::CallbackRaycast* callback,
+	const PxVec3* fattenAABBs, PxF32 maxT) const;
+
+/////////////////////////////////////////////////////////////////////////
+// Reports every leaf entry whose stored bounds are not separated from the oriented box obb.
+//
+// obb.center, obb.extents and the columns of obb.rot (the box axes) define the query volume.
+// The tree is walked depth-first with an explicit stack of page offsets (bytes relative to mPages).
+// For the four child boxes of a page at once, two candidate separating axes are derived from
+// alternating closest-point clamps between the AABB and the OBB (D1 and D2 below) and a projection
+// test is run on each. A child is rejected if either axis separates the boxes or if its slot is
+// empty (minx > maxx). Only these two axes are tested, so children not separated by either are
+// treated as overlapping.
+// Surviving leaf children are passed to callback->processResults(1, &ptr) one at a time (ptr is the
+// child pointer with the isLeaf bit cleared); surviving inner children are pushed onto the stack.
+// The traversal stops as soon as the callback returns false.
+//
+// maxResults and resultsPtr are unused.
+void RTree::traverseOBB(
+	const Gu::Box& obb, const PxU32 maxResults, PxU32* resultsPtr, Gu::RTree::Callback* callback) const
+{
+	PX_UNUSED(resultsPtr);
+	PX_UNUSED(maxResults);
+
+	const PxU32 maxStack = 128;
+	PxU32 stack[maxStack];
+
+	PX_ASSERT(mPages);
+	PX_ASSERT((uintptr_t(mPages) & 127) == 0);
+	PX_ASSERT((uintptr_t(this) & 15) == 0);
+
+	PxU8* treeNodes8 = CAST_U8(mPages);
+	PxU32* stackPtr = stack;
+
+	const Vec4V halves = V4Load(0.5f);
+	const Vec4V eps = V4Load(1e-6f);
+
+	Vec4V obbO = Vec4V_From_PxVec3_WUndefined(obb.center);
+	Vec4V obbE = Vec4V_From_PxVec3_WUndefined(obb.extents);
+	// Gu::Box::rot matrix columns are the OBB axes
+	Vec4V obbX = Vec4V_From_PxVec3_WUndefined(obb.rot.column0);
+	Vec4V obbY = Vec4V_From_PxVec3_WUndefined(obb.rot.column1);
+	Vec4V obbZ = Vec4V_From_PxVec3_WUndefined(obb.rot.column2);
+
+#if PX_WINDOWS || PX_XBOXONE
+	// Visual Studio compiler hangs with #defines
+	// On VMX platforms we use #defines in the other branch of this #ifdef to avoid register spills (LHS)
+	Vec4V obbESplatX = V4SplatElement<0>(obbE);
+	Vec4V obbESplatY = V4SplatElement<1>(obbE);
+	Vec4V obbESplatZ = V4SplatElement<2>(obbE);
+	Vec4V obbESplatNegX = V4Sub(zeroes, obbESplatX);
+	Vec4V obbESplatNegY = V4Sub(zeroes, obbESplatY);
+	Vec4V obbESplatNegZ = V4Sub(zeroes, obbESplatZ);
+	Vec4V obbXE = V4MulAdd(obbX, obbESplatX, zeroes); // scale axes by E
+	Vec4V obbYE = V4MulAdd(obbY, obbESplatY, zeroes); // scale axes by E
+	Vec4V obbZE = V4MulAdd(obbZ, obbESplatZ, zeroes); // scale axes by E
+	Vec4V obbOSplatX = V4SplatElement<0>(obbO);
+	Vec4V obbOSplatY = V4SplatElement<1>(obbO);
+	Vec4V obbOSplatZ = V4SplatElement<2>(obbO);
+	Vec4V obbXSplatX = V4SplatElement<0>(obbX);
+	Vec4V obbXSplatY = V4SplatElement<1>(obbX);
+	Vec4V obbXSplatZ = V4SplatElement<2>(obbX);
+	Vec4V obbYSplatX = V4SplatElement<0>(obbY);
+	Vec4V obbYSplatY = V4SplatElement<1>(obbY);
+	Vec4V obbYSplatZ = V4SplatElement<2>(obbY);
+	Vec4V obbZSplatX = V4SplatElement<0>(obbZ);
+	Vec4V obbZSplatY = V4SplatElement<1>(obbZ);
+	Vec4V obbZSplatZ = V4SplatElement<2>(obbZ);
+	Vec4V obbXESplatX = V4SplatElement<0>(obbXE);
+	Vec4V obbXESplatY = V4SplatElement<1>(obbXE);
+	Vec4V obbXESplatZ = V4SplatElement<2>(obbXE);
+	Vec4V obbYESplatX = V4SplatElement<0>(obbYE);
+	Vec4V obbYESplatY = V4SplatElement<1>(obbYE);
+	Vec4V obbYESplatZ = V4SplatElement<2>(obbYE);
+	Vec4V obbZESplatX = V4SplatElement<0>(obbZE);
+	Vec4V obbZESplatY = V4SplatElement<1>(obbZE);
+	Vec4V obbZESplatZ = V4SplatElement<2>(obbZE);
+#else
+	// same quantities as the other branch, expressed as macros; they are #undef'd at the end of this function
+	#define obbESplatX V4SplatElement<0>(obbE)
+	#define obbESplatY V4SplatElement<1>(obbE)
+	#define obbESplatZ V4SplatElement<2>(obbE)
+	#define obbESplatNegX V4Sub(zeroes, obbESplatX)
+	#define obbESplatNegY V4Sub(zeroes, obbESplatY)
+	#define obbESplatNegZ V4Sub(zeroes, obbESplatZ)
+	#define obbXE V4MulAdd(obbX, obbESplatX, zeroes)
+	#define obbYE V4MulAdd(obbY, obbESplatY, zeroes)
+	#define obbZE V4MulAdd(obbZ, obbESplatZ, zeroes)
+	#define obbOSplatX V4SplatElement<0>(obbO)
+	#define obbOSplatY V4SplatElement<1>(obbO)
+	#define obbOSplatZ V4SplatElement<2>(obbO)
+	#define obbXSplatX V4SplatElement<0>(obbX)
+	#define obbXSplatY V4SplatElement<1>(obbX)
+	#define obbXSplatZ V4SplatElement<2>(obbX)
+	#define obbYSplatX V4SplatElement<0>(obbY)
+	#define obbYSplatY V4SplatElement<1>(obbY)
+	#define obbYSplatZ V4SplatElement<2>(obbY)
+	#define obbZSplatX V4SplatElement<0>(obbZ)
+	#define obbZSplatY V4SplatElement<1>(obbZ)
+	#define obbZSplatZ V4SplatElement<2>(obbZ)
+	#define obbXESplatX V4SplatElement<0>(obbXE)
+	#define obbXESplatY V4SplatElement<1>(obbXE)
+	#define obbXESplatZ V4SplatElement<2>(obbXE)
+	#define obbYESplatX V4SplatElement<0>(obbYE)
+	#define obbYESplatY V4SplatElement<1>(obbYE)
+	#define obbYESplatZ V4SplatElement<2>(obbYE)
+	#define obbZESplatX V4SplatElement<0>(obbZE)
+	#define obbZESplatY V4SplatElement<1>(obbZE)
+	#define obbZESplatZ V4SplatElement<2>(obbZE)
+#endif
+
+	PX_ASSERT(mPageSize == 4 || mPageSize == 8);
+	PX_ASSERT(mNumRootPages > 0);
+
+	// push the root pages in reverse so that root page 0 ends up on top of the stack
+	for (PxI32 j = PxI32(mNumRootPages-1); j >= 0; j --)
+		*stackPtr++ = j*sizeof(RTreePage);
+	// cacheTop mirrors the most recently pushed value so the pop can avoid re-reading it from memory;
+	// root page 0 (offset 0) was pushed last, hence the initial value
+	PxU32 cacheTopValid = true;
+	PxU32 cacheTop = 0;
+
+	PX_ALIGN_PREFIX(16) PxU32 resa_[4] PX_ALIGN_SUFFIX(16); // per-lane result of the current page: non-zero => visit that child
+
+	do {
+		stackPtr--;
+
+		PxU32 top;
+		if (cacheTopValid) // branch is faster than lhs
+			top = cacheTop;
+		else
+			top = stackPtr[0];
+		PX_ASSERT(!cacheTopValid || top == cacheTop);
+		RTreePage* PX_RESTRICT tn = reinterpret_cast<RTreePage*>(treeNodes8 + top);
+		
+		const PxU32 offs = 0;
+		PxU32* ptrs = (reinterpret_cast<RTreePage*>(tn))->ptrs;
+
+		// 6i
+		Vec4V minx4a = V4LoadA(tn->minx+offs);
+		Vec4V miny4a = V4LoadA(tn->miny+offs);
+		Vec4V minz4a = V4LoadA(tn->minz+offs);
+		Vec4V maxx4a = V4LoadA(tn->maxx+offs);
+		Vec4V maxy4a = V4LoadA(tn->maxy+offs);
+		Vec4V maxz4a = V4LoadA(tn->maxz+offs);
+
+		VecU32V noOverlapa;
+		VecU32V resa4u;
+		{
+			// PRECOMPUTE FOR A BLOCK
+			// 109 instr per 4 OBB/AABB
+			// ABB iteration 1, start with OBB origin as other point -- 6
+			Vec4V p1ABBxa = V4Max(minx4a, V4Min(maxx4a, obbOSplatX));
+			Vec4V p1ABBya = V4Max(miny4a, V4Min(maxy4a, obbOSplatY));
+			Vec4V p1ABBza = V4Max(minz4a, V4Min(maxz4a, obbOSplatZ));
+
+			// OBB iteration 1, move to OBB space first -- 12
+			Vec4V p1ABBOxa = V4Sub(p1ABBxa, obbOSplatX);
+			Vec4V p1ABBOya = V4Sub(p1ABBya, obbOSplatY);
+			Vec4V p1ABBOza = V4Sub(p1ABBza, obbOSplatZ);
+			Vec4V obbPrjXa = V4MulAdd(p1ABBOxa, obbXSplatX, V4MulAdd(p1ABBOya, obbXSplatY, V4MulAdd(p1ABBOza, obbXSplatZ, zeroes)));
+			Vec4V obbPrjYa = V4MulAdd(p1ABBOxa, obbYSplatX, V4MulAdd(p1ABBOya, obbYSplatY, V4MulAdd(p1ABBOza, obbYSplatZ, zeroes)));
+			Vec4V obbPrjZa = V4MulAdd(p1ABBOxa, obbZSplatX, V4MulAdd(p1ABBOya, obbZSplatY, V4MulAdd(p1ABBOza, obbZSplatZ, zeroes)));
+			// clamp AABB point in OBB space to the OBB extents; the projection above uses the
+			// unscaled axes, so the clamp range is [-E, E] per axis -- 6
+			Vec4V pOBBxa = V4Max(obbESplatNegX, V4Min(obbPrjXa, obbESplatX));
+			Vec4V pOBBya = V4Max(obbESplatNegY, V4Min(obbPrjYa, obbESplatY));
+			Vec4V pOBBza = V4Max(obbESplatNegZ, V4Min(obbPrjZa, obbESplatZ));
+			// go back to AABB space. we have x,y,z in obb space, need to multiply by axes -- 9
+			Vec4V p1OBBxa = V4MulAdd(pOBBxa, obbXSplatX, V4MulAdd(pOBBya, obbYSplatX, V4MulAdd(pOBBza, obbZSplatX, obbOSplatX)));
+			Vec4V p1OBBya = V4MulAdd(pOBBxa, obbXSplatY, V4MulAdd(pOBBya, obbYSplatY, V4MulAdd(pOBBza, obbZSplatY, obbOSplatY)));
+			Vec4V p1OBBza = V4MulAdd(pOBBxa, obbXSplatZ, V4MulAdd(pOBBya, obbYSplatZ, V4MulAdd(pOBBza, obbZSplatZ, obbOSplatZ)));
+
+			// ABB iteration 2 -- 6 instructions
+			Vec4V p2ABBxa = V4Max(minx4a, V4Min(maxx4a, p1OBBxa));
+			Vec4V p2ABBya = V4Max(miny4a, V4Min(maxy4a, p1OBBya));
+			Vec4V p2ABBza = V4Max(minz4a, V4Min(maxz4a, p1OBBza));
+			// above blocks add up to 12+12+15=39 instr
+			// END PRECOMPUTE FOR A BLOCK
+
+			// for AABBs precompute extents and center -- 9i
+			Vec4V abbCxa = V4MulAdd(V4Add(maxx4a, minx4a), halves, zeroes);
+			Vec4V abbCya = V4MulAdd(V4Add(maxy4a, miny4a), halves, zeroes);
+			Vec4V abbCza = V4MulAdd(V4Add(maxz4a, minz4a), halves, zeroes);
+			Vec4V abbExa = V4Sub(maxx4a, abbCxa);
+			Vec4V abbEya = V4Sub(maxy4a, abbCya);
+			Vec4V abbEza = V4Sub(maxz4a, abbCza);
+
+			// now test separating axes D1 = p1OBB-p1ABB and D2 = p1OBB-p2ABB -- 37 instructions per axis
+			// D1 first -- 3 instructions
+			Vec4V d1xa = V4Sub(p1OBBxa, p1ABBxa), d1ya = V4Sub(p1OBBya, p1ABBya), d1za = V4Sub(p1OBBza, p1ABBza);
+
+			// for AABB compute projections of extents and center -- 6
+			Vec4V abbExd1Prja = V4MulAdd(d1xa, abbExa, zeroes);
+			Vec4V abbEyd1Prja = V4MulAdd(d1ya, abbEya, zeroes);
+			Vec4V abbEzd1Prja = V4MulAdd(d1za, abbEza, zeroes);
+			Vec4V abbCd1Prja = V4MulAdd(d1xa, abbCxa, V4MulAdd(d1ya, abbCya, V4MulAdd(d1za, abbCza, zeroes)));
+
+			// for obb project each halfaxis and origin and add abs values of half-axis projections -- 12 instructions
+			Vec4V obbXEd1Prja = V4MulAdd(d1xa, obbXESplatX, V4MulAdd(d1ya, obbXESplatY, V4MulAdd(d1za, obbXESplatZ, zeroes)));
+			Vec4V obbYEd1Prja = V4MulAdd(d1xa, obbYESplatX, V4MulAdd(d1ya, obbYESplatY, V4MulAdd(d1za, obbYESplatZ, zeroes)));
+			Vec4V obbZEd1Prja = V4MulAdd(d1xa, obbZESplatX, V4MulAdd(d1ya, obbZESplatY, V4MulAdd(d1za, obbZESplatZ, zeroes)));
+			Vec4V obbOd1Prja = V4MulAdd(d1xa, obbOSplatX, V4MulAdd(d1ya, obbOSplatY, V4MulAdd(d1za, obbOSplatZ, zeroes)));
+
+			// compare lengths between projected centers with sum of projected radii -- 16i
+			Vec4V originDiffd1a = v_absm(V4Sub(abbCd1Prja, obbOd1Prja));
+			Vec4V absABBRd1a = V4Add(V4Add(v_absm(abbExd1Prja), v_absm(abbEyd1Prja)), v_absm(abbEzd1Prja));
+			Vec4V absOBBRd1a = V4Add(V4Add(v_absm(obbXEd1Prja), v_absm(obbYEd1Prja)), v_absm(obbZEd1Prja));
+			VecU32V noOverlapd1a = V4IsGrtrV32u(V4Sub(originDiffd1a, eps), V4Add(absABBRd1a, absOBBRd1a));
+			VecU32V epsNoOverlapd1a = V4IsGrtrV32u(originDiffd1a, eps); // only trust D1 when the projected center distance exceeds eps
+
+			// D2 next (35 instr)
+			// 3i
+			Vec4V d2xa = V4Sub(p1OBBxa, p2ABBxa), d2ya = V4Sub(p1OBBya, p2ABBya), d2za = V4Sub(p1OBBza, p2ABBza);
+			// for AABB compute projections of extents and center -- 6
+			Vec4V abbExd2Prja = V4MulAdd(d2xa, abbExa, zeroes);
+			Vec4V abbEyd2Prja = V4MulAdd(d2ya, abbEya, zeroes);
+			Vec4V abbEzd2Prja = V4MulAdd(d2za, abbEza, zeroes);
+			Vec4V abbCd2Prja = V4MulAdd(d2xa, abbCxa, V4MulAdd(d2ya, abbCya, V4MulAdd(d2za, abbCza, zeroes)));
+			// for obb project each halfaxis and origin and add abs values of half-axis projections -- 12i
+			Vec4V obbXEd2Prja = V4MulAdd(d2xa, obbXESplatX, V4MulAdd(d2ya, obbXESplatY, V4MulAdd(d2za, obbXESplatZ, zeroes)));
+			Vec4V obbYEd2Prja = V4MulAdd(d2xa, obbYESplatX, V4MulAdd(d2ya, obbYESplatY, V4MulAdd(d2za, obbYESplatZ, zeroes)));
+			Vec4V obbZEd2Prja = V4MulAdd(d2xa, obbZESplatX, V4MulAdd(d2ya, obbZESplatY, V4MulAdd(d2za, obbZESplatZ, zeroes)));
+			Vec4V obbOd2Prja = V4MulAdd(d2xa, obbOSplatX, V4MulAdd(d2ya, obbOSplatY, V4MulAdd(d2za, obbOSplatZ, zeroes)));
+			// compare lengths between projected centers with sum of projected radii -- 16i
+			Vec4V originDiffd2a = v_absm(V4Sub(abbCd2Prja, obbOd2Prja));
+			Vec4V absABBRd2a = V4Add(V4Add(v_absm(abbExd2Prja), v_absm(abbEyd2Prja)), v_absm(abbEzd2Prja));
+			Vec4V absOBBRd2a = V4Add(V4Add(v_absm(obbXEd2Prja), v_absm(obbYEd2Prja)), v_absm(obbZEd2Prja));
+			VecU32V noOverlapd2a = V4IsGrtrV32u(V4Sub(originDiffd2a, eps), V4Add(absABBRd2a, absOBBRd2a));
+			VecU32V epsNoOverlapd2a = V4IsGrtrV32u(originDiffd2a, eps); // only trust D2 when the projected center distance exceeds eps
+
+			// 8i
+			// reject a lane if either axis separates the boxes or the slot is empty
+			noOverlapa = V4U32or(V4U32and(noOverlapd1a, epsNoOverlapd1a), V4U32and(noOverlapd2a, epsNoOverlapd2a));
+			VecU32V ignore4a = V4IsGrtrV32u(minx4a, maxx4a); // 1 if degenerate box (empty slot)
+			noOverlapa = V4U32or(noOverlapa, ignore4a);
+			resa4u = V4U32Andc(U4Load(1), noOverlapa); // 1 & ~noOverlap
+			V4U32StoreAligned(resa4u, reinterpret_cast<VecU32V*>(resa_));
+			///// 8+16+12+6+3+16+12+6+3+9+6+9+6+12+6+6=136i from load to result
+		}
+
+		cacheTopValid = false;
+		for (PxU32 i = 0; i < 4; i++)
+		{
+			PxU32 ptr = ptrs[i+offs] & ~1; // clear the isLeaf bit
+			if (resa_[i])
+			{
+				if (tn->isLeaf(i))
+				{
+					// report the leaf; a false return aborts the whole traversal
+					if (!callback->processResults(1, &ptr))
+						return;
+				}
+				else
+				{
+					*(stackPtr++) = ptr;
+					cacheTop = ptr;
+					cacheTopValid = true;
+				}
+			}
+		}
+	} while (stackPtr > stack);
+
+#if !(PX_WINDOWS || PX_XBOXONE)
+	// On these platforms the splat helpers above are macros; undefine them so they cannot leak
+	// into code that follows this function in the same translation unit.
+	#undef obbESplatX
+	#undef obbESplatY
+	#undef obbESplatZ
+	#undef obbESplatNegX
+	#undef obbESplatNegY
+	#undef obbESplatNegZ
+	#undef obbXE
+	#undef obbYE
+	#undef obbZE
+	#undef obbOSplatX
+	#undef obbOSplatY
+	#undef obbOSplatZ
+	#undef obbXSplatX
+	#undef obbXSplatY
+	#undef obbXSplatZ
+	#undef obbYSplatX
+	#undef obbYSplatY
+	#undef obbYSplatZ
+	#undef obbZSplatX
+	#undef obbZSplatY
+	#undef obbZSplatZ
+	#undef obbXESplatX
+	#undef obbXESplatY
+	#undef obbXESplatZ
+	#undef obbYESplatX
+	#undef obbYESplatY
+	#undef obbYESplatZ
+	#undef obbZESplatX
+	#undef obbZESplatY
+	#undef obbZESplatZ
+#endif
+}
+
+} // namespace Gu
+
+}
author	git perforce import user <a@b>	2016-10-25 12:29:14 -0600
committer	Sheikh Dawood Abdul Ajees <Sheikh Dawood Abdul Ajees>	2016-10-25 18:56:37 -0500
commit	3dfe2108cfab31ba3ee5527e217d0d8e99a51162 (patch)
tree	fa6485c169e50d7415a651bf838f5bcd0fd3bfbd /PhysX_3.4/Source/GeomUtils/src/mesh/GuRTreeQueries.cpp
download	physx-3.4-3dfe2108cfab31ba3ee5527e217d0d8e99a51162.tar.xz physx-3.4-3dfe2108cfab31ba3ee5527e217d0d8e99a51162.zip