1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
|
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2018 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
/*
General notes:
rtree depth-first traversal looks like this:
push top level page onto stack
pop page from stack
for each node in page
if node overlaps with testrect
push node's subpage
we want to efficiently keep track of current stack level to know if the current page is a leaf or not
(since we don't store a flag with the page due to no space, we can't determine it just by looking at current page)
since we traverse depth first, the levels for nodes on the stack look like this:
l0 l0 l1 l2 l2 l3 l3 l3 l4
we can encode this as an array of 4 bits per level count into a 32-bit integer
to simplify the code->level computation we also keep track of current level by incrementing the level whenever any subpages
from current test page are pushed onto the stack
when we pop a page off the stack we use this encoding to determine if we should decrement the stack level
*/
#include "foundation/PxBounds3.h"
#include "GuRTree.h"
#include "PsIntrinsics.h"
#include "GuBox.h"
#include "PsVecMath.h"
#include "PxQueryReport.h" // for PxAgain
#include "PsBitUtils.h"
#include "GuBVConstants.h"
//#define VERIFY_RTREE
#ifdef VERIFY_RTREE
#include "GuIntersectionRayBox.h"
#include "GuIntersectionBoxBox.h"
#include "stdio.h"
#endif
using namespace physx;
using namespace physx::shdfnd;
using namespace Ps::aos;
namespace physx
{
namespace Gu {
using namespace Ps::aos;
#define v_absm(a) V4Andc(a, signMask)
#define V4FromF32A(x) V4LoadA(x)
#define PxF32FV(x) FStore(x)
#define CAST_U8(a) reinterpret_cast<PxU8*>(a)
/////////////////////////////////////////////////////////////////////////
// Depth-first traversal of the RTree reporting every leaf whose page-slot
// AABB overlaps the query box [boxMin, boxMax]. Overlapping leaves are
// reported one at a time via callback->processResults(); traversal aborts
// early if the callback returns false. resultsPtr/maxResults are unused
// here - all results flow through the callback.
void RTree::traverseAABB(const PxVec3& boxMin, const PxVec3& boxMax, const PxU32 maxResults, PxU32* resultsPtr, Callback* callback) const
{
PX_UNUSED(resultsPtr);
PX_ASSERT(callback);
PX_ASSERT(maxResults >= mPageSize);
PX_UNUSED(maxResults);
// fixed-size traversal stack of byte offsets into mPages
// NOTE(review): there is no overflow check against maxStack - presumably
// bounded by tree depth * fanout; confirm for very deep trees
const PxU32 maxStack = 128;
PxU32 stack1[maxStack];
// start one past the array base so the do/while termination test
// (stackPtr > stack) is safe with the post-decrement pop below
PxU32* stack = stack1+1;
PX_ASSERT(mPages);
PX_ASSERT((uintptr_t(mPages) & 127) == 0);
PX_ASSERT((uintptr_t(this) & 15) == 0);
// conservatively quantize the input box
// splat each query-box component so a single SIMD compare covers all
// RTREE_N boxes of a page at once
Vec4V nqMin = Vec4V_From_PxVec3_WUndefined(boxMin);
Vec4V nqMax = Vec4V_From_PxVec3_WUndefined(boxMax);
Vec4V nqMinx4 = V4SplatElement<0>(nqMin);
Vec4V nqMiny4 = V4SplatElement<1>(nqMin);
Vec4V nqMinz4 = V4SplatElement<2>(nqMin);
Vec4V nqMaxx4 = V4SplatElement<0>(nqMax);
Vec4V nqMaxy4 = V4SplatElement<1>(nqMax);
Vec4V nqMaxz4 = V4SplatElement<2>(nqMax);
// on 64-bit platforms the dynamic rtree pointer is also relative to mPages
PxU8* treeNodes8 = CAST_U8(mPages);
PxU32* stackPtr = stack;
// AP potential perf optimization - fetch the top level right away
PX_ASSERT(RTREE_N == 4 || RTREE_N == 8);
PX_ASSERT(Ps::isPowerOfTwo(mPageSize));
// seed the stack with all root pages (pushed in reverse so page 0 pops first)
for (PxI32 j = PxI32(mNumRootPages-1); j >= 0; j --)
*stackPtr++ = j*sizeof(RTreePage);
// cacheTop mirrors the current top-of-stack value in a register to avoid
// immediately re-loading a stack slot that was just stored (load-hit-store)
PxU32 cacheTopValid = true;
PxU32 cacheTop = 0;
do {
stackPtr--;
PxU32 top;
if (cacheTopValid) // branch is faster than lhs
top = cacheTop;
else
top = stackPtr[0];
PX_ASSERT(!cacheTopValid || stackPtr[0] == cacheTop);
// top is a byte offset of the current page relative to mPages
RTreePage* PX_RESTRICT tn = reinterpret_cast<RTreePage*>(treeNodes8 + top);
const PxU32* ptrs = (reinterpret_cast<RTreePage*>(tn))->ptrs;
// load the page bounds (structure-of-arrays layout, 16-byte aligned)
Vec4V minx4 = V4LoadA(tn->minx);
Vec4V miny4 = V4LoadA(tn->miny);
Vec4V minz4 = V4LoadA(tn->minz);
Vec4V maxx4 = V4LoadA(tn->maxx);
Vec4V maxy4 = V4LoadA(tn->maxy);
Vec4V maxz4 = V4LoadA(tn->maxz);
// AABB/AABB overlap test
// lane i of resx is all-ones iff box i is separated from the query box
// along at least one axis (i.e. does NOT overlap)
BoolV res0 = V4IsGrtr(nqMinx4, maxx4); BoolV res1 = V4IsGrtr(nqMiny4, maxy4); BoolV res2 = V4IsGrtr(nqMinz4, maxz4);
BoolV res3 = V4IsGrtr(minx4, nqMaxx4); BoolV res4 = V4IsGrtr(miny4, nqMaxy4); BoolV res5 = V4IsGrtr(minz4, nqMaxz4);
BoolV resx = BOr(BOr(BOr(res0, res1), BOr(res2, res3)), BOr(res4, res5));
PX_ALIGN_PREFIX(16) PxU32 resa[RTREE_N] PX_ALIGN_SUFFIX(16);
VecU32V res4x = VecU32V_From_BoolV(resx);
U4StoreA(res4x, resa);
cacheTopValid = false;
for (PxU32 i = 0; i < RTREE_N; i++)
{
PxU32 ptr = ptrs[i] & ~1; // clear the isLeaf bit
if (resa[i])
continue; // separated - skip this slot
if (tn->isLeaf(i))
{
// leaf slot: ptr is the object index; a false return aborts traversal
if (!callback->processResults(1, &ptr))
return;
}
else
{
// internal slot: ptr is a byte offset of a child page; push it and
// remember it as the cached top of stack
*(stackPtr++) = ptr;
cacheTop = ptr;
cacheTopValid = true;
}
}
} while (stackPtr > stack);
}
/////////////////////////////////////////////////////////////////////////
// Depth-first raycast traversal of the RTree using a SIMD Kay-Kajiya slab
// test against all boxes of a page at once.
// Template parameter inflate: 0 = plain raycast; non-zero = sweep-style
// query where every tree AABB is fattened by *fattenAABBs (plus a small
// epsilon, see US2385 note below) before the ray test.
// Leaf hits are reported via callback->processResults(), which may shrink
// maxT to clip the remaining ray; a false return aborts the traversal.
// resultsPtr/maxResults are unused - results flow through the callback.
template <int inflate>
void RTree::traverseRay(
const PxVec3& rayOrigin, const PxVec3& rayDir,
const PxU32 maxResults, PxU32* resultsPtr, Gu::RTree::CallbackRaycast* callback,
const PxVec3* fattenAABBs, PxF32 maxT) const
{
// implements Kay-Kajiya 4-way SIMD test
PX_UNUSED(resultsPtr);
PX_UNUSED(maxResults);
// fixed-size traversal stack; entries are either page byte-offsets (even)
// or leaf object indices with the low isLeaf bit set (odd)
// NOTE(review): no overflow check against maxStack - presumably bounded by
// tree depth * fanout; confirm for very deep trees
const PxU32 maxStack = 128;
PxU32 stack1[maxStack];
PxU32* stack = stack1+1;
PX_ASSERT(mPages);
PX_ASSERT((uintptr_t(mPages) & 127) == 0);
PX_ASSERT((uintptr_t(this) & 15) == 0);
PxU8* treeNodes8 = CAST_U8(mPages);
Vec4V fattenAABBsX, fattenAABBsY, fattenAABBsZ;
PX_UNUSED(fattenAABBsX); PX_UNUSED(fattenAABBsY); PX_UNUSED(fattenAABBsZ);
if (inflate)
{
Vec4V fattenAABBs4 = Vec4V_From_PxVec3_WUndefined(*fattenAABBs);
fattenAABBs4 = V4Add(fattenAABBs4, epsInflateFloat4); // US2385 - shapes are "closed" meaning exactly touching shapes should report overlap
fattenAABBsX = V4SplatElement<0>(fattenAABBs4);
fattenAABBsY = V4SplatElement<1>(fattenAABBs4);
fattenAABBsZ = V4SplatElement<2>(fattenAABBs4);
}
Vec4V maxT4;
maxT4 = V4Load(maxT);
Vec4V rayP = Vec4V_From_PxVec3_WUndefined(rayOrigin);
Vec4V rayD = Vec4V_From_PxVec3_WUndefined(rayDir);
// clamp near-zero direction components up to +/-epsilon (sign preserved
// via the extracted sign bits) to avoid dividing by zero in the slab test
VecU32V raySign = V4U32and(VecU32V_ReinterpretFrom_Vec4V(rayD), signMask);
Vec4V rayDAbs = V4Abs(rayD); // abs value of rayD
Vec4V rayInvD = Vec4V_ReinterpretFrom_VecU32V(V4U32or(raySign, VecU32V_ReinterpretFrom_Vec4V(V4Max(rayDAbs, epsFloat4)))); // clamp near-zero components up to epsilon
rayD = rayInvD;
//rayInvD = V4Recip(rayInvD);
// Newton-Raphson iteration for reciprocal (see wikipedia):
// X[n+1] = X[n]*(2-original*X[n]), X[0] = V4RecipFast estimate
//rayInvD = rayInvD*(twos-rayD*rayInvD);
rayInvD = V4RecipFast(rayInvD); // initial estimate, not accurate enough
rayInvD = V4Mul(rayInvD, V4NegMulSub(rayD, rayInvD, twos)); // one refinement step
// P+tD=a; t=(a-P)/D
// t=(a - p.x)*1/d.x = a/d.x +(- p.x/d.x)
// precompute -P/D so each slab t below is a single multiply-add
Vec4V rayPinvD = V4NegMulSub(rayInvD, rayP, zeroes);
Vec4V rayInvDsplatX = V4SplatElement<0>(rayInvD);
Vec4V rayInvDsplatY = V4SplatElement<1>(rayInvD);
Vec4V rayInvDsplatZ = V4SplatElement<2>(rayInvD);
Vec4V rayPinvDsplatX = V4SplatElement<0>(rayPinvD);
Vec4V rayPinvDsplatY = V4SplatElement<1>(rayPinvD);
Vec4V rayPinvDsplatZ = V4SplatElement<2>(rayPinvD);
PX_ASSERT(RTREE_N == 4 || RTREE_N == 8);
PX_ASSERT(mNumRootPages > 0);
// seed the stack with all root pages (reverse order so page 0 pops first)
PxU32 stackPtr = 0;
for (PxI32 j = PxI32(mNumRootPages-1); j >= 0; j --)
stack[stackPtr++] = j*sizeof(RTreePage);
PX_ALIGN_PREFIX(16) PxU32 resa[4] PX_ALIGN_SUFFIX(16);
while (stackPtr)
{
PxU32 top = stack[--stackPtr];
if (top&1) // isLeaf test
{
// leaf: strip the isLeaf bit to recover the object index
top--;
PxF32 newMaxT = maxT;
if (!callback->processResults(1, &top, newMaxT))
return;
/* shrink the ray if newMaxT is reduced compared to the original maxT */
if (maxT != newMaxT)
{
PX_ASSERT(newMaxT < maxT);
maxT = newMaxT;
maxT4 = V4Load(newMaxT);
}
continue;
}
RTreePage* PX_RESTRICT tn = reinterpret_cast<RTreePage*>(treeNodes8 + top);
// 6i load
Vec4V minx4a = V4LoadA(tn->minx), miny4a = V4LoadA(tn->miny), minz4a = V4LoadA(tn->minz);
Vec4V maxx4a = V4LoadA(tn->maxx), maxy4a = V4LoadA(tn->maxy), maxz4a = V4LoadA(tn->maxz);
// 1i disabled test
// AP scaffold - optimization opportunity - can save 2 instructions here
VecU32V ignore4a = V4IsGrtrV32u(minx4a, maxx4a); // 1 if degenerate box (empty slot in the page)
if (inflate)
{
// 6i
maxx4a = V4Add(maxx4a, fattenAABBsX); maxy4a = V4Add(maxy4a, fattenAABBsY); maxz4a = V4Add(maxz4a, fattenAABBsZ);
minx4a = V4Sub(minx4a, fattenAABBsX); miny4a = V4Sub(miny4a, fattenAABBsY); minz4a = V4Sub(minz4a, fattenAABBsZ);
}
// P+tD=a; t=(a-P)/D
// t=(a - p.x)*1/d.x = a/d.x +(- p.x/d.x)
// 6i -- slab entry/exit parameters for all 4 boxes at once
Vec4V tminxa0 = V4MulAdd(minx4a, rayInvDsplatX, rayPinvDsplatX);
Vec4V tminya0 = V4MulAdd(miny4a, rayInvDsplatY, rayPinvDsplatY);
Vec4V tminza0 = V4MulAdd(minz4a, rayInvDsplatZ, rayPinvDsplatZ);
Vec4V tmaxxa0 = V4MulAdd(maxx4a, rayInvDsplatX, rayPinvDsplatX);
Vec4V tmaxya0 = V4MulAdd(maxy4a, rayInvDsplatY, rayPinvDsplatY);
Vec4V tmaxza0 = V4MulAdd(maxz4a, rayInvDsplatZ, rayPinvDsplatZ);
// test half-spaces
// P+tD=dN
// t = (d(N,D)-(P,D))/(D,D) , (D,D)=1
// compute 4x dot products (N,D) and (P,N) for each AABB in the page
// 6i
// now compute tnear and tfar for each pair of planes for each box
Vec4V tminxa = V4Min(tminxa0, tmaxxa0); Vec4V tmaxxa = V4Max(tminxa0, tmaxxa0);
Vec4V tminya = V4Min(tminya0, tmaxya0); Vec4V tmaxya = V4Max(tminya0, tmaxya0);
Vec4V tminza = V4Min(tminza0, tmaxza0); Vec4V tmaxza = V4Max(tminza0, tmaxza0);
// 8i
Vec4V maxOfNeasa = V4Max(V4Max(tminxa, tminya), tminza);
Vec4V minOfFarsa = V4Min(V4Min(tmaxxa, tmaxya), tmaxza);
ignore4a = V4U32or(ignore4a, V4IsGrtrV32u(epsFloat4, minOfFarsa)); // if tfar is negative, ignore since its a ray, not a line
// AP scaffold: update the build to eliminate 3 more instructions for ignore4a above
//VecU32V ignore4a = V4IsGrtrV32u(epsFloat4, minOfFarsa); // if tfar is negative, ignore since its a ray, not a line
ignore4a = V4U32or(ignore4a, V4IsGrtrV32u(maxOfNeasa, maxT4)); // if tnear is over maxT, ignore this result
// 2i
VecU32V resa4 = V4IsGrtrV32u(maxOfNeasa, minOfFarsa); // if 1 => fail
resa4 = V4U32or(resa4, ignore4a);
// 1i
V4U32StoreAligned(resa4, reinterpret_cast<VecU32V*>(resa));
// branchless push: each child is stored speculatively; the stack pointer
// only advances when resa[i]==0 (resa[i] is 0xFFFFFFFF on rejection, so
// 1+resa[i] wraps to 0 and the slot is overwritten by the next store)
// NOTE(review): only ptrs[0..3] are pushed although the assert above
// allows RTREE_N==8 - confirm this build always uses 4-wide pages
PxU32* ptrs = (reinterpret_cast<RTreePage*>(tn))->ptrs;
stack[stackPtr] = ptrs[0]; stackPtr += (1+resa[0]); // AP scaffold TODO: use VecU32add
stack[stackPtr] = ptrs[1]; stackPtr += (1+resa[1]);
stack[stackPtr] = ptrs[2]; stackPtr += (1+resa[2]);
stack[stackPtr] = ptrs[3]; stackPtr += (1+resa[3]);
}
}
//explicit template instantiation
template void RTree::traverseRay<0>(const PxVec3&, const PxVec3&, const PxU32, PxU32*, Gu::RTree::CallbackRaycast*, const PxVec3*, PxF32) const;
template void RTree::traverseRay<1>(const PxVec3&, const PxVec3&, const PxU32, PxU32*, Gu::RTree::CallbackRaycast*, const PxVec3*, PxF32) const;
/////////////////////////////////////////////////////////////////////////
// Depth-first traversal reporting every leaf whose page-slot AABB overlaps
// the query OBB. Candidate separating axes (D1, D2 below) are derived from
// two alternating closest-point iterations between the OBB and each AABB,
// then tested with projected radii; an eps tolerance makes near-touching
// pairs count as overlapping.
// NOTE(review): this is not a full 15-axis SAT - the two derived axes are
// treated as sufficient for rejection; confirm against the algorithm notes
// before modifying the test.
// resultsPtr/maxResults are unused - results flow through the callback.
void RTree::traverseOBB(
const Gu::Box& obb, const PxU32 maxResults, PxU32* resultsPtr, Gu::RTree::Callback* callback) const
{
PX_UNUSED(resultsPtr);
PX_UNUSED(maxResults);
// fixed-size traversal stack of page byte-offsets into mPages
// NOTE(review): no overflow check against maxStack - presumably bounded by
// tree depth * fanout; confirm for very deep trees
const PxU32 maxStack = 128;
PxU32 stack[maxStack];
PX_ASSERT(mPages);
PX_ASSERT((uintptr_t(mPages) & 127) == 0);
PX_ASSERT((uintptr_t(this) & 15) == 0);
PxU8* treeNodes8 = CAST_U8(mPages);
PxU32* stackPtr = stack;
Vec4V ones, halves, eps;
ones = V4Load(1.0f);
halves = V4Load(0.5f);
eps = V4Load(1e-6f);
PX_UNUSED(ones);
Vec4V obbO = Vec4V_From_PxVec3_WUndefined(obb.center);
Vec4V obbE = Vec4V_From_PxVec3_WUndefined(obb.extents);
// Gu::Box::rot matrix columns are the OBB axes
Vec4V obbX = Vec4V_From_PxVec3_WUndefined(obb.rot.column0);
Vec4V obbY = Vec4V_From_PxVec3_WUndefined(obb.rot.column1);
Vec4V obbZ = Vec4V_From_PxVec3_WUndefined(obb.rot.column2);
#if PX_WINDOWS || PX_XBOXONE
// Visual Studio compiler hangs with #defines
// On VMX platforms we use #defines in the other branch of this #ifdef to avoid register spills (LHS)
// precompute every splat of the OBB origin/extents/axes used by the
// per-page test below; the #else branch computes the same quantities
// lazily via macros
Vec4V obbESplatX = V4SplatElement<0>(obbE);
Vec4V obbESplatY = V4SplatElement<1>(obbE);
Vec4V obbESplatZ = V4SplatElement<2>(obbE);
Vec4V obbESplatNegX = V4Sub(zeroes, obbESplatX);
Vec4V obbESplatNegY = V4Sub(zeroes, obbESplatY);
Vec4V obbESplatNegZ = V4Sub(zeroes, obbESplatZ);
Vec4V obbXE = V4MulAdd(obbX, obbESplatX, zeroes); // scale axes by E
Vec4V obbYE = V4MulAdd(obbY, obbESplatY, zeroes); // scale axes by E
Vec4V obbZE = V4MulAdd(obbZ, obbESplatZ, zeroes); // scale axes by E
Vec4V obbOSplatX = V4SplatElement<0>(obbO);
Vec4V obbOSplatY = V4SplatElement<1>(obbO);
Vec4V obbOSplatZ = V4SplatElement<2>(obbO);
Vec4V obbXSplatX = V4SplatElement<0>(obbX);
Vec4V obbXSplatY = V4SplatElement<1>(obbX);
Vec4V obbXSplatZ = V4SplatElement<2>(obbX);
Vec4V obbYSplatX = V4SplatElement<0>(obbY);
Vec4V obbYSplatY = V4SplatElement<1>(obbY);
Vec4V obbYSplatZ = V4SplatElement<2>(obbY);
Vec4V obbZSplatX = V4SplatElement<0>(obbZ);
Vec4V obbZSplatY = V4SplatElement<1>(obbZ);
Vec4V obbZSplatZ = V4SplatElement<2>(obbZ);
Vec4V obbXESplatX = V4SplatElement<0>(obbXE);
Vec4V obbXESplatY = V4SplatElement<1>(obbXE);
Vec4V obbXESplatZ = V4SplatElement<2>(obbXE);
Vec4V obbYESplatX = V4SplatElement<0>(obbYE);
Vec4V obbYESplatY = V4SplatElement<1>(obbYE);
Vec4V obbYESplatZ = V4SplatElement<2>(obbYE);
Vec4V obbZESplatX = V4SplatElement<0>(obbZE);
Vec4V obbZESplatY = V4SplatElement<1>(obbZE);
Vec4V obbZESplatZ = V4SplatElement<2>(obbZE);
#else
#define obbESplatX V4SplatElement<0>(obbE)
#define obbESplatY V4SplatElement<1>(obbE)
#define obbESplatZ V4SplatElement<2>(obbE)
#define obbESplatNegX V4Sub(zeroes, obbESplatX)
#define obbESplatNegY V4Sub(zeroes, obbESplatY)
#define obbESplatNegZ V4Sub(zeroes, obbESplatZ)
#define obbXE V4MulAdd(obbX, obbESplatX, zeroes)
#define obbYE V4MulAdd(obbY, obbESplatY, zeroes)
#define obbZE V4MulAdd(obbZ, obbESplatZ, zeroes)
#define obbOSplatX V4SplatElement<0>(obbO)
#define obbOSplatY V4SplatElement<1>(obbO)
#define obbOSplatZ V4SplatElement<2>(obbO)
#define obbXSplatX V4SplatElement<0>(obbX)
#define obbXSplatY V4SplatElement<1>(obbX)
#define obbXSplatZ V4SplatElement<2>(obbX)
#define obbYSplatX V4SplatElement<0>(obbY)
#define obbYSplatY V4SplatElement<1>(obbY)
#define obbYSplatZ V4SplatElement<2>(obbY)
#define obbZSplatX V4SplatElement<0>(obbZ)
#define obbZSplatY V4SplatElement<1>(obbZ)
#define obbZSplatZ V4SplatElement<2>(obbZ)
#define obbXESplatX V4SplatElement<0>(obbXE)
#define obbXESplatY V4SplatElement<1>(obbXE)
#define obbXESplatZ V4SplatElement<2>(obbXE)
#define obbYESplatX V4SplatElement<0>(obbYE)
#define obbYESplatY V4SplatElement<1>(obbYE)
#define obbYESplatZ V4SplatElement<2>(obbYE)
#define obbZESplatX V4SplatElement<0>(obbZE)
#define obbZESplatY V4SplatElement<1>(obbZE)
#define obbZESplatZ V4SplatElement<2>(obbZE)
#endif
PX_ASSERT(mPageSize == 4 || mPageSize == 8);
PX_ASSERT(mNumRootPages > 0);
// seed the stack with all root pages (pushed in reverse so page 0 pops first)
for (PxI32 j = PxI32(mNumRootPages-1); j >= 0; j --)
*stackPtr++ = j*sizeof(RTreePage);
// cacheTop mirrors the current top-of-stack value in a register to avoid
// immediately re-loading a stack slot that was just stored (load-hit-store)
PxU32 cacheTopValid = true;
PxU32 cacheTop = 0;
PX_ALIGN_PREFIX(16) PxU32 resa_[4] PX_ALIGN_SUFFIX(16);
do {
stackPtr--;
PxU32 top;
if (cacheTopValid) // branch is faster than lhs
top = cacheTop;
else
top = stackPtr[0];
PX_ASSERT(!cacheTopValid || top == cacheTop);
// top is a byte offset of the current page relative to mPages
RTreePage* PX_RESTRICT tn = reinterpret_cast<RTreePage*>(treeNodes8 + top);
const PxU32 offs = 0;
PxU32* ptrs = (reinterpret_cast<RTreePage*>(tn))->ptrs;
// 6i -- load the page bounds (structure-of-arrays, 16-byte aligned)
Vec4V minx4a = V4LoadA(tn->minx+offs);
Vec4V miny4a = V4LoadA(tn->miny+offs);
Vec4V minz4a = V4LoadA(tn->minz+offs);
Vec4V maxx4a = V4LoadA(tn->maxx+offs);
Vec4V maxy4a = V4LoadA(tn->maxy+offs);
Vec4V maxz4a = V4LoadA(tn->maxz+offs);
VecU32V noOverlapa;
VecU32V resa4u;
{
// PRECOMPUTE FOR A BLOCK
// 109 instr per 4 OBB/AABB
// ABB iteration 1, start with OBB origin as other point -- 6
// p1ABB = point on each AABB closest to the OBB center
Vec4V p1ABBxa = V4Max(minx4a, V4Min(maxx4a, obbOSplatX));
Vec4V p1ABBya = V4Max(miny4a, V4Min(maxy4a, obbOSplatY));
Vec4V p1ABBza = V4Max(minz4a, V4Min(maxz4a, obbOSplatZ));
// OBB iteration 1, move to OBB space first -- 12
Vec4V p1ABBOxa = V4Sub(p1ABBxa, obbOSplatX);
Vec4V p1ABBOya = V4Sub(p1ABBya, obbOSplatY);
Vec4V p1ABBOza = V4Sub(p1ABBza, obbOSplatZ);
Vec4V obbPrjXa = V4MulAdd(p1ABBOxa, obbXSplatX, V4MulAdd(p1ABBOya, obbXSplatY, V4MulAdd(p1ABBOza, obbXSplatZ, zeroes)));
Vec4V obbPrjYa = V4MulAdd(p1ABBOxa, obbYSplatX, V4MulAdd(p1ABBOya, obbYSplatY, V4MulAdd(p1ABBOza, obbYSplatZ, zeroes)));
Vec4V obbPrjZa = V4MulAdd(p1ABBOxa, obbZSplatX, V4MulAdd(p1ABBOya, obbZSplatY, V4MulAdd(p1ABBOza, obbZSplatZ, zeroes)));
// clamp AABB point in OBB space to OBB extents. Since we scaled the axes, the extents are [-1,1] -- 6
Vec4V pOBBxa = V4Max(obbESplatNegX, V4Min(obbPrjXa, obbESplatX));
Vec4V pOBBya = V4Max(obbESplatNegY, V4Min(obbPrjYa, obbESplatY));
Vec4V pOBBza = V4Max(obbESplatNegZ, V4Min(obbPrjZa, obbESplatZ));
// go back to AABB space. we have x,y,z in obb space, need to multiply by axes -- 9
// p1OBB = point on the OBB closest to p1ABB
Vec4V p1OBBxa = V4MulAdd(pOBBxa, obbXSplatX, V4MulAdd(pOBBya, obbYSplatX, V4MulAdd(pOBBza, obbZSplatX, obbOSplatX)));
Vec4V p1OBBya = V4MulAdd(pOBBxa, obbXSplatY, V4MulAdd(pOBBya, obbYSplatY, V4MulAdd(pOBBza, obbZSplatY, obbOSplatY)));
Vec4V p1OBBza = V4MulAdd(pOBBxa, obbXSplatZ, V4MulAdd(pOBBya, obbYSplatZ, V4MulAdd(pOBBza, obbZSplatZ, obbOSplatZ)));
// ABB iteration 2 -- 6 instructions
// p2ABB = point on each AABB closest to p1OBB (refined closest point)
Vec4V p2ABBxa = V4Max(minx4a, V4Min(maxx4a, p1OBBxa));
Vec4V p2ABBya = V4Max(miny4a, V4Min(maxy4a, p1OBBya));
Vec4V p2ABBza = V4Max(minz4a, V4Min(maxz4a, p1OBBza));
// above blocks add up to 12+12+15=39 instr
// END PRECOMPUTE FOR A BLOCK
// for AABBs precompute extents and center -- 9i
Vec4V abbCxa = V4MulAdd(V4Add(maxx4a, minx4a), halves, zeroes);
Vec4V abbCya = V4MulAdd(V4Add(maxy4a, miny4a), halves, zeroes);
Vec4V abbCza = V4MulAdd(V4Add(maxz4a, minz4a), halves, zeroes);
Vec4V abbExa = V4Sub(maxx4a, abbCxa);
Vec4V abbEya = V4Sub(maxy4a, abbCya);
Vec4V abbEza = V4Sub(maxz4a, abbCza);
// now test separating axes D1 = p1OBB-p1ABB and D2 = p1OBB-p2ABB -- 37 instructions per axis
// D1 first -- 3 instructions
Vec4V d1xa = V4Sub(p1OBBxa, p1ABBxa), d1ya = V4Sub(p1OBBya, p1ABBya), d1za = V4Sub(p1OBBza, p1ABBza);
// for AABB compute projections of extents and center -- 6
Vec4V abbExd1Prja = V4MulAdd(d1xa, abbExa, zeroes);
Vec4V abbEyd1Prja = V4MulAdd(d1ya, abbEya, zeroes);
Vec4V abbEzd1Prja = V4MulAdd(d1za, abbEza, zeroes);
Vec4V abbCd1Prja = V4MulAdd(d1xa, abbCxa, V4MulAdd(d1ya, abbCya, V4MulAdd(d1za, abbCza, zeroes)));
// for obb project each halfaxis and origin and add abs values of half-axis projections -- 12 instructions
Vec4V obbXEd1Prja = V4MulAdd(d1xa, obbXESplatX, V4MulAdd(d1ya, obbXESplatY, V4MulAdd(d1za, obbXESplatZ, zeroes)));
Vec4V obbYEd1Prja = V4MulAdd(d1xa, obbYESplatX, V4MulAdd(d1ya, obbYESplatY, V4MulAdd(d1za, obbYESplatZ, zeroes)));
Vec4V obbZEd1Prja = V4MulAdd(d1xa, obbZESplatX, V4MulAdd(d1ya, obbZESplatY, V4MulAdd(d1za, obbZESplatZ, zeroes)));
Vec4V obbOd1Prja = V4MulAdd(d1xa, obbOSplatX, V4MulAdd(d1ya, obbOSplatY, V4MulAdd(d1za, obbOSplatZ, zeroes)));
// compare lengths between projected centers with sum of projected radii -- 16i
Vec4V originDiffd1a = v_absm(V4Sub(abbCd1Prja, obbOd1Prja));
Vec4V absABBRd1a = V4Add(V4Add(v_absm(abbExd1Prja), v_absm(abbEyd1Prja)), v_absm(abbEzd1Prja));
Vec4V absOBBRd1a = V4Add(V4Add(v_absm(obbXEd1Prja), v_absm(obbYEd1Prja)), v_absm(obbZEd1Prja));
// separated along D1 only if the center distance beats the radii by eps
// (epsNoOverlap guards against a degenerate/zero-length axis)
VecU32V noOverlapd1a = V4IsGrtrV32u(V4Sub(originDiffd1a, eps), V4Add(absABBRd1a, absOBBRd1a));
VecU32V epsNoOverlapd1a = V4IsGrtrV32u(originDiffd1a, eps);
// D2 next (35 instr)
// 3i
Vec4V d2xa = V4Sub(p1OBBxa, p2ABBxa), d2ya = V4Sub(p1OBBya, p2ABBya), d2za = V4Sub(p1OBBza, p2ABBza);
// for AABB compute projections of extents and center -- 6
Vec4V abbExd2Prja = V4MulAdd(d2xa, abbExa, zeroes);
Vec4V abbEyd2Prja = V4MulAdd(d2ya, abbEya, zeroes);
Vec4V abbEzd2Prja = V4MulAdd(d2za, abbEza, zeroes);
Vec4V abbCd2Prja = V4MulAdd(d2xa, abbCxa, V4MulAdd(d2ya, abbCya, V4MulAdd(d2za, abbCza, zeroes)));
// for obb project each halfaxis and origin and add abs values of half-axis projections -- 12i
Vec4V obbXEd2Prja = V4MulAdd(d2xa, obbXESplatX, V4MulAdd(d2ya, obbXESplatY, V4MulAdd(d2za, obbXESplatZ, zeroes)));
Vec4V obbYEd2Prja = V4MulAdd(d2xa, obbYESplatX, V4MulAdd(d2ya, obbYESplatY, V4MulAdd(d2za, obbYESplatZ, zeroes)));
Vec4V obbZEd2Prja = V4MulAdd(d2xa, obbZESplatX, V4MulAdd(d2ya, obbZESplatY, V4MulAdd(d2za, obbZESplatZ, zeroes)));
Vec4V obbOd2Prja = V4MulAdd(d2xa, obbOSplatX, V4MulAdd(d2ya, obbOSplatY, V4MulAdd(d2za, obbOSplatZ, zeroes)));
// compare lengths between projected centers with sum of projected radii -- 16i
Vec4V originDiffd2a = v_absm(V4Sub(abbCd2Prja, obbOd2Prja));
Vec4V absABBRd2a = V4Add(V4Add(v_absm(abbExd2Prja), v_absm(abbEyd2Prja)), v_absm(abbEzd2Prja));
Vec4V absOBBRd2a = V4Add(V4Add(v_absm(obbXEd2Prja), v_absm(obbYEd2Prja)), v_absm(obbZEd2Prja));
VecU32V noOverlapd2a = V4IsGrtrV32u(V4Sub(originDiffd2a, eps), V4Add(absABBRd2a, absOBBRd2a));
VecU32V epsNoOverlapd2a = V4IsGrtrV32u(originDiffd2a, eps);
// 8i -- combine: no overlap if either candidate axis separates; also
// reject degenerate (empty) slots where min > max
noOverlapa = V4U32or(V4U32and(noOverlapd1a, epsNoOverlapd1a), V4U32and(noOverlapd2a, epsNoOverlapd2a));
VecU32V ignore4a = V4IsGrtrV32u(minx4a, maxx4a); // 1 if degenerate box (empty slot)
noOverlapa = V4U32or(noOverlapa, ignore4a);
// note the result convention is inverted vs traverseAABB:
// resa_[i] != 0 means slot i DOES overlap the OBB
resa4u = V4U32Andc(U4Load(1), noOverlapa); // 1 & ~noOverlap
V4U32StoreAligned(resa4u, reinterpret_cast<VecU32V*>(resa_));
///// 8+16+12+6+3+16+12+6+3+9+6+9+6+12+6+6=136i from load to result
}
cacheTopValid = false;
// NOTE(review): only 4 slots are visited although the assert above allows
// mPageSize == 8 - confirm this build always uses 4-wide pages
for (PxU32 i = 0; i < 4; i++)
{
PxU32 ptr = ptrs[i+offs] & ~1; // clear the isLeaf bit
if (resa_[i])
{
if (tn->isLeaf(i))
{
// leaf slot: ptr is the object index; false return aborts traversal
if (!callback->processResults(1, &ptr))
return;
}
else
{
// internal slot: push child page offset, cache it as the new top
*(stackPtr++) = ptr;
cacheTop = ptr;
cacheTopValid = true;
}
}
}
} while (stackPtr > stack);
}
} // namespace Gu
}
|