diff options
| author | FluorescentCIAAfricanAmerican <[email protected]> | 2020-04-22 12:56:21 -0400 |
|---|---|---|
| committer | FluorescentCIAAfricanAmerican <[email protected]> | 2020-04-22 12:56:21 -0400 |
| commit | 3bf9df6b2785fa6d951086978a3e66f49427166a (patch) | |
| tree | 2c0f1f0c63c4832882bc93814ebd2c2b1c6224e5 /studiorender/r_studiodraw_computeflexedvertex.cpp | |
| download | archived-source-engine-2018-hl2-src-master.tar.xz archived-source-engine-2018-hl2-src-master.zip | |
Diffstat (limited to 'studiorender/r_studiodraw_computeflexedvertex.cpp')
| -rw-r--r-- | studiorender/r_studiodraw_computeflexedvertex.cpp | 1621 |
1 files changed, 1621 insertions, 0 deletions
diff --git a/studiorender/r_studiodraw_computeflexedvertex.cpp b/studiorender/r_studiodraw_computeflexedvertex.cpp new file mode 100644 index 0000000..b58b90d --- /dev/null +++ b/studiorender/r_studiodraw_computeflexedvertex.cpp @@ -0,0 +1,1621 @@ +//========= Copyright c 1996-2008, Valve Corporation, All rights reserved. ============// + +#include "tier0/platform.h" + +#ifdef PLATFORM_WINDOWS + +#include "studiorender.h" +#include "studio.h" +#include "materialsystem/imesh.h" +#include "materialsystem/imaterialsystemhardwareconfig.h" +#include "materialsystem/imaterialvar.h" +#include "materialsystem/imorph.h" +#include "materialsystem/itexture.h" +#include "materialsystem/imaterial.h" +#include "optimize.h" +#include "mathlib/mathlib.h" +#include "mathlib/vector.h" +#include <malloc.h> +#include "mathlib/vmatrix.h" +#include "studiorendercontext.h" +#include "tier2/tier2.h" +#include "tier0/vprof.h" +//#include "tier0/miniprofiler.h" +#include <algorithm> +#include "filesystem.h" + +#define PROFILE_THIS_FILE 0 + + +//DLL_IMPORT CLinkedMiniProfiler *g_pOtherMiniProfilers; +#if PROFILE_THIS_FILE + +#if !ENABLE_HARDWARE_PROFILER +#error "can't profile without profiler enabled" +#endif + +CLinkedMiniProfiler g_mp_morph_Vx("morph_Vx", &g_pOtherMiniProfilers); +CLinkedMiniProfiler g_mp_morph_Vw("morph_Vw", &g_pOtherMiniProfilers); +CLinkedMiniProfiler g_mp_morph_lower_bound("morph_lower_bound", &g_pOtherMiniProfilers); +CLinkedMiniProfiler g_mp_morph("morph", &g_pOtherMiniProfilers); +CLinkedMiniProfiler g_mp_morph_V1("morph_V1", &g_pOtherMiniProfilers); +CLinkedMiniProfiler g_mp_morph_V2("morph_V2", &g_pOtherMiniProfilers); +CLinkedMiniProfiler g_mp_morph_V3("morph_V3", &g_pOtherMiniProfilers); +CLinkedMiniProfiler g_mp_morph_V4("morph_V4", &g_pOtherMiniProfilers); +CLinkedMiniProfiler g_mp_morph_V5("morph_V5", &g_pOtherMiniProfilers); +CLinkedMiniProfiler g_mp_morph_V6("morph_V6", &g_pOtherMiniProfilers); +CLinkedMiniProfiler g_mp_morph_V7("morph_V7", 
&g_pOtherMiniProfilers); + +CLinkedMiniProfiler* g_mp_ComputeFlexedVertex_StreamOffset[8] = +{ + NULL, + &g_mp_morph_V1, + &g_mp_morph_V2, + &g_mp_morph_V3, + &g_mp_morph_V4, + &g_mp_morph_V5, + &g_mp_morph_V6, + &g_mp_morph_V7 +}; +#else +uint32 g_mp_morph_Vx[2]; // [0] accumulates time-base ticks and [1] accumulates numVertsToProcess; both are written by the asm routines below +uint32 g_mp_morph_Vw[2]; // same layout, written by the wrinkle V7 routine +#endif + +#ifdef _X360 +ConVar g_cv_morph_path("morph_path", "7"); +#ifdef _DEBUG +ConVar g_cv_morph_debug("morph_debug", "0"); +#endif // _DEBUG +#endif // _X360 + + +#ifdef _X360 +const ALIGN16 int32 g_perm_speed_side[4] = {0x12, 0x13, 0x12, 0x13}; +const ALIGN16 int32 g_perm_delta[4] = {0x14150000, 0x16170000, 0x18190000, 0}; +const ALIGN16 int32 g_perm_delta_wrinkle[4] = {0x14150000, 0x16170000, 0x18190000, 0x10110000}; // includes the f3PreDelta's W that's in the X component +const ALIGN16 int32 g_perm_ndelta[4] = {0x1A1B0000, 0x1C1D0000, 0x1E1F0000, 0}; +//const ALIGN16 int32 g_perm_w0[4] = {0x00010203,0x08090A0B,0x00010203,0x08090A0B}; +const ALIGN16 int32 g_perm_w1[4] = {0x0C0D0E0F,0x0C0D0E0F,0x04050607,0x04050607}; +const fltx4 g_sc256_255_special = {256.0f/255.0f,256.0f/255.0f,-256.0f/255.0f,-256.0f/255.0f}; +const fltx4 g_f40011 = {0,0,1,1}; +fltx4 g_dummy2[2]; // scratch quadwords: the pipelined routines below aim their deferred stores here before a real vertex slot is known + +int g_nStreamOffset_prefetch = 256; // byte offset that the V4 and V6 routines load into r11 for dcbt + + + + + + + + + + + + + + +// +// V4 rolled - latency of x4, manually scheduled for nearly optimal dual-issue and no automatic stalls +// the ~15 nops mean 1 instruction is issued at that cycle, instead of theoretically possible 2 per cycle +// +// Arguments arrive in the registers named beside each parameter; the updated nThinFlexVertexCount is left in r3 at blr. +// r14-r25 are stored below the stack pointer on entry and reloaded before returning. +// Instructions tagged S:1 / S:2 / S:3 finish vertices fetched in earlier iterations; the code after label_end_V7 drains those stages. +// The loop ends when CTR (numVertsToProcess) runs out or when the cmpw/bgt pair sees nThinFlexVertexCount above MAXSTUDIOFLEXVERTS - 4. +// Elapsed time-base ticks and numVertsToProcess are added to g_mp_morph_Vx[0] and g_mp_morph_Vx[1]. +// +__declspec(naked) int ComputeFlexedVertex_StreamOffset_V7( + int nThinFlexVertexCount, //r3 + CachedPosNorm_t *pThinFlexVerts,//r4 + int32 *pFirstThinFlexIndex, //r5 + mstudiovertanim_t * pVert, //r6 + uint32 nCurrentTag, //r7 + uint32 numVertsToProcess, //r8 + fltx4 w1234 //vr1 + ) +{ + __asm + { + std r14, -0x08(r1) + std r15, -0x10(r1) + std r16, -0x18(r1) + std r17, -0x20(r1) + std r18, -0x28(r1) + std r19, -0x30(r1) + std r20, -0x38(r1) + std r21, -0x40(r1) + std r22, -0x48(r1) + std r23, -0x50(r1) + std r24, -0x58(r1) + std 
r25, -0x60(r1) + + // let the compiler schedule the instructions, just use several registers to avoid dependencies + lau r14, g_sc256_255_special + lal r14, r14, g_sc256_255_special + lvx vr2, r0,r14 // vr2 = sc256_255_special + + lau r15, g_f40011 + lal r15, r15, g_f40011 + lvx vr3, r0,r15 // vr3 = f40011 + + lau r16, g_perm_speed_side + lal r16, r16, g_perm_speed_side + lvx vr4, r0,r16 // vr4 = permuteSpeedSide + + lau r17, g_perm_delta + lal r17, r17, g_perm_delta + lvx vr5, r0,r17 // vr5 = permuteDelta + + lau r18, g_perm_ndelta + lal r18, r18, g_perm_ndelta + lvx vr6, r0,r18 // vr6 = permuteNDelta + + lau r20, g_dummy2 + lal r20,r20, g_dummy2 + mr r21, r20 + mr r22, r21 + mr r23, r22 // r21, r22 and r23 all start out pointing at g_dummy2 + + li r10, -1 + rldicl r7,r7,0,32 // currentTag &= 0xFFFFFFFF ; just to make sure we don't mess up isCacheInvalid computation + rldicl r10,r10,0,48 // r10 = 0x0000FFFF + + vxor vr8,vr8,vr8 // vr8 = f4Zero + + li r15, 16 // r15 = 16, the byte index used for the second quadword in the loads/stores below + + li r11,0x100 // r11 = byte offset handed to dcbt + li r24, MAXSTUDIOFLEXVERTS - 4 // r24 = limit that nThinFlexVertexCount is compared against + + mtctr r8 // CTR = numVertsToProcess + mftb r25 // r25 = time base before the loop + vxor vr19,vr19,vr19 + vxor vr20,vr20,vr20 + nop // align! + nop + nop + +label_start_V7: // 52 instructions run in 45 cycles, although compiler predicts 38 cycles + //////////////// + // IMPORTANT: DO NOT REMOVE NOPS UNLESS YOU KNOW WHAT YOU ARE DOING AND WHY! 
+ // nops are essential here, removing them will make the code about 2% slower because dual-issue will be broken + //////////////// + lhz r14, 0(r6) // int n = pVert->index; + addi r16, r3, 2 // r16 = nThinFlexVertexCount+2 + dcbt r11,r6 + cmpw r3, r24 // compare nThinFlexVertexCount to r24 (MAXSTUDIOFLEXVERTS - 4 in this routine); consumed by the bgt at the bottom of the loop + lvlx vr9,r0,r6 + rldicl r14, r14, 2, 0 // r14 = n*4 + lvrx vr10,r15,r6 + rldicl r16, r16, 5, 0 // r16 = (nThinFlexVertexCount+2) * 32; pThinFlexVerts is added by the add below + vor vr9,vr9,vr10 // vr9 = packedVert = LoadUnalignedSIMD(pVert) + addi r31,r31,0//vpermwi128 vr40,vr40,0x1B //mr r31,r31 + add r16, r16, r4 + vpermwi128 vr40,vr40,0x1B //mr r30,r30 + addi r6, r6, 0x10 // pVert++ + vpermwi128 vr41,vr41,0x1B//nop + lwzx r17, r14, r5 // r17 = oldCache + //addi r30,r30,0//nop + vperm vr10, vr8, vr9, vr4 //__vperm(f4Zero, packedVert, permuteSpeedSide) + //addi r29,r29,0//nop + xor r18, r17, r7 // cacheVertexIndex = oldCache^nCurrentTag + vperm vr11, vr8, vr9, vr5 //__vperm(f4Zero, packedVert, permuteDelta) + stvx vr8, r0,r16 // zero the first quadword of pThinFlexVerts[nThinFlexVertexCount+2] + /*S:2*/ vmsum4fp128 vr29,vr19, vr1 // vr29 = scWeight + subf r18,r18,r10 // r18 = 0xFFFF-cacheVertexIndex; the >> 32 is done by the sradi below + /*S:1*/ vpermwi128 vr25, vr20, 0x22 // depends on vmadd vr20 = f4sb + stvx vr8, r15,r16 // ...and its second quadword + /*S:1*/ vpermwi128 vr26, vr20, 0xF5 + vcsxwfp vr10,vr10,8 + or r19,r3,r7 // newCache = nCurrentTag | nThinFlexVertexCount + vperm vr12, vr8, vr9, vr6 //__vperm(f4Zero, packedVert, permuteNDelta) + sradi r18,r18,32 // r18 = isCacheInvalid : form mask + /*S:3*/ stvx vr30, r0,r23 + //nop + /*S:3*/ stvx vr31, r15,r23 + //nop + andc r17, r17, r18 // r17 = oldCache & ~isCacheInvalid + //nop + subf r3, r18, r3 // nThinFlexVertexCount = nThinFlexVertexCount + (isCacheInvalid&1); + //nop + and r19,r19,r18 // r19 = newCache & isCacheInvalid + //nop + /*S:2*/mr r23,r22 + //nop + or r19, r19, r17 // r19 = updateCache + /*S:2*/ lvx vr13, r0,r22 // vr13 = vfPosition + /*S:2*/ lvx vr14, r15,r22 // vr14 = vfNormal + //nop + rldicl r17, r19, 5,43 // r17 = (updateCache & 0xFFFF) * 32 = nVertexIndex * 32 + //nop + /*S:1*/ vmulfp128 vr19, vr25, vr26 + /*S:1*/mr r22, r21 + vmaddfp vr20, vr10, vr2, vr3 // vr20 = f4sb + add r21, r17, r4 // r21 = pFlexedVertex, goes to Stage:1 + /*S:2*/ vmaddfp 
vr30, vr29, vr21, vr13 // MaddSIMD(scWeight,f3Delta, vfPosition) + stwx r19, r14, r5 // pFirstThinFlexIndex[n] = updateCache + /*S:2*/ vmaddfp vr31, vr29, vr22, vr14 // MaddSIMD(scWeight,f3NDelta, vfNormal) + /*S:1*/ vpermwi128 vr21, vr32, 0x1B + /*S:1*/ vpermwi128 vr22, vr33, 0x1B + vcsxwfp128 vr32, vr11, 28 + //nop + vcsxwfp128 vr33, vr12, 28 + bgt label_end_V7 // acts on the cmpw issued at the top of this iteration + dcbt r11, r21 + bdnz label_start_V7 +label_end_V7: + + /*S:2*/ vmsum4fp128 vr29,vr19, vr1 // vr29 = scWeight + /*S:1*/ vpermwi128 vr25, vr20, 0x22 // depends on vmadd vr20 = f4sb + /*S:1*/ vpermwi128 vr26, vr20, 0xF5 + /*S:3*/ stvx vr30, r0,r23 + /*S:3*/ stvx vr31, r15,r23 + /*S:2*/mr r23,r22 + /*S:2*/ lvx vr13, r0,r22 // vr13 = vfPosition + /*S:2*/ lvx vr14, r15,r22 // vr14 = vfNormal + /*S:1*/ vmulfp128 vr19, vr25, vr26 + /*S:1*/mr r22, r21 + /*S:2*/ vmaddfp vr30, vr29, vr21, vr13 // MaddSIMD(scWeight,f3Delta, vfPosition) + /*S:2*/ vmaddfp vr31, vr29, vr22, vr14 // MaddSIMD(scWeight,f3NDelta, vfNormal) + /*S:1*/ vpermwi128 vr21, vr32, 0x1B + /*S:1*/ vpermwi128 vr22, vr33, 0x1B + + + /*S:2*/ vmsum4fp128 vr29,vr19, vr1 // vr29 = scWeight + /*S:3*/ stvx vr30, r0,r23 + /*S:3*/ stvx vr31, r15,r23 + /*S:2*/mr r23,r22 + /*S:2*/ lvx vr13, r0,r22 // vr13 = vfPosition + /*S:2*/ lvx vr14, r15,r22 // vr14 = vfNormal + /*S:2*/ vmaddfp vr30, vr29, vr21, vr13 // MaddSIMD(scWeight,f3Delta, vfPosition) + /*S:2*/ vmaddfp vr31, vr29, vr22, vr14 // MaddSIMD(scWeight,f3NDelta, vfNormal) + + /*S:3*/ stvx vr30, r0,r23 + /*S:3*/ stvx vr31, r15,r23 + + mftb r17 + subf r17, r25, r17 // r17 = time-base ticks elapsed since the mftb before the loop + lau r18, g_mp_morph_Vx + lal r18, r18, g_mp_morph_Vx + lwz r23, 0(r18) + add r23,r23,r17 + stw r23, 0(r18) // g_mp_morph_Vx[0] += elapsed ticks + lwz r23, 4(r18) + add r23,r23,r8 + stw r23, 4(r18) // g_mp_morph_Vx[1] += numVertsToProcess + + ld r14, -0x08(r1) + ld r15, -0x10(r1) + ld r16, -0x18(r1) + ld r17, -0x20(r1) + ld r18, -0x28(r1) + ld r19, -0x30(r1) + ld r20, -0x38(r1) + ld r21, -0x40(r1) + ld r22, -0x48(r1) + ld r23, -0x50(r1) + ld r24, -0x58(r1) + ld r25, -0x60(r1) + + blr + } +} + + + + +// Wrinkle counterpart of ComputeFlexedVertex_StreamOffset_V7 for mstudiovertanim_wrinkle_t input. +// Differences visible in the body: pVert advances by 0x12 bytes instead of 0x10, an extra lvlx/vrlimi128 pair builds f3PreDelta in vr27, +// f3Delta is permuted out of f3PreDelta with g_perm_delta_wrinkle, and the timing counters go to g_mp_morph_Vw. +// Arguments arrive in the registers named beside each parameter; the updated nThinFlexVertexCount is left in r3 at blr. +__declspec(naked) int ComputeFlexedVertexWrinkle_StreamOffset_V7( 
+ int nThinFlexVertexCount, //r3 + CachedPosNorm_t *pThinFlexVerts,//r4 + int32 *pFirstThinFlexIndex, //r5 + mstudiovertanim_wrinkle_t * pVert, //r6 + uint32 nCurrentTag, //r7 + uint32 numVertsToProcess, //r8 + fltx4 w1234 //vr1 + ) +{ + __asm + { + std r14, -0x08(r1) + std r15, -0x10(r1) + std r16, -0x18(r1) + std r17, -0x20(r1) + std r18, -0x28(r1) + std r19, -0x30(r1) + std r20, -0x38(r1) + std r21, -0x40(r1) + std r22, -0x48(r1) + std r23, -0x50(r1) + std r24, -0x58(r1) + std r25, -0x60(r1) + + // let the compiler schedule the instructions, just use several registers to avoid dependencies + lau r14, g_sc256_255_special + lal r14, r14, g_sc256_255_special + lvx vr2, r0,r14 // vr2 = sc256_255_special + + lau r15, g_f40011 + lal r15, r15, g_f40011 + lvx vr3, r0,r15 // vr3 = f40011 + + lau r16, g_perm_speed_side + lal r16, r16, g_perm_speed_side + lvx vr4, r0,r16 // vr4 = permuteSpeedSide + + lau r17, g_perm_delta_wrinkle + lal r17, r17, g_perm_delta_wrinkle + lvx vr5, r0,r17 // vr5 = permuteDelta (wrinkle table) + + lau r18, g_perm_ndelta + lal r18, r18, g_perm_ndelta + lvx vr6, r0,r18 // vr6 = permuteNDelta + + lau r20, g_dummy2 + lal r20,r20, g_dummy2 + mr r21, r20 + mr r22, r21 + mr r23, r22 // r21, r22 and r23 all start out pointing at g_dummy2 + + li r10, -1 + rldicl r7,r7,0,32 // currentTag &= 0xFFFFFFFF ; just to make sure we don't mess up isCacheInvalid computation + rldicl r10,r10,0,48 // r10 = 0x0000FFFF + + vxor vr8,vr8,vr8 // vr8 = f4Zero + + li r15, 16 // r15 = 16, the byte index used for the second quadword in the loads/stores below + + li r11,0x100 // r11 = byte offset handed to dcbt + li r24, MAXSTUDIOFLEXVERTS - 4 // r24 = limit that nThinFlexVertexCount is compared against + + mtctr r8 // CTR = numVertsToProcess + mftb r25 // r25 = time base before the loop + vxor vr19,vr19,vr19 + vxor vr20,vr20,vr20 + nop // align! + nop + nop + +label_start_V7: // 52 instructions run in 45 cycles, although compiler predicts 38 cycles + //////////////// + // IMPORTANT: DO NOT REMOVE NOPS UNLESS YOU KNOW WHAT YOU ARE DOING AND WHY! 
+ // nops are essential here, removing them will make the code about 2% slower because dual-issue will be broken + //////////////// + lhz r14, 0(r6) // int n = pVert->index; + addi r16, r3, 2 // r16 = nThinFlexVertexCount+2 + dcbt r11,r6 + cmpw r3, r24 // compare nThinFlexVertexCount to r24 (MAXSTUDIOFLEXVERTS - 4 in this routine); consumed by the bgt at the bottom of the loop + lvlx vr9,r0,r6 + rldicl r14, r14, 2, 0 // r14 = n*4 + lvrx vr10,r15,r6 + rldicl r16, r16, 5, 0 // r16 = (nThinFlexVertexCount+2) * 32; pThinFlexVerts is added by the add below + lvlx vr27,r15,r6 // f3PreDelta + vor vr9,vr9,vr10 // vr9 = packedVert = LoadUnalignedSIMD(pVert) + addi r31,r31,0//vpermwi128 vr40,vr40,0x1B //mr r31,r31 + add r16, r16, r4 + vpermwi128 vr40,vr40,0x1B //mr r30,r30 + addi r6, r6, 0x12 // pVert++ + vpermwi128 vr41,vr41,0x1B//nop + lwzx r17, r14, r5 // r17 = oldCache + //addi r30,r30,0//nop + vperm vr10, vr8, vr9, vr4 //__vperm(f4Zero, packedVert, permuteSpeedSide) + vrlimi128 vr27,vr9,7,0// f3PreDelta + xor r18, r17, r7 // cacheVertexIndex = oldCache^nCurrentTag + vperm vr12, vr8, vr9, vr6 //f3NDelta = __vperm(f4Zero, packedVert, permuteNDelta) + stvx vr8, r0,r16 // zero the first quadword of pThinFlexVerts[nThinFlexVertexCount+2] + /*S:2*/ vmsum4fp128 vr29,vr19, vr1 // vr29 = scWeight + subf r18,r18,r10 // r18 = 0xFFFF-cacheVertexIndex; the >> 32 is done by the sradi below + /*S:1*/ vpermwi128 vr25, vr20, 0x22 // depends on vmadd vr20 = f4sb + stvx vr8, r15,r16 // ...and its second quadword + /*S:1*/ vpermwi128 vr26, vr20, 0xF5 + vcsxwfp vr10,vr10,8 + or r19,r3,r7 // newCache = nCurrentTag | nThinFlexVertexCount + vperm vr11, vr8, vr27, vr5 //f3Delta = __vperm(f4Zero, f3PreDelta, permuteDelta) + sradi r18,r18,32 // r18 = isCacheInvalid : form mask + /*S:3*/ stvx vr30, r0,r23 + //nop + /*S:3*/ stvx vr31, r15,r23 + //nop + andc r17, r17, r18 // r17 = oldCache & ~isCacheInvalid + //nop + subf r3, r18, r3 // nThinFlexVertexCount = nThinFlexVertexCount + (isCacheInvalid&1); + //nop + and r19,r19,r18 // r19 = newCache & isCacheInvalid + //nop + /*S:2*/mr r23,r22 + //nop + or r19, r19, r17 // r19 = updateCache + /*S:2*/ lvx vr13, r0,r22 // vr13 = vfPosition + /*S:2*/ lvx vr14, r15,r22 // vr14 = vfNormal + //nop + rldicl r17, r19, 5,43 // r17 = (updateCache & 0xFFFF) * 32 = 
nVertexIndex * 32 + //nop + /*S:1*/ vmulfp128 vr19, vr25, vr26 + /*S:1*/mr r22, r21 + vmaddfp vr20, vr10, vr2, vr3 // vr20 = f4sb + add r21, r17, r4 // r21 = pFlexedVertex, goes to Stage:1 + /*S:2*/ vmaddfp vr30, vr29, vr21, vr13 // MaddSIMD(scWeight,f3Delta, vfPosition) + stwx r19, r14, r5 // pFirstThinFlexIndex[n] = updateCache + /*S:2*/ vmaddfp vr31, vr29, vr22, vr14 // MaddSIMD(scWeight,f3NDelta, vfNormal) + /*S:1*/ vpermwi128 vr21, vr32, 0x1B + /*S:1*/ vpermwi128 vr22, vr33, 0x1B + vcsxwfp128 vr32, vr11, 28 + //nop + vcsxwfp128 vr33, vr12, 28 + bgt label_end_V7 // acts on the cmpw issued at the top of this iteration + dcbt r11, r21 + bdnz label_start_V7 +label_end_V7: + + /*S:2*/ vmsum4fp128 vr29,vr19, vr1 // vr29 = scWeight + /*S:1*/ vpermwi128 vr25, vr20, 0x22 // depends on vmadd vr20 = f4sb + /*S:1*/ vpermwi128 vr26, vr20, 0xF5 + /*S:3*/ stvx vr30, r0,r23 + /*S:3*/ stvx vr31, r15,r23 + /*S:2*/mr r23,r22 + /*S:2*/ lvx vr13, r0,r22 // vr13 = vfPosition + /*S:2*/ lvx vr14, r15,r22 // vr14 = vfNormal + /*S:1*/ vmulfp128 vr19, vr25, vr26 + /*S:1*/mr r22, r21 + /*S:2*/ vmaddfp vr30, vr29, vr21, vr13 // MaddSIMD(scWeight,f3Delta, vfPosition) + /*S:2*/ vmaddfp vr31, vr29, vr22, vr14 // MaddSIMD(scWeight,f3NDelta, vfNormal) + /*S:1*/ vpermwi128 vr21, vr32, 0x1B + /*S:1*/ vpermwi128 vr22, vr33, 0x1B + + + /*S:2*/ vmsum4fp128 vr29,vr19, vr1 // vr29 = scWeight + /*S:3*/ stvx vr30, r0,r23 + /*S:3*/ stvx vr31, r15,r23 + /*S:2*/mr r23,r22 + /*S:2*/ lvx vr13, r0,r22 // vr13 = vfPosition + /*S:2*/ lvx vr14, r15,r22 // vr14 = vfNormal + /*S:2*/ vmaddfp vr30, vr29, vr21, vr13 // MaddSIMD(scWeight,f3Delta, vfPosition) + /*S:2*/ vmaddfp vr31, vr29, vr22, vr14 // MaddSIMD(scWeight,f3NDelta, vfNormal) + + /*S:3*/ stvx vr30, r0,r23 + /*S:3*/ stvx vr31, r15,r23 + + mftb r17 + subf r17, r25, r17 // r17 = time-base ticks elapsed since the mftb before the loop + lau r18, g_mp_morph_Vw + lal r18, r18, g_mp_morph_Vw + lwz r23, 0(r18) + add r23,r23,r17 + stw r23, 0(r18) // g_mp_morph_Vw[0] += elapsed ticks + lwz r23, 4(r18) + add r23,r23,r8 + stw r23, 4(r18) // g_mp_morph_Vw[1] += numVertsToProcess + + ld r14, -0x08(r1) + ld r15, -0x10(r1) + ld r16, -0x18(r1) + ld r17, -0x20(r1) + ld r18, -0x28(r1) + ld r19, -0x30(r1) + 
ld r20, -0x38(r1) + ld r21, -0x40(r1) + ld r22, -0x48(r1) + ld r23, -0x50(r1) + ld r24, -0x58(r1) + ld r25, -0x60(r1) + + blr + } +} + + + + +// V4 rolled - latency of x3 +// Arguments arrive in the registers named beside each parameter; the updated nThinFlexVertexCount is left in r3 at blr. +// r14-r24 are stored below the stack pointer on entry and reloaded before returning. +// The stvx of vr23/vr24 at the top of the loop writes results computed in an earlier iteration; the code after label_end flushes what is still pending. +// The loop ends when CTR (numVertsToProcess) runs out or when nThinFlexVertexCount is above MAXSTUDIOFLEXVERTS - 2. +// Elapsed time-base ticks and numVertsToProcess are added to g_mp_morph_Vx[0] and g_mp_morph_Vx[1]. +__declspec(naked) int ComputeFlexedVertex_StreamOffset_V6( + int nThinFlexVertexCount, //r3 + CachedPosNorm_t *pThinFlexVerts,//r4 + int32 *pFirstThinFlexIndex, //r5 + mstudiovertanim_t * pVert, //r6 + uint32 nCurrentTag, //r7 + uint32 numVertsToProcess, //r8 + fltx4 w1234 //vr1 + ) +{ + __asm + { + std r14, -0x08(r1) + std r15, -0x10(r1) + std r16, -0x18(r1) + std r17, -0x20(r1) + std r18, -0x28(r1) + std r19, -0x30(r1) + std r20, -0x38(r1) + std r21, -0x40(r1) + std r22, -0x48(r1) + std r23, -0x50(r1) + std r24, -0x58(r1) + + // let the compiler schedule the instructions, just use several registers to avoid dependencies + lau r14, g_sc256_255_special + lal r14, r14, g_sc256_255_special + lvx vr2, r0,r14 // vr2 = sc256_255_special + + lau r15, g_f40011 + lal r15, r15, g_f40011 + lvx vr3, r0,r15 // vr3 = f40011 + + lau r16, g_perm_speed_side + lal r16, r16, g_perm_speed_side + lvx vr4, r0,r16 // vr4 = permuteSpeedSide + + lau r17, g_perm_delta + lal r17, r17, g_perm_delta + lvx vr5, r0,r17 // vr5 = permuteDelta + + lau r18, g_perm_ndelta + lal r18, r18, g_perm_ndelta + lvx vr6, r0,r18 // vr6 = permuteNDelta + + lau r20, g_dummy2 + lal r20,r20, g_dummy2 + mr r21, r20 + mr r22, r21 // r21 and r22 both start out pointing at g_dummy2 + + li r10, -1 + rldicl r7,r7,0,32 // currentTag &= 0xFFFFFFFF ; just to make sure we don't mess up isCacheInvalid computation + rldicl r10,r10,0,48 // r10 = 0x0000FFFF + + vxor vr8,vr8,vr8 // vr8 = f4Zero + + li r15, 16 // r15 = 16, the byte index used for the second quadword in the loads/stores below + + lau r14,g_nStreamOffset_prefetch + lal r14,r14,g_nStreamOffset_prefetch + lwz r11,0(r14) // r11 = g_nStreamOffset_prefetch, the byte offset handed to dcbt + + li r24, MAXSTUDIOFLEXVERTS - 2 // r24 = limit that nThinFlexVertexCount is compared against + + mtctr r8 // CTR = numVertsToProcess + mftb r23 // r23 = time base before the loop + +label_start: + lhz r14, 0(r6) // int n = pVert->index; + dcbt r11,r6 + addi r16, r3, 2 // r16 = nThinFlexVertexCount+2 + cmpw r3, r24 // compare nThinFlexVertexCount to MAXSTUDIOFLEXVERTS - 2 + lvlx vr9,r0,r6 + lvrx vr10,r15,r6 + rldicl r14, r14, 2, 0 // r14 = n*4 + rldicl r16, r16, 5, 0 // r16 = (nThinFlexVertexCount+2) * 32; pThinFlexVerts is added by the add below + add r16, r16, r4 + vor vr9,vr9,vr10 // vr9 = packedVert = 
LoadUnalignedSIMD(pVert) + stvx vr8, r0,r16 + lwzx r17, r14, r5 // r17 = oldCache + stvx vr8, r15,r16 + vmsum4fp128 vr19,vr19, vr1 // vr19 = scWeight + vperm vr10, vr8, vr9, vr4 + xor r18, r17, r7 // cacheVertexIndex = oldCache^nCurrentTag + vperm vr11, vr8, vr9, vr5 + subf r18,r18,r10 // r18 = 0xFFFF-cacheVertexIndex; the >> 32 is done by the sradi below + vcsxwfp vr10,vr10,8 + vperm vr12, vr8, vr9, vr6 + stvx vr23, r0,r22 + sradi r18,r18,32 // r18 = isCacheInvalid : form mask + vmaddfp vr10, vr10, vr2, vr3 // vr10 = f4sb + stvx vr24, r15,r22 + or r19,r3,r7 // newCache = nCurrentTag | nThinFlexVertexCount + andc r17, r17, r18 // r17 = oldCache & ~isCacheInvalid + and r19,r19,r18 // r19 = newCache & isCacheInvalid + vpermwi128 vr15, vr10, 0x22 + or r19, r19, r17 // r19 = updateCache + vpermwi128 vr16, vr10, 0xF5 + rldicl r17, r19, 5,43 // r17 = (updateCache & 0xFFFF) * 32 = nVertexIndex * 32 + vmaddfp vr24, vr19, vr22, vr14 // MaddSIMD(scWeight,f3NDelta, vfNormal) + vmaddfp vr23, vr19, vr21, vr13 // MaddSIMD(scWeight,f3Delta, vfPosition) + vmulfp128 vr19, vr15, vr16 + add r17, r17, r4 // r17 = pFlexedVertex + stwx r19, r14, r5 // pFirstThinFlexIndex[n] = updateCache + subf r3, r18, r3// nThinFlexVertexCount = nThinFlexVertexCount + (isCacheInvalid&1); + lvx vr13, r0,r17 // vr13 = vfPosition + addi r6, r6, 0x10 // pVert++ + lvx vr14, r15,r17 // vr14 = vfNormal + vcsxwfp vr21, vr11, 28 + mr r22,r21 + vcsxwfp vr22, vr12, 28 + mr r21,r17 + bgt label_end // acts on the cmpw issued at the top of this iteration + dcbt r11, r17 + + bdnz label_start +label_end: + + mftb r17 + subf r17, r23, r17 // r17 = time-base ticks elapsed since the mftb before the loop + lau r18, g_mp_morph_Vx + lal r18, r18, g_mp_morph_Vx + lwz r23, 0(r18) + add r23,r23,r17 + stw r23, 0(r18) // g_mp_morph_Vx[0] += elapsed ticks + lwz r23, 4(r18) + add r23,r23,r8 + stw r23, 4(r18) // g_mp_morph_Vx[1] += numVertsToProcess + + + vmsum4fp128 vr19,vr19, vr1 // vr19 = scWeight + stvx vr23, r0,r22 + stvx vr24, r15,r22 + vmaddfp vr24, vr19, vr22, vr14 // MaddSIMD(scWeight,f3NDelta, vfNormal) + vmaddfp vr23, vr19, vr21, vr13 // MaddSIMD(scWeight,f3Delta, vfPosition) + stvx vr23, r0,r21 + stvx vr24, r15,r21 + + ld r14, -0x08(r1) + ld r15, -0x10(r1) + ld r16, -0x18(r1) + ld r17, -0x20(r1) + ld r18, -0x28(r1) + ld r19, -0x30(r1) 
+ ld r20, -0x38(r1) + ld r21, -0x40(r1) + ld r22, -0x48(r1) + ld r23, -0x50(r1) + ld r24, -0x58(r1) + + blr + } +} + + + +// 2-stages +// Arguments arrive in the registers named beside each parameter; the updated nThinFlexVertexCount is left in r3 at blr. +// r14-r20 are stored below the stack pointer on entry and reloaded before returning. +// The two vmaddfp marked "stage 1" and the stvx through r20 complete the vertex fetched in the previous iteration; r20 starts at g_dummy2. +// NOTE(review): unlike the V4/V6/V7 routines, this loop has no compare of nThinFlexVertexCount against MAXSTUDIOFLEXVERTS - confirm callers bound it. +__declspec(naked) int ComputeFlexedVertex_StreamOffset_V5( + int nThinFlexVertexCount, //r3 + CachedPosNorm_t *pThinFlexVerts,//r4 + int32 *pFirstThinFlexIndex, //r5 + mstudiovertanim_t * pVert, //r6 + uint32 nCurrentTag, //r7 + uint32 numVertsToProcess, //r8 + fltx4 w1234 //vr1 + ) +{ + __asm + { + std r14, -0x08(r1) + std r15, -0x10(r1) + std r16, -0x18(r1) + std r17, -0x20(r1) + std r18, -0x28(r1) + std r19, -0x30(r1) + std r20, -0x38(r1) + + // let the compiler schedule the instructions, just use several registers to avoid dependencies + lau r14, g_sc256_255_special + lal r14, r14, g_sc256_255_special + lvx vr2, r0,r14 // vr2 = sc256_255_special + + lau r15, g_f40011 + lal r15, r15, g_f40011 + lvx vr3, r0,r15 // vr3 = f40011 + + lau r16, g_perm_speed_side + lal r16, r16, g_perm_speed_side + lvx vr4, r0,r16 // vr4 = permuteSpeedSide + + lau r17, g_perm_delta + lal r17, r17, g_perm_delta + lvx vr5, r0,r17 // vr5 = permuteDelta + + lau r18, g_perm_ndelta + lal r18, r18, g_perm_ndelta + lvx vr6, r0,r18 // vr6 = permuteNDelta + + lau r20, g_dummy2 + lal r20,r20, g_dummy2 + + vxor vr8,vr8,vr8 // vr8 = f4Zero + li r10, -1 + rldicl r7,r7,0,32 // currentTag &= 0xFFFFFFFF ; just to make sure we don't mess up isCacheInvalid computation + rldicl r10,r10,0,48 // r10 = 0x0000FFFF + mtctr r8 // CTR = numVertsToProcess + + li r15, 16 // r15 = 16, the byte index used for the second quadword in the loads/stores below + +label_start_schlp: + lhz r14, 0(r6) // int n = pVert->index; + addi r16, r3, 2 // r16 = nThinFlexVertexCount+2 + lvlx vr9,r0,r6 + rldicl r14, r14, 2, 0 // r14 = n*4 + lvrx vr10,r15,r6 + rldicl r16, r16, 5, 0 // r16 = (nThinFlexVertexCount+2) * 32; pThinFlexVerts is added by the add below + + vor vr9,vr9,vr10 // vr9 = packedVert = LoadUnalignedSIMD(pVert) + + add r16, r16, r4 + + vperm vr10, vr8, vr9, vr4 //__vperm(f4Zero, packedVert, permuteSpeedSide) + addi r6, r6, 0x10 // pVert++ + vcsxwfp vr10,vr10,8 + + vmaddfp vr17, vr15, vr11, vr13 // MaddSIMD(scWeight,f3Delta, vfPosition) - stage 1 + vmaddfp vr18, vr15, vr12, vr14 // MaddSIMD(scWeight,f3NDelta, vfNormal) - 
stage 1 + + vperm vr11, vr8, vr9, vr5 //f3Delta = __vperm(f4Zero, packedVert, permuteDelta) + vcsxwfp vr11, vr11, 28 + vperm vr12, vr8, vr9, vr6 //f3NDelta = __vperm(f4Zero, packedVert, permuteNDelta) + vcsxwfp vr12, vr12, 28 + + vmaddfp vr10, vr10, vr2, vr3 // vr10 = f4sb + + lwzx r17, r14, r5 // r17 = oldCache + xor r18, r17, r7 // cacheVertexIndex = oldCache^nCurrentTag + subf r18,r18,r10 // r18 = 0xFFFF-cacheVertexIndex; the >> 32 is done by the sradi below + + or r19,r3,r7 // newCache = nCurrentTag | nThinFlexVertexCount + sradi r18,r18,32 // r18 = isCacheInvalid : form mask + vpermwi128 vr15, vr10, 0x22 + and r19,r19,r18 // r19 = newCache & isCacheInvalid + vpermwi128 vr16, vr10, 0xF5 + andc r17, r17, r18 // r17 = oldCache & ~isCacheInvalid + stvx vr8, r0, r16 + or r19, r19, r17 // r19 = updateCache + stvx vr8, r15, r16 + + rldicl r17, r19, 5,43 // r17 = (updateCache & 0xFFFF) * 32 = nVertexIndex * 32 + add r17, r17, r4 // r17 = pFlexedVertex + vmulfp128 vr15, vr15, vr16 + lvx vr13, r0,r17 // vr13 = vfPosition + lvx vr14, r15,r17 // vr14 = vfNormal + + vmsum4fp128 vr15,vr15, vr1 // vr15 = scWeight + + stwx r19, r14, r5 // pFirstThinFlexIndex[n] = updateCache + subf r3, r18, r3// nThinFlexVertexCount = nThinFlexVertexCount + (isCacheInvalid&1); + + stvx vr17, r0,r20 // stage 1 + stvx vr18, r15,r20 // stage 1 + + mr r20, r17 // keep this vertex's pFlexedVertex for the deferred stores + + bdnz label_start_schlp + + vmaddfp vr17, vr15, vr11, vr13 // MaddSIMD(scWeight,f3Delta, vfPosition) - stage 1 + vmaddfp vr18, vr15, vr12, vr14 // MaddSIMD(scWeight,f3NDelta, vfNormal) - stage 1 + stvx vr17, r0,r20 // stage 1; deferred storing saves 15 cycles (10%!) 
+ stvx vr18, r15,r20 + + ld r14, -0x08(r1) + ld r15, -0x10(r1) + ld r16, -0x18(r1) + ld r17, -0x20(r1) + ld r18, -0x28(r1) + ld r19, -0x30(r1) + ld r20, -0x38(r1) + + blr + } +} + +// V3 in asm +// Arguments arrive in the registers named beside each parameter; the updated nThinFlexVertexCount is left in r3 at blr. +// Each iteration loads, accumulates and stores one vertex with no work carried over to the next iteration. +// The loop ends when CTR (numVertsToProcess) runs out or when nThinFlexVertexCount is above MAXSTUDIOFLEXVERTS - 3. +// NOTE(review): the prologue/epilogue save and restore r14-r19 only, yet r24 is written below - confirm against the platform calling convention. +__declspec(naked) int ComputeFlexedVertex_StreamOffset_V4( + int nThinFlexVertexCount, //r3 + CachedPosNorm_t *pThinFlexVerts,//r4 + int32 *pFirstThinFlexIndex, //r5 + mstudiovertanim_t * pVert, //r6 + uint32 nCurrentTag, //r7 + uint32 numVertsToProcess, //r8 + fltx4 w1234 //vr1 + ) +{ + __asm + { + std r14, -0x08(r1) + std r15, -0x10(r1) + std r16, -0x18(r1) + std r17, -0x20(r1) + std r18, -0x28(r1) + std r19, -0x30(r1) + + // let the compiler schedule the instructions, just use several registers to avoid dependencies + lau r14, g_sc256_255_special + lal r14, r14, g_sc256_255_special + lvx vr2, r0,r14 // vr2 = sc256_255_special + + lau r15, g_f40011 + lal r15, r15, g_f40011 + lvx vr3, r0,r15 // vr3 = f40011 + + lau r16, g_perm_speed_side + lal r16, r16, g_perm_speed_side + lvx vr4, r0,r16 // vr4 = permuteSpeedSide + + lau r17, g_perm_delta + lal r17, r17, g_perm_delta + lvx vr5, r0,r17 // vr5 = permuteDelta + + lau r18, g_perm_ndelta + lal r18, r18, g_perm_ndelta + lvx vr6, r0,r18 // vr6 = permuteNDelta + + li r10, -1 + rldicl r7,r7,0,32 // currentTag &= 0xFFFFFFFF ; just to make sure we don't mess up isCacheInvalid computation + rldicl r10,r10,0,48 // r10 = 0x0000FFFF + + lau r14,g_nStreamOffset_prefetch + lal r14,r14,g_nStreamOffset_prefetch + lwz r11,0(r14) // r11 = g_nStreamOffset_prefetch, the byte offset handed to dcbt + + vxor vr8,vr8,vr8 // vr8 = f4Zero + + li r15, 16 // r15 = 16, the byte index used for the second quadword in the loads/stores below + li r24, MAXSTUDIOFLEXVERTS - 3 // critical number at which to stop processing + + mtctr r8 // CTR = numVertsToProcess +label_start: + lhz r14, 0(r6) // int n = pVert->index; + dcbt r11,r16 // NOTE(review): r16 is only recomputed by the addi below, so this prefetch uses whatever r16 held before this iteration + rldicl r14, r14, 2, 0 // r14 = n*4 + + + addi r16, r3, 2 // r16 = nThinFlexVertexCount+2 + rldicl r16, r16, 5, 0 // r16 = (nThinFlexVertexCount+2) * 32; pThinFlexVerts is added by the add below + add r16, r16, r4 + stvx vr8, r0,r16 + stvx vr8, r15,r16 + + lvlx vr9,r0,r6 + lvrx vr10,r15,r6 + vor vr9,vr9,vr10 // vr9 = packedVert = LoadUnalignedSIMD(pVert) + + vperm vr10, vr8, vr9, vr4 //__vperm(f4Zero, packedVert, permuteSpeedSide) + vcsxwfp vr10,vr10,8 + vmaddfp vr10, vr10, vr2, vr3 // 
vr10 = f4sb + + vperm vr11, vr8, vr9, vr5 //f3Delta = __vperm(f4Zero, packedVert, permuteDelta) + vcsxwfp vr11, vr11, 28 + vperm vr12, vr8, vr9, vr6 //f3NDelta = __vperm(f4Zero, packedVert, permuteNDelta) + vcsxwfp vr12, vr12, 28 + + lwzx r17, r14, r5 // r17 = oldCache + xor r18, r17, r7 // cacheVertexIndex = oldCache^nCurrentTag + subf r18,r18,r10 // r18 = 0xFFFF-cacheVertexIndex; the >> 32 is done by the sradi below + sradi r18,r18,32 // r18 = isCacheInvalid : form mask + + or r19,r3,r7 // newCache = nCurrentTag | nThinFlexVertexCount + and r19,r19,r18 // r19 = newCache & isCacheInvalid + andc r17, r17, r18 // r17 = oldCache & ~isCacheInvalid + or r19, r19, r17 // r19 = updateCache + + rldicl r17, r19, 5,43 // r17 = (updateCache & 0xFFFF) * 32 = nVertexIndex * 32 + add r17, r17, r4 // r17 = pFlexedVertex + lvx vr13, r0,r17 // vr13 = vfPosition + lvx vr14, r15,r17 // vr14 = vfNormal + dcbt r11,r17 + + vpermwi128 vr15, vr10, 0x22 + vpermwi128 vr16, vr10, 0xF5 + vmulfp128 vr15, vr15, vr16 + vmsum4fp128 vr15,vr15, vr1 // vr15 = scWeight + + stwx r19, r14, r5 // pFirstThinFlexIndex[n] = updateCache + subf r3, r18, r3 // nThinFlexVertexCount = nThinFlexVertexCount + (isCacheInvalid&1); + + vmaddfp vr14, vr15, vr12, vr14 // MaddSIMD(scWeight,f3NDelta, vfNormal) + vmaddfp vr13, vr15, vr11, vr13 // MaddSIMD(scWeight,f3Delta, vfPosition) + + stvx vr13, r0,r17 + stvx vr14, r15,r17 + + cmpw r3, r24 // compare the updated nThinFlexVertexCount to MAXSTUDIOFLEXVERTS - 3 + bgt label_end + + addi r6, r6, 0x10 // pVert++ + bdnz label_start +label_end: + + ld r14, -0x08(r1) + ld r15, -0x10(r1) + ld r16, -0x18(r1) + ld r17, -0x20(r1) + ld r18, -0x28(r1) + ld r19, -0x30(r1) + + blr + } +} + + + +// V3 in asm +// Wrinkle counterpart of ComputeFlexedVertex_StreamOffset_V4 for mstudiovertanim_wrinkle_t input. +// Differences visible in the body: pVert advances by 0x12 bytes instead of 0x10, an extra lvlx/vrlimi128 pair builds f3PreDelta in vr27, +// and f3Delta is permuted out of f3PreDelta with g_perm_delta_wrinkle. +// NOTE(review): the prologue/epilogue save and restore r14-r19 only, yet r24 is written below - confirm against the platform calling convention. +__declspec(naked) int ComputeFlexedVertexWrinkle_StreamOffset_V4( + int nThinFlexVertexCount, //r3 + CachedPosNorm_t *pThinFlexVerts,//r4 + int32 *pFirstThinFlexIndex, //r5 + mstudiovertanim_wrinkle_t * pVert,//r6 + uint32 nCurrentTag, //r7 + uint32 numVertsToProcess, //r8 + fltx4 w1234 //vr1 + ) +{ + __asm + { + std r14, -0x08(r1) + std r15, -0x10(r1) + std r16, -0x18(r1) + std 
r17, -0x20(r1) + std r18, -0x28(r1) + std r19, -0x30(r1) + + // let the compiler schedule the instructions, just use several registers to avoid dependencies + lau r14, g_sc256_255_special + lal r14, r14, g_sc256_255_special + lvx vr2, r0,r14 // vr2 = sc256_255_special + + lau r15, g_f40011 + lal r15, r15, g_f40011 + lvx vr3, r0,r15 // vr3 = f40011 + + lau r16, g_perm_speed_side + lal r16, r16, g_perm_speed_side + lvx vr4, r0,r16 // vr4 = permuteSpeedSide + + lau r17, g_perm_delta_wrinkle + lal r17, r17, g_perm_delta_wrinkle + lvx vr5, r0,r17 // vr5 = permuteDelta (wrinkle table) + + lau r18, g_perm_ndelta + lal r18, r18, g_perm_ndelta + lvx vr6, r0,r18 // vr6 = permuteNDelta + + li r10, -1 + rldicl r7,r7,0,32 // currentTag &= 0xFFFFFFFF ; just to make sure we don't mess up isCacheInvalid computation + rldicl r10,r10,0,48 // r10 = 0x0000FFFF + + lau r14,g_nStreamOffset_prefetch + lal r14,r14,g_nStreamOffset_prefetch + lwz r11,0(r14) // r11 = g_nStreamOffset_prefetch, the byte offset handed to dcbt + + vxor vr8,vr8,vr8 // vr8 = f4Zero + + li r15, 16 // r15 = 16, the byte index used for the second quadword in the loads/stores below + li r24, MAXSTUDIOFLEXVERTS - 3 // critical number at which to stop processing + + mtctr r8 // CTR = numVertsToProcess + label_start: + lhz r14, 0(r6) // int n = pVert->index; + dcbt r11,r16 // NOTE(review): r16 is only recomputed by the addi below, so this prefetch uses whatever r16 held before this iteration + rldicl r14, r14, 2, 0 // r14 = n*4 + + + addi r16, r3, 2 // r16 = nThinFlexVertexCount+2 + rldicl r16, r16, 5, 0 // r16 = (nThinFlexVertexCount+2) * 32; pThinFlexVerts is added by the add below + add r16, r16, r4 + stvx vr8, r0,r16 + stvx vr8, r15,r16 + + lvlx vr27,r15,r6 // f3PreDelta + lvlx vr9,r0,r6 + lvrx vr10,r15,r6 + vor vr9,vr9,vr10 // vr9 = packedVert = LoadUnalignedSIMD(pVert) + vrlimi128 vr27,vr9,7,0// f3PreDelta + + vperm vr10, vr8, vr9, vr4 //__vperm(f4Zero, packedVert, permuteSpeedSide) + vcsxwfp vr10,vr10,8 + vmaddfp vr10, vr10, vr2, vr3 // vr10 = f4sb + + vperm vr11, vr8, vr27, vr5 //f3Delta = __vperm(f4Zero, f3PreDelta, permuteDelta) + vcsxwfp vr11, vr11, 28 + vperm vr12, vr8, vr9, vr6 //f3NDelta = __vperm(f4Zero, packedVert, permuteNDelta) + vcsxwfp vr12, vr12, 28 + + lwzx r17, r14, r5 // r17 = oldCache + xor r18, r17, r7 // cacheVertexIndex = oldCache^nCurrentTag + subf r18,r18,r10 // r18 = 0xFFFF-cacheVertexIndex; the >> 32 is done by the sradi below + sradi r18,r18,32 // r18 = isCacheInvalid : form mask + + or r19,r3,r7 // newCache = 
nCurrentTag | nThinFlexVertexCount + and r19,r19,r18 // r19 = newCache & isCacheInvalid + andc r17, r17, r18 // r17 = oldCache & ~isCacheInvalid + or r19, r19, r17 // r19 = updateCache + + rldicl r17, r19, 5,43 // r17 = (updateCache & 0xFFFF) * 32 = nVertexIndex * 32 + add r17, r17, r4 // r17 = pFlexedVertex + lvx vr13, r0,r17 // vr13 = vfPosition + lvx vr14, r15,r17 // vr14 = vfNormal + dcbt r11,r17 + + vpermwi128 vr15, vr10, 0x22 + vpermwi128 vr16, vr10, 0xF5 + vmulfp128 vr15, vr15, vr16 + vmsum4fp128 vr15,vr15, vr1 // vr15 = scWeight + + stwx r19, r14, r5 // pFirstThinFlexIndex[n] = updateCache + subf r3, r18, r3 // nThinFlexVertexCount = nThinFlexVertexCount + (isCacheInvalid&1); + + vmaddfp vr14, vr15, vr12, vr14 // MaddSIMD(scWeight,f3NDelta, vfNormal) + vmaddfp vr13, vr15, vr11, vr13 // MaddSIMD(scWeight,f3Delta, vfPosition) + + stvx vr13, r0,r17 + stvx vr14, r15,r17 + + cmpw r3, r24 // compare the updated nThinFlexVertexCount to MAXSTUDIOFLEXVERTS - 3 + bgt label_end + + addi r6, r6, 0x12 // pVert++ + bdnz label_start + label_end: + + ld r14, -0x08(r1) + ld r15, -0x10(r1) + ld r16, -0x18(r1) + ld r17, -0x20(r1) + ld r18, -0x28(r1) + ld r19, -0x30(r1) + + blr + } +} + + + +// base for asm +// Intrinsics version of the per-vertex flex accumulation. +// For each of numVertsToProcess entries starting at pVert it reads pFirstThinFlexIndex[pVert->index]; if (oldCache ^ nCurrentTag) is above 0xFFFF +// the entry is replaced by nCurrentTag | nThinFlexVertexCount and the count grows by one, otherwise the stored entry is kept. +// The low 16 bits of the resulting entry select the CachedPosNorm_t in pThinFlexVerts that receives scWeight * delta on position and normal. +// Returns the updated nThinFlexVertexCount. +// NOTE(review): the do/while decrements before testing, so numVertsToProcess == 0 would wrap around rather than skip the loop - confirm callers never pass 0. +int ComputeFlexedVertex_StreamOffset_V3(int nThinFlexVertexCount, CachedPosNorm_t *pThinFlexVerts, int32 *pFirstThinFlexIndex, mstudiovertanim_t * pVert, uint32 nCurrentTag, uint32 numVertsToProcess, fltx4 w1234) +{ + fltx4 sc256_255_special = g_sc256_255_special; + fltx4 f40011 = g_f40011; + fltx4 permuteSpeedSide = LoadAlignedSIMD((const float*)g_perm_speed_side); + fltx4 permuteDelta = LoadAlignedSIMD((const float*)g_perm_delta); + fltx4 permuteNDelta = LoadAlignedSIMD((const float*)g_perm_ndelta); + //fltx4 permuteW0 = LoadAlignedSIMD((const float*)g_perm_w0); + //fltx4 permuteW1 = LoadAlignedSIMD((const float*)g_perm_w1); + fltx4 f4Zero = Four_Zeros; + + do + { + int n = pVert->index; + pThinFlexVerts[nThinFlexVertexCount+2].m_Position.InitZero(); + pThinFlexVerts[nThinFlexVertexCount+2].m_Normal.InitZero(); + fltx4 packedVert = 
LoadUnalignedSIMD((const float*)pVert); + fltx4 f4sb = MaddSIMD(__vcfsx(__vperm(f4Zero, packedVert, permuteSpeedSide), 8), sc256_255_special, f40011); + // f4sb = {s,b,1-s,1-b} + + fltx4 f3Delta = __vcfsx(__vperm(f4Zero, packedVert, permuteDelta), 12+16); + fltx4 f3NDelta = __vcfsx(__vperm(f4Zero, packedVert, permuteNDelta), 12+16); + uint64 oldCache = uint32(pFirstThinFlexIndex[n]); + uint64 cacheVertexIndex = oldCache^nCurrentTag; // if there is trash in high (2^16) bits, we need to update the cache + int64 isCacheInvalid = int64(0xFFFF-cacheVertexIndex)>>32; // the second shift must be arithmetic to form a valid mask + int64 isCacheValid = ~isCacheInvalid; + + int64 newCache = nCurrentTag | nThinFlexVertexCount; + int64 updateCache = (newCache & isCacheInvalid) | (oldCache & isCacheValid); + nThinFlexVertexCount = nThinFlexVertexCount - isCacheInvalid; // isCacheInvalid is 0 or -1, so this adds 1 when a new entry was written + + int nVertexIndex = updateCache & 0xFFFF; + + CachedPosNorm_t *pFlexedVertex = pThinFlexVerts + nVertexIndex; // will be overridden + fltx4 vfNormal = LoadAlignedSIMD((float*)&pFlexedVertex->m_Normal); + fltx4 vfPosition = LoadAlignedSIMD((float*)&pFlexedVertex->m_Position); + + // here we need to form the following vector to compute final w: + // {s(1-b), (1-s)(1-b), sb, (1-s)b} + //fltx4 f4sbProd = MulSIMD(__vperm(f4sb,f4sb,permuteW0), __vperm(f4sb,f4sb,permuteW1)); + fltx4 f4sbProd = MulSIMD(__vpermwi(f4sb,0x22), __vpermwi(f4sb,0xF5)); + fltx4 scWeight = __vmsum4fp(f4sbProd,w1234); + + pFirstThinFlexIndex[n] = updateCache; + StoreAlignedSIMD((float*)&pFlexedVertex->m_Normal, MaddSIMD(scWeight,f3NDelta, vfNormal)); + StoreAlignedSIMD((float*)&pFlexedVertex->m_Position, MaddSIMD(scWeight,f3Delta, vfPosition)); + + pVert ++; + } + while(--numVertsToProcess); // why doesn't this use bdnz?? 
+ + return nThinFlexVertexCount; +} + + +// base for asm +int ComputeFlexedVertexWrinkle_StreamOffset_V3(int nThinFlexVertexCount, CachedPosNorm_t *pThinFlexVerts, int32 *pFirstThinFlexIndex, mstudiovertanim_wrinkle_t * pVert, uint32 nCurrentTag, uint32 numVertsToProcess, fltx4 w1234) +{ + fltx4 sc256_255_special = g_sc256_255_special; + fltx4 f40011 = g_f40011; + fltx4 permuteSpeedSide = LoadAlignedSIMD((const float*)g_perm_speed_side); + fltx4 permuteDelta = LoadAlignedSIMD((const float*)g_perm_delta_wrinkle); + fltx4 permuteNDelta = LoadAlignedSIMD((const float*)g_perm_ndelta); + //fltx4 permuteW0 = LoadAlignedSIMD((const float*)g_perm_w0); + //fltx4 permuteW1 = LoadAlignedSIMD((const float*)g_perm_w1); + fltx4 f4Zero = Four_Zeros; + + do + { + int n = pVert->index; + pThinFlexVerts[nThinFlexVertexCount+2].m_Position.InitZero(); + pThinFlexVerts[nThinFlexVertexCount+2].m_Normal.InitZero(); + fltx4 packedVert = LoadUnalignedSIMD((const float*)pVert); + fltx4 f3PreDelta = __lvlx(pVert, 16); // f3Delta now contains only packed W component in high X halfword... 
+ fltx4 f4sb = MaddSIMD(__vcfsx(__vperm(f4Zero, packedVert, permuteSpeedSide), 8), sc256_255_special, f40011); + // f4sb = {s,b,1-s,1-b} + + + f3PreDelta = __vrlimi(f3PreDelta, packedVert, 7, 0); // don't rotate and move bytes 4..15 from packed vert to f3PreDelta + fltx4 f3NDelta = __vcfsx(__vperm(f4Zero, packedVert, permuteNDelta), 12+16); + fltx4 f3Delta = __vcfsx(__vperm(f4Zero, f3PreDelta, permuteDelta), 12+16); + uint64 oldCache = uint32(pFirstThinFlexIndex[n]); + uint64 cacheVertexIndex = oldCache^nCurrentTag; // if there is trash in high (2^16) bits, we need to update the cache + int64 isCacheInvalid = int64(0xFFFF-cacheVertexIndex)>>32; // the second shift must be arithmetic to form a valid mask + int64 isCacheValid = ~isCacheInvalid; + + int64 newCache = nCurrentTag | nThinFlexVertexCount; + int64 updateCache = (newCache & isCacheInvalid) | (oldCache & isCacheValid); + nThinFlexVertexCount = nThinFlexVertexCount - isCacheInvalid; + + int nVertexIndex = updateCache & 0xFFFF; + + CachedPosNorm_t *pFlexedVertex = pThinFlexVerts + nVertexIndex; // will be overridden + fltx4 vfNormal = LoadAlignedSIMD((float*)&pFlexedVertex->m_Normal); + fltx4 vfPosition = LoadAlignedSIMD((float*)&pFlexedVertex->m_Position); + + // here we need to form the following vector to compute final w: + // {s(1-b), (1-s)(1-b), sb, (1-s)b} + //fltx4 f4sbProd = MulSIMD(__vperm(f4sb,f4sb,permuteW0), __vperm(f4sb,f4sb,permuteW1)); + fltx4 f4sbProd = MulSIMD(__vpermwi(f4sb,0x22), __vpermwi(f4sb,0xF5)); + fltx4 scWeight = __vmsum4fp(f4sbProd,w1234); + + pFirstThinFlexIndex[n] = updateCache; + StoreAlignedSIMD((float*)&pFlexedVertex->m_Normal, MaddSIMD(scWeight,f3NDelta, vfNormal)); + StoreAlignedSIMD((float*)&pFlexedVertex->m_Position, MaddSIMD(scWeight,f3Delta, vfPosition)); + + pVert ++; + } + while(--numVertsToProcess); // why doesn't this use bdnz?? 
+ + return nThinFlexVertexCount; +} + +// tried to pipeline in C++ +int ComputeFlexedVertex_StreamOffset_V2(int nThinFlexVertexCount, CachedPosNorm_t *pThinFlexVerts, int32 *pFirstThinFlexIndex, mstudiovertanim_t * pVert, uint32 nCurrentTag, uint32 numVertsToProcess, fltx4 w1234) +{ + Assert(0 == (uint32(pVert) & 0xF)); + fltx4 sc256_255_special = g_sc256_255_special; + fltx4 f40011 = g_f40011; + fltx4 permuteSpeedSide = LoadAlignedSIMD((const float*)g_perm_speed_side); + fltx4 permuteDelta = LoadAlignedSIMD((const float*)g_perm_delta); + fltx4 permuteNDelta = LoadAlignedSIMD((const float*)g_perm_ndelta); + //fltx4 permuteW0 = LoadAlignedSIMD((const float*)g_perm_w0); + //fltx4 permuteW1 = LoadAlignedSIMD((const float*)g_perm_w1); + fltx4 f4Zero = Four_Zeros; + + fltx4 f4sb_st1, f3Delta_st1, f3NDelta_st1; + int32 updateCache_st1; + mstudiovertanim_t *pVertEnd = pVert + numVertsToProcess; + { + // stage 0 + int n = pVert->index; + pThinFlexVerts[nThinFlexVertexCount+2].m_Position.InitZero(); + pThinFlexVerts[nThinFlexVertexCount+2].m_Normal.InitZero(); + fltx4 packedVert = LoadUnalignedSIMD((const float*)pVert); + fltx4 f4sb = MaddSIMD(__vcfsx(__vperm(f4Zero, packedVert, permuteSpeedSide), 8), sc256_255_special, f40011); // to be completely correct, we'll ned to multiply this with 256/255 + // f4sb = {s,b,1-s,1-b} + + fltx4 f3Delta = __vcfsx(__vperm(f4Zero, packedVert, permuteDelta), 12+16); + fltx4 f3NDelta = __vcfsx(__vperm(f4Zero, packedVert, permuteNDelta), 12+16); + uint64 oldCache = uint32(pFirstThinFlexIndex[n]); + uint64 cacheVertexIndex = oldCache^nCurrentTag; // if there is trash in high (2^16) bits, we need to update the cache + int64 isCacheInvalid = int64(0xFFFF-cacheVertexIndex)>>32; // the second shift must be arithmetic to form a valid mask + int64 isCacheValid = ~isCacheInvalid; + + int64 newCache = nCurrentTag | nThinFlexVertexCount; + int64 updateCache = (newCache & isCacheInvalid) | (oldCache & isCacheValid); + nThinFlexVertexCount = 
nThinFlexVertexCount - isCacheInvalid; + + pFirstThinFlexIndex[n] = updateCache; + + // prime next stage 1 + f4sb_st1 = f4sb; + f3Delta_st1 = f3Delta; + f3NDelta_st1 = f3NDelta; + updateCache_st1 = updateCache; + + pVert ++; + } + + while(pVert < pVertEnd) + { + // stage 1 + { + int nVertexIndex = updateCache_st1 & 0xFFFF; + + CachedPosNorm_t *pFlexedVertex = pThinFlexVerts + nVertexIndex; // will be overridden + + fltx4 vfNormal = LoadAlignedSIMD((float*)&pFlexedVertex->m_Normal); + fltx4 vfPosition = LoadAlignedSIMD((float*)&pFlexedVertex->m_Position); + + // here we need to form the following vector to compute final w: + // {s(1-b), (1-s)(1-b), sb, (1-s)b} + //fltx4 f4sbProd = MulSIMD(__vperm(f4sb_st1,f4sb_st1,permuteW0), __vperm(f4sb_st1,f4sb_st1,permuteW1)); + fltx4 f4sbProd = MulSIMD(__vpermwi(f4sb_st1,0x22), __vpermwi(f4sb_st1,0xF5)); + fltx4 scWeight = __vmsum4fp(f4sbProd,w1234); + + StoreAlignedSIMD((float*)&pFlexedVertex->m_Normal, MaddSIMD(scWeight,f3NDelta_st1, vfNormal)); + StoreAlignedSIMD((float*)&pFlexedVertex->m_Position, MaddSIMD(scWeight,f3Delta_st1, vfPosition)); + } + + // stage 0 + { + int n = pVert->index; + pThinFlexVerts[nThinFlexVertexCount+2].m_Position.InitZero(); + pThinFlexVerts[nThinFlexVertexCount+2].m_Normal.InitZero(); + fltx4 packedVert = LoadUnalignedSIMD((const float*)pVert); + fltx4 f4sb = MaddSIMD(__vcfsx(__vperm(f4Zero, packedVert, permuteSpeedSide), 8), sc256_255_special, f40011); // to be completely correct, we'll ned to multiply this with 256/255 + // f4sb = {s,b,1-s,1-b} + + fltx4 f3Delta = __vcfsx(__vperm(f4Zero, packedVert, permuteDelta), 12+16); + fltx4 f3NDelta = __vcfsx(__vperm(f4Zero, packedVert, permuteNDelta), 12+16); + uint64 oldCache = uint32(pFirstThinFlexIndex[n]); + uint64 cacheVertexIndex = oldCache^nCurrentTag; // if there is trash in high (2^16) bits, we need to update the cache + int64 isCacheInvalid = int64(0xFFFF-cacheVertexIndex)>>32; // the second shift must be arithmetic to form a valid mask + int64 
isCacheValid = ~isCacheInvalid; + + int64 newCache = nCurrentTag | nThinFlexVertexCount; + int64 updateCache = (newCache & isCacheInvalid) | (oldCache & isCacheValid); + nThinFlexVertexCount = nThinFlexVertexCount - isCacheInvalid; + + pFirstThinFlexIndex[n] = updateCache; // this may be put wherever it doesn't mess up the other stores + + // prime next stage 1 + f4sb_st1 = f4sb; + updateCache_st1 = updateCache; + f3Delta_st1 = f3Delta; + f3NDelta_st1 = f3NDelta; + } + + pVert ++; + } + + // stage 1 + { + int nVertexIndex = updateCache_st1 & 0xFFFF; + + CachedPosNorm_t *pFlexedVertex = pThinFlexVerts + nVertexIndex; // will be overridden + + fltx4 vfNormal = LoadAlignedSIMD((float*)&pFlexedVertex->m_Normal); + fltx4 vfPosition = LoadAlignedSIMD((float*)&pFlexedVertex->m_Position); + + // here we need to form the following vector to compute final w: + // {s(1-b), (1-s)(1-b), sb, (1-s)b} + //fltx4 f4sbProd = MulSIMD(__vperm(f4sb_st1,f4sb_st1,permuteW0), __vperm(f4sb_st1,f4sb_st1,permuteW1)); + fltx4 f4sbProd = MulSIMD(__vpermwi(f4sb_st1,0x22), __vpermwi(f4sb_st1,0xF5)); + fltx4 scWeight = __vmsum4fp(f4sbProd,w1234); + + StoreAlignedSIMD((float*)&pFlexedVertex->m_Normal, MaddSIMD(scWeight,f3NDelta_st1, vfNormal)); + StoreAlignedSIMD((float*)&pFlexedVertex->m_Position, MaddSIMD(scWeight,f3Delta_st1, vfPosition)); + } + return nThinFlexVertexCount; +} + +// branchless +int ComputeFlexedVertex_StreamOffset_V1(int nThinFlexVertexCount, CachedPosNorm_t *pThinFlexVerts, int32 *pFirstThinFlexIndex, mstudiovertanim_t * pVert, uint32 nCurrentTag, uint32 numVertsToProcess, fltx4 w1234) +{ + Assert(0 == (uint32(pVert) & 0xF)); + fltx4 sc256_255_special = g_sc256_255_special; + fltx4 f40011 = g_f40011; + fltx4 permuteSpeedSide = LoadAlignedSIMD((const float*)g_perm_speed_side); + fltx4 permuteDelta = LoadAlignedSIMD((const float*)g_perm_delta); + fltx4 permuteNDelta = LoadAlignedSIMD((const float*)g_perm_ndelta); + //fltx4 permuteW0 = LoadAlignedSIMD((const float*)g_perm_w0); + 
//fltx4 permuteW1 = LoadAlignedSIMD((const float*)g_perm_w1); + fltx4 f4Zero = Four_Zeros; + mstudiovertanim_t *pVertEnd = pVert + numVertsToProcess; + do + { + int n = pVert->index; + pThinFlexVerts[nThinFlexVertexCount].m_Position.InitZero(); + pThinFlexVerts[nThinFlexVertexCount].m_Normal.InitZero(); + fltx4 packedVert = LoadUnalignedSIMD((const float*)pVert); + fltx4 f4sb = MaddSIMD(__vcfsx(__vperm(f4Zero, packedVert, permuteSpeedSide), 8), sc256_255_special, f40011); + // f4sb = {s,b,1-s,1-b} + + fltx4 f3Delta = __vcfsx(__vperm(f4Zero, packedVert, permuteDelta), 12+16); + fltx4 f3NDelta = __vcfsx(__vperm(f4Zero, packedVert, permuteNDelta), 12+16); + uint64 oldCache = uint32(pFirstThinFlexIndex[n]); + uint64 cacheVertexIndex = oldCache^nCurrentTag; // if there is trash in high (2^16) bits, we need to update the cache + int64 isCacheInvalid = int64(0xFFFF-cacheVertexIndex)>>32; // the second shift must be arithmetic to form a valid mask + int32 isCacheValid = ~isCacheInvalid; + + int32 newCache = nCurrentTag | nThinFlexVertexCount; + int32 updateCache = (newCache & isCacheInvalid) | (oldCache & isCacheValid); + nThinFlexVertexCount = nThinFlexVertexCount - isCacheInvalid; + + int nVertexIndex = updateCache & 0xFFFF; + + CachedPosNorm_t *pFlexedVertex = pThinFlexVerts + nVertexIndex; // will be overridden + fltx4 vfNormal = LoadAlignedSIMD((float*)&pFlexedVertex->m_Normal); + fltx4 vfPosition = LoadAlignedSIMD((float*)&pFlexedVertex->m_Position); + + // here we need to form the following vector to compute final w: + // {s(1-b), (1-s)(1-b), sb, (1-s)b} + //fltx4 f4sbProd = MulSIMD(__vperm(f4sb,f4sb,permuteW0), __vperm(f4sb,f4sb,permuteW1)); + fltx4 f4sbProd = MulSIMD(__vpermwi(f4sb,0x22), __vpermwi(f4sb,0xF5)); + fltx4 scWeight = __vmsum4fp(f4sbProd,w1234); + + pFirstThinFlexIndex[n] = updateCache; + StoreAlignedSIMD((float*)&pFlexedVertex->m_Normal, MaddSIMD(scWeight,f3NDelta, vfNormal)); + StoreAlignedSIMD((float*)&pFlexedVertex->m_Position, 
MaddSIMD(scWeight,f3Delta, vfPosition)); + + pVert ++; + } + while(pVert < pVertEnd); // why doesn't this use CTR?? + + return nThinFlexVertexCount; +} + + +typedef int (*Fn_ComputeFlexedVertex_StreamOffset)(int nThinFlexVertexCount, CachedPosNorm_t *pThinFlexVerts, int32 *pFirstThinFlexIndex, mstudiovertanim_t * pVert, uint32 nCurrentTag, uint32 numVertsToProcess, fltx4 w1234); +Fn_ComputeFlexedVertex_StreamOffset g_fn_ComputeFlexedVertex_StreamOffset[8] = +{ + NULL, + ComputeFlexedVertex_StreamOffset_V1, + ComputeFlexedVertex_StreamOffset_V2, + ComputeFlexedVertex_StreamOffset_V3, + ComputeFlexedVertex_StreamOffset_V4, + ComputeFlexedVertex_StreamOffset_V5, + ComputeFlexedVertex_StreamOffset_V6, + ComputeFlexedVertex_StreamOffset_V7 +}; + +typedef int (*Fn_ComputeFlexedVertexWrinkle_StreamOffset)(int nThinFlexVertexCount, CachedPosNorm_t *pThinFlexVerts, int32 *pFirstThinFlexIndex, mstudiovertanim_wrinkle_t * pVert, uint32 nCurrentTag, uint32 numVertsToProcess, fltx4 w1234); +Fn_ComputeFlexedVertexWrinkle_StreamOffset g_fn_ComputeFlexedVertexWrinkle_StreamOffset[8] = +{ + NULL, + ComputeFlexedVertexWrinkle_StreamOffset_V3, + ComputeFlexedVertexWrinkle_StreamOffset_V3, + ComputeFlexedVertexWrinkle_StreamOffset_V3, + ComputeFlexedVertexWrinkle_StreamOffset_V4, + ComputeFlexedVertexWrinkle_StreamOffset_V4, + ComputeFlexedVertexWrinkle_StreamOffset_V4, + ComputeFlexedVertexWrinkle_StreamOffset_V7 +}; + + +inline float Diff(const CachedPosNorm_t&a, const CachedPosNorm_t&b) +{ + return a.m_Position.DistTo(b.m_Position) + a.m_Normal.DistTo(b.m_Normal); +} + +bool g_bBreakOnAssert = true; +void AlwaysAssert(bool mustBeTrue) +{ + if(!mustBeTrue) + { + Plat_DebugString("AlwaysAssert\n"); + if(g_bBreakOnAssert) + DebugBreak(); + } +} + +#endif + +template +void CCachedRenderData::ComputeFlexedVertex_StreamOffset<mstudiovertanim_t>( studiohdr_t *pStudioHdr, mstudioflex_t *pflex, + mstudiovertanim_t *pvanim, int vertCount, float w1, float w2, float w3, float w4 ); +template 
+void CCachedRenderData::ComputeFlexedVertex_StreamOffset<mstudiovertanim_wrinkle_t>( studiohdr_t *pStudioHdr, mstudioflex_t *pflex, + mstudiovertanim_wrinkle_t *pvanim, int vertCount, float w1, float w2, float w3, float w4 ); + +// vectorized +void CCachedRenderData::ComputeFlexedVertex_StreamOffset_Optimized( studiohdr_t *pStudioHdr, mstudioflex_t *pflex, mstudiovertanim_t *pvanim, int vertCount, float w1, float w2, float w3, float w4 ) +{ +#if PROFILE_THIS_FILE + CMiniProfilerGuard mpguard(&g_mp_morph); +#endif +#ifdef _X360 + int nMorphPath = g_cv_morph_path.GetInt(); + if(nMorphPath) + { + mstudiovertanim_t vertCountStruct; + vertCountStruct.index = vertCount; + /*for(uint32 i = 1; i< pflex->numverts; ++i) + if(pvanim[i-1].index > pvanim[i].index) + DebugBreak();*/ + + mstudiovertanim_t * pVertEnd; + { +#if PROFILE_THIS_FILE + CMiniProfilerGuard mpguard_lower_bound(&g_mp_morph_lower_bound); +#endif + pVertEnd = std::lower_bound(pvanim, pvanim + pflex->numverts, vertCountStruct, mstudiovertanim_t::CSortByIndex()); + } + + if(pvanim < pVertEnd) + { + union + { + fltx4 f4; + float f1[4]; + } weights; + weights.f1[0] = w1; + weights.f1[1] = w2; + weights.f1[2] = w3; + weights.f1[3] = w4; + uint32 nCurrentTag = uint32(m_CurrentTag)<<16; + int nThinFlexVertexCount = m_ThinFlexVertexCount; + int32 *pFirstThinFlexIndex = (int32*)m_pFirstThinFlexIndex; + CachedPosNorm_t *pThinFlexVerts = m_pThinFlexVerts; + uint64 numVertsToProcess = pVertEnd - pvanim; + nMorphPath = MIN(7,nMorphPath); + + /*static int maxVertsSaved = 0; + if(numVertsToProcess > maxVertsSaved) + { + maxVertsSaved = numVertsToProcess; + + FileHandle_t fh = g_pFullFileSystem->Open( "vertices.bin", "wb" ); + if(fh != FILESYSTEM_INVALID_HANDLE) + { + g_pFullFileSystem->Write(pvanim, sizeof(*pvanim) * numVertsToProcess, fh); + g_pFullFileSystem->Close(fh); + } + }*/ + + +#ifdef _DEBUG + if(0 == g_cv_morph_debug.GetInt()) +#endif + { + for(uint32 i = 0; i < 2; ++i) // reset the first 2 positions here as it's 
required by the algorithm.. + { + pThinFlexVerts[nThinFlexVertexCount+i].m_Position.InitZero(); + pThinFlexVerts[nThinFlexVertexCount+i].m_Normal.InitZero(); + } + nThinFlexVertexCount = g_fn_ComputeFlexedVertex_StreamOffset[nMorphPath](nThinFlexVertexCount,pThinFlexVerts,pFirstThinFlexIndex,pvanim,nCurrentTag, numVertsToProcess, weights.f4); + } +#ifdef _DEBUG + else // Validation path inactive in release, since these static arrays consume 1MB + { + bool repeat = false; + static CachedPosNorm_t backupThinFlexVerts[MAXSTUDIOFLEXVERTS+1], checkThinFlexVerts[MAXSTUDIOFLEXVERTS+1]; + static CacheIndex_t backupFirstThinFlexIndex[MAXSTUDIOVERTS+1],checkFirstThinFlexIndex[MAXSTUDIOVERTS+1]; + int newThinFlexVertexCount ; + static int numRuns = 0; + ++numRuns; + memcpy(backupThinFlexVerts, m_pThinFlexVerts, sizeof(m_pThinFlexVerts)); + memcpy(backupFirstThinFlexIndex, m_pThinFlexIndex, sizeof(m_pThinFlexIndex)); + do + { + for(uint32 i = 0; i < 2; ++i) // reset the first 2 positions here as it's required by the algorithm.. 
+ { + pThinFlexVerts[nThinFlexVertexCount+i].m_Position.InitZero(); + pThinFlexVerts[nThinFlexVertexCount+i].m_Normal.InitZero(); + } + + newThinFlexVertexCount = g_fn_ComputeFlexedVertex_StreamOffset[nMorphPath](nThinFlexVertexCount,pThinFlexVerts,pFirstThinFlexIndex,pvanim,nCurrentTag, numVertsToProcess, weights.f4); + memcpy(checkThinFlexVerts, m_pThinFlexVerts, sizeof(m_pThinFlexVerts)); + memcpy(checkFirstThinFlexIndex, m_pThinFlexIndex, sizeof(m_pThinFlexIndex)); + memcpy(m_pThinFlexVerts, backupThinFlexVerts, sizeof(m_pThinFlexVerts)); + memcpy(m_pThinFlexIndex, backupFirstThinFlexIndex, sizeof(m_pThinFlexIndex)); + + ComputeFlexedVertex_StreamOffset( pStudioHdr, pflex, pvanim, vertCount, w1, w2, w3, w4); + AlwaysAssert(m_ThinFlexVertexCount == newThinFlexVertexCount); + for(int i = 0; i < newThinFlexVertexCount; ++i) + AlwaysAssert(Diff(checkThinFlexVerts[i], m_pThinFlexVerts[i]) < 1e-5f); + int indexOffset = m_pFirstThinFlexIndex - m_pThinFlexIndex; + for(int i = 0; i < numVertsToProcess; ++i) + AlwaysAssert(*(int*)&checkFirstThinFlexIndex[indexOffset + pvanim[i].index] == *(int*)&m_pThinFlexIndex[indexOffset + pvanim[i].index]); + + if(repeat) + { + m_ThinFlexVertexCount = nThinFlexVertexCount; + memcpy(m_pThinFlexVerts, backupThinFlexVerts, sizeof(m_pThinFlexVerts)); + memcpy(m_pThinFlexIndex, backupFirstThinFlexIndex, sizeof(m_pThinFlexIndex)); + } + } + while(repeat); + nThinFlexVertexCount = newThinFlexVertexCount; + } +#endif + m_ThinFlexVertexCount = nThinFlexVertexCount; + } + } + else +#endif + { + ComputeFlexedVertex_StreamOffset( pStudioHdr, pflex, pvanim, vertCount, w1, w2, w3, w4); + } +} + + +void CCachedRenderData::ComputeFlexedVertexWrinkle_StreamOffset_Optimized( studiohdr_t *pStudioHdr, mstudioflex_t *pflex, mstudiovertanim_wrinkle_t *pvanim, int vertCount, float w1, float w2, float w3, float w4) +{ +#if PROFILE_THIS_FILE + CMiniProfilerGuard mpguard(&g_mp_morph); +#endif + +#ifdef _X360 + int nMorphPath = g_cv_morph_path.GetInt(); + 
if(nMorphPath) + { + mstudiovertanim_wrinkle_t vertCountStruct; + vertCountStruct.index = vertCount; + + mstudiovertanim_wrinkle_t * pVertEnd; + { +#if PROFILE_THIS_FILE + CMiniProfilerGuard mpguard_lower_bound(&g_mp_morph_lower_bound); +#endif + pVertEnd = std::lower_bound(pvanim, pvanim + pflex->numverts, vertCountStruct, mstudiovertanim_wrinkle_t::CSortByIndex()); + } + + if(pvanim < pVertEnd) + { + union + { + fltx4 f4; + float f1[4]; + } weights; + weights.f1[0] = w1; + weights.f1[1] = w2; + weights.f1[2] = w3; + weights.f1[3] = w4; + uint32 nCurrentTag = uint32(m_CurrentTag)<<16; + int nThinFlexVertexCount = m_ThinFlexVertexCount; + int32 *pFirstThinFlexIndex = (int32*)m_pFirstThinFlexIndex; + CachedPosNorm_t *pThinFlexVerts = m_pThinFlexVerts; + uint64 numVertsToProcess = pVertEnd - pvanim; + nMorphPath = MIN(7,nMorphPath); + +#ifdef _DEBUG + if(0 == g_cv_morph_debug.GetInt()) +#endif + { + for(uint32 i = 0; i < 2; ++i) // reset the first 2 positions here as it's required by the algorithm.. 
+ { + pThinFlexVerts[nThinFlexVertexCount+i].m_Position.InitZero(); + pThinFlexVerts[nThinFlexVertexCount+i].m_Normal.InitZero(); + } + nThinFlexVertexCount = g_fn_ComputeFlexedVertexWrinkle_StreamOffset[nMorphPath](nThinFlexVertexCount,pThinFlexVerts,pFirstThinFlexIndex,pvanim,nCurrentTag, numVertsToProcess, weights.f4); + } +#ifdef _DEBUG + else // Validation path inactive in release, since these static arrays consume 1MB + { + bool repeat = false; + static CachedPosNorm_t backupThinFlexVerts[MAXSTUDIOFLEXVERTS+1], checkThinFlexVerts[MAXSTUDIOFLEXVERTS+1]; + static CacheIndex_t backupFirstThinFlexIndex[MAXSTUDIOVERTS+1],checkFirstThinFlexIndex[MAXSTUDIOVERTS+1]; + int newThinFlexVertexCount ; + static int numRuns = 0; + ++numRuns; + memcpy(backupThinFlexVerts, m_pThinFlexVerts, sizeof(m_pThinFlexVerts)); + memcpy(backupFirstThinFlexIndex, m_pThinFlexIndex, sizeof(m_pThinFlexIndex)); + do + { + for(uint32 i = 0; i < 2; ++i) // reset the first 2 positions here as it's required by the algorithm.. 
+ { + pThinFlexVerts[nThinFlexVertexCount+i].m_Position.InitZero(); + pThinFlexVerts[nThinFlexVertexCount+i].m_Normal.InitZero(); + } + + newThinFlexVertexCount = g_fn_ComputeFlexedVertexWrinkle_StreamOffset[nMorphPath](nThinFlexVertexCount,pThinFlexVerts,pFirstThinFlexIndex,pvanim,nCurrentTag, numVertsToProcess, weights.f4); + memcpy(checkThinFlexVerts, m_pThinFlexVerts, sizeof(m_pThinFlexVerts)); + memcpy(checkFirstThinFlexIndex, m_pThinFlexIndex, sizeof(m_pThinFlexIndex)); + memcpy(m_pThinFlexVerts, backupThinFlexVerts, sizeof(m_pThinFlexVerts)); + memcpy(m_pThinFlexIndex, backupFirstThinFlexIndex, sizeof(m_pThinFlexIndex)); + + ComputeFlexedVertex_StreamOffset( pStudioHdr, pflex, pvanim, vertCount, w1, w2, w3, w4); + AlwaysAssert(m_ThinFlexVertexCount == newThinFlexVertexCount); + for(int i = 0; i < newThinFlexVertexCount; ++i) + AlwaysAssert(Diff(checkThinFlexVerts[i], m_pThinFlexVerts[i]) < 1e-5f); + int indexOffset = m_pFirstThinFlexIndex - m_pThinFlexIndex; + for(int i = 0; i < numVertsToProcess; ++i) + AlwaysAssert(*(int*)&checkFirstThinFlexIndex[indexOffset + pvanim[i].index] == *(int*)&m_pThinFlexIndex[indexOffset + pvanim[i].index]); + + if(repeat) + { + m_ThinFlexVertexCount = nThinFlexVertexCount; + memcpy(m_pThinFlexVerts, backupThinFlexVerts, sizeof(m_pThinFlexVerts)); + memcpy(m_pThinFlexIndex, backupFirstThinFlexIndex, sizeof(m_pThinFlexIndex)); + } + } + while(repeat); + nThinFlexVertexCount = newThinFlexVertexCount; + } +#endif + m_ThinFlexVertexCount = nThinFlexVertexCount; + } + } + else +#endif + { + ComputeFlexedVertex_StreamOffset( pStudioHdr, pflex, pvanim, vertCount, w1, w2, w3, w4); + } +} + +#endif // PLATFORM_WINDOWS
\ No newline at end of file |