From f56bb35301836e56582a575a75864392a0177875 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B8rgen=20P=2E=20Tjern=C3=B8?= Date: Mon, 2 Dec 2013 19:31:46 -0800 Subject: Fix line endings. WHAMMY. --- mp/src/mathlib/sseconst.cpp | 2328 +++++++++++++++++++++---------------------- 1 file changed, 1164 insertions(+), 1164 deletions(-) (limited to 'mp/src/mathlib/sseconst.cpp') diff --git a/mp/src/mathlib/sseconst.cpp b/mp/src/mathlib/sseconst.cpp index d9ba06b2..2f923193 100644 --- a/mp/src/mathlib/sseconst.cpp +++ b/mp/src/mathlib/sseconst.cpp @@ -1,1164 +1,1164 @@ -//========= Copyright Valve Corporation, All rights reserved. ============// -// -// Purpose: -// -//===========================================================================// - -#include "mathlib/ssemath.h" -#include "mathlib/ssequaternion.h" - -const fltx4 Four_PointFives={0.5,0.5,0.5,0.5}; -#ifndef _X360 -const fltx4 Four_Zeros={0.0,0.0,0.0,0.0}; -const fltx4 Four_Ones={1.0,1.0,1.0,1.0}; -#endif -const fltx4 Four_Twos={2.0,2.0,2.0,2.0}; -const fltx4 Four_Threes={3.0,3.0,3.0,3.0}; -const fltx4 Four_Fours={4.0,4.0,4.0,4.0}; -const fltx4 Four_Origin={0,0,0,1}; -const fltx4 Four_NegativeOnes={-1,-1,-1,-1}; - -const fltx4 Four_2ToThe21s={ (float) (1<<21), (float) (1<<21), (float) (1<<21), (float)(1<<21) }; -const fltx4 Four_2ToThe22s={ (float) (1<<22), (float) (1<<22), (float) (1<<22), (float)(1<<22) }; -const fltx4 Four_2ToThe23s={ (float) (1<<23), (float) (1<<23), (float) (1<<23), (float)(1<<23) }; -const fltx4 Four_2ToThe24s={ (float) (1<<24), (float) (1<<24), (float) (1<<24), (float)(1<<24) }; - -const fltx4 Four_Point225s={ .225, .225, .225, .225 }; -const fltx4 Four_Epsilons={FLT_EPSILON,FLT_EPSILON,FLT_EPSILON,FLT_EPSILON}; - -const fltx4 Four_FLT_MAX={FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX}; -const fltx4 Four_Negative_FLT_MAX={-FLT_MAX,-FLT_MAX,-FLT_MAX,-FLT_MAX}; -const fltx4 g_SIMD_0123 = { 0., 1., 2., 3. 
}; - -const fltx4 g_QuatMultRowSign[4] = -{ - { 1.0f, 1.0f, -1.0f, 1.0f }, - { -1.0f, 1.0f, 1.0f, 1.0f }, - { 1.0f, -1.0f, 1.0f, 1.0f }, - { -1.0f, -1.0f, -1.0f, 1.0f } -}; - -const int32 ALIGN16 g_SIMD_clear_signmask[4] ALIGN16_POST = {0x7fffffff,0x7fffffff,0x7fffffff,0x7fffffff}; -const int32 ALIGN16 g_SIMD_signmask[4] ALIGN16_POST = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 }; -const int32 ALIGN16 g_SIMD_lsbmask[4] ALIGN16_POST = { 0xfffffffe, 0xfffffffe, 0xfffffffe, 0xfffffffe }; -const int32 ALIGN16 g_SIMD_clear_wmask[4] ALIGN16_POST = { 0xffffffff, 0xffffffff, 0xffffffff, 0 }; -const int32 ALIGN16 g_SIMD_AllOnesMask[4] ALIGN16_POST = { 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff }; // ~0,~0,~0,~0 -const int32 ALIGN16 g_SIMD_Low16BitsMask[4] ALIGN16_POST = { 0xffff, 0xffff, 0xffff, 0xffff }; // 0xffff x 4 - -const int32 ALIGN16 g_SIMD_ComponentMask[4][4] ALIGN16_POST = -{ - { 0xFFFFFFFF, 0, 0, 0 }, { 0, 0xFFFFFFFF, 0, 0 }, { 0, 0, 0xFFFFFFFF, 0 }, { 0, 0, 0, 0xFFFFFFFF } -}; - -const int32 ALIGN16 g_SIMD_SkipTailMask[4][4] ALIGN16_POST = -{ - { 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff }, - { 0xffffffff, 0x00000000, 0x00000000, 0x00000000 }, - { 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 }, - { 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000 }, -}; - - - // FUNCTIONS - // NOTE: WHY YOU **DO NOT** WANT TO PUT FUNCTIONS HERE -// Generally speaking, you want to make sure SIMD math functions -// are inlined, because that gives the compiler much more latitude -// in instruction scheduling. It's not that the overhead of calling -// the function is particularly great; rather, many of the SIMD -// opcodes have long latencies, and if you have a sequence of -// several dependent ones inside a function call, the latencies -// stack up to create a big penalty. If the function is inlined, -// the compiler can interleave its operations with ones from the -// caller to better hide those latencies. 
Finally, on the 360, -// putting parameters or return values on the stack, and then -// reading them back within the next forty cycles, is a very -// severe penalty. So, as much as possible, you want to leave your -// data on the registers. - -// That said, there are certain occasions where it is appropriate -// to call into functions -- particularly for very large blocks -// of code that will spill most of the registers anyway. Unless your -// function is more than one screen long, yours is probably not one -// of those occasions. - - - -/// You can use this to rotate a long array of FourVectors all by the same -/// matrix. The first parameter is the head of the array. The second is the -/// number of vectors to rotate. The third is the matrix. -void FourVectors::RotateManyBy(FourVectors * RESTRICT pVectors, unsigned int numVectors, const matrix3x4_t& rotationMatrix ) -{ - Assert(numVectors > 0); - if ( numVectors == 0 ) - return; - - // Splat out each of the entries in the matrix to a fltx4. Do this - // in the order that we will need them, to hide latency. I'm - // avoiding making an array of them, so that they'll remain in - // registers. - fltx4 matSplat00, matSplat01, matSplat02, - matSplat10, matSplat11, matSplat12, - matSplat20, matSplat21, matSplat22; - - { - // Load the matrix into local vectors. Sadly, matrix3x4_ts are - // often unaligned. The w components will be the tranpose row of - // the matrix, but we don't really care about that. 
- fltx4 matCol0 = LoadUnalignedSIMD(rotationMatrix[0]); - fltx4 matCol1 = LoadUnalignedSIMD(rotationMatrix[1]); - fltx4 matCol2 = LoadUnalignedSIMD(rotationMatrix[2]); - - matSplat00 = SplatXSIMD(matCol0); - matSplat01 = SplatYSIMD(matCol0); - matSplat02 = SplatZSIMD(matCol0); - - matSplat10 = SplatXSIMD(matCol1); - matSplat11 = SplatYSIMD(matCol1); - matSplat12 = SplatZSIMD(matCol1); - - matSplat20 = SplatXSIMD(matCol2); - matSplat21 = SplatYSIMD(matCol2); - matSplat22 = SplatZSIMD(matCol2); - } - -#ifdef _X360 - // Same algorithm as above, but the loop is unrolled to eliminate data hazard latencies - // and simplify prefetching. Named variables are deliberately used instead of arrays to - // ensure that the variables live on the registers instead of the stack (stack load/store - // is a serious penalty on 360). Nb: for prefetching to be most efficient here, the - // loop should be unrolled to 8 FourVectors per iteration; because each FourVectors is - // 48 bytes long, 48 * 8 = 384, its least common multiple with the 128-byte cache line. - // That way you can fetch the next 3 cache lines while you work on these three. - // If you do go this route, be sure to dissassemble and make sure it doesn't spill - // registers to stack as you do this; the cost of that will be excessive. Unroll the loop - // a little and just live with the fact that you'll be doing a couple of redundant dbcts - // (they don't cost you anything). Be aware that all three cores share L2 and it can only - // have eight cache lines fetching at a time. - fltx4 outX0, outY0, outZ0; // bank one of outputs - fltx4 outX1, outY1, outZ1; // bank two of outputs - - - // Because of instruction latencies and scheduling, it's actually faster to use adds and muls - // rather than madds. (Empirically determined by timing.) - const FourVectors * stop = pVectors + numVectors; - FourVectors * RESTRICT pVectNext; - // prime the pump. 
- if (numVectors & 0x01) - { - // odd number of vectors to process - // prime the 1 group of registers - pVectNext = pVectors++; - outX1 = AddSIMD( AddSIMD( MulSIMD( pVectNext->x, matSplat00 ), MulSIMD( pVectNext->y, matSplat01 ) ), MulSIMD( pVectNext->z, matSplat02 ) ); - outY1 = AddSIMD( AddSIMD( MulSIMD( pVectNext->x, matSplat10 ), MulSIMD( pVectNext->y, matSplat11 ) ), MulSIMD( pVectNext->z, matSplat12 ) ); - outZ1 = AddSIMD( AddSIMD( MulSIMD( pVectNext->x, matSplat20 ), MulSIMD( pVectNext->y, matSplat21 ) ), MulSIMD( pVectNext->z, matSplat22 ) ); - } - else - { - // even number of total vectors to process; - // prime the zero group and jump into the middle of the loop - outX0 = AddSIMD( AddSIMD( MulSIMD( pVectors->x, matSplat00 ), MulSIMD( pVectors->y, matSplat01 ) ), MulSIMD( pVectors->z, matSplat02 ) ); - outY0 = AddSIMD( AddSIMD( MulSIMD( pVectors->x, matSplat10 ), MulSIMD( pVectors->y, matSplat11 ) ), MulSIMD( pVectors->z, matSplat12 ) ); - outZ0 = AddSIMD( AddSIMD( MulSIMD( pVectors->x, matSplat20 ), MulSIMD( pVectors->y, matSplat21 ) ), MulSIMD( pVectors->z, matSplat22 ) ); - goto EVEN_CASE; - } - - // perform an even number of iterations through this loop. 
- while (pVectors < stop) - { - outX0 = MaddSIMD( pVectors->z, matSplat02, AddSIMD( MulSIMD( pVectors->x, matSplat00 ), MulSIMD( pVectors->y, matSplat01 ) ) ); - outY0 = MaddSIMD( pVectors->z, matSplat12, AddSIMD( MulSIMD( pVectors->x, matSplat10 ), MulSIMD( pVectors->y, matSplat11 ) ) ); - outZ0 = MaddSIMD( pVectors->z, matSplat22, AddSIMD( MulSIMD( pVectors->x, matSplat20 ), MulSIMD( pVectors->y, matSplat21 ) ) ); - - pVectNext->x = outX1; - pVectNext->y = outY1; - pVectNext->z = outZ1; - -EVEN_CASE: - pVectNext = pVectors+1; - - outX1 = MaddSIMD( pVectNext->z, matSplat02, AddSIMD( MulSIMD( pVectNext->x, matSplat00 ), MulSIMD( pVectNext->y, matSplat01 ) ) ); - outY1 = MaddSIMD( pVectNext->z, matSplat12, AddSIMD( MulSIMD( pVectNext->x, matSplat10 ), MulSIMD( pVectNext->y, matSplat11 ) ) ); - outZ1 = MaddSIMD( pVectNext->z, matSplat22, AddSIMD( MulSIMD( pVectNext->x, matSplat20 ), MulSIMD( pVectNext->y, matSplat21 ) ) ); - - pVectors->x = outX0; - pVectors->y = outY0; - pVectors->z = outZ0; - - pVectors += 2; - } - - // flush the last round of output - pVectNext->x = outX1; - pVectNext->y = outY1; - pVectNext->z = outZ1; -#else - // PC does not benefit from the unroll/scheduling above - fltx4 outX0, outY0, outZ0; // bank one of outputs - - - // Because of instruction latencies and scheduling, it's actually faster to use adds and muls - // rather than madds. (Empirically determined by timing.) - const FourVectors * stop = pVectors + numVectors; - - // perform an even number of iterations through this loop. 
- while (pVectors < stop) - { - outX0 = MaddSIMD( pVectors->z, matSplat02, AddSIMD( MulSIMD( pVectors->x, matSplat00 ), MulSIMD( pVectors->y, matSplat01 ) ) ); - outY0 = MaddSIMD( pVectors->z, matSplat12, AddSIMD( MulSIMD( pVectors->x, matSplat10 ), MulSIMD( pVectors->y, matSplat11 ) ) ); - outZ0 = MaddSIMD( pVectors->z, matSplat22, AddSIMD( MulSIMD( pVectors->x, matSplat20 ), MulSIMD( pVectors->y, matSplat21 ) ) ); - - pVectors->x = outX0; - pVectors->y = outY0; - pVectors->z = outZ0; - pVectors++; - } -#endif -} - -#ifdef _X360 -// Loop-scheduled code to process FourVectors in groups of eight quite efficiently. -void FourVectors_TransformManyGroupsOfEightBy(FourVectors * RESTRICT pVectors, unsigned int numVectors, const matrix3x4_t& rotationMatrix, FourVectors * RESTRICT pOut ) -{ - Assert(numVectors > 0); - if ( numVectors == 0 ) - return; - - AssertMsg( (pOut < pVectors && pOut+numVectors <= pVectors) || - (pOut > pVectors && pVectors+numVectors <= pOut), "FourVectors::TransformManyBy called with overlapping buffer pointers." ); - - // Splat out each of the entries in the matrix to a fltx4. Do this - // in the order that we will need them, to hide latency. I'm - // avoiding making an array of them, so that they'll remain in - // registers. - fltx4 matSplat00, matSplat01, matSplat02, matSplat03, // TWELVE REGISTERS - matSplat10, matSplat11, matSplat12, matSplat13, - matSplat20, matSplat21, matSplat22, matSplat23; - - { - // Load the matrix into local vectors. Sadly, matrix3x4_ts are - // often unaligned. The w components will be the tranpose row of - // the matrix. 
- fltx4 matCol0 = LoadUnalignedSIMD(rotationMatrix[0]); - fltx4 matCol1 = LoadUnalignedSIMD(rotationMatrix[1]); - fltx4 matCol2 = LoadUnalignedSIMD(rotationMatrix[2]); - - matSplat00 = SplatXSIMD(matCol0); - matSplat01 = SplatYSIMD(matCol0); - matSplat02 = SplatZSIMD(matCol0); - matSplat03 = SplatWSIMD(matCol0); - - matSplat10 = SplatXSIMD(matCol1); - matSplat11 = SplatYSIMD(matCol1); - matSplat12 = SplatZSIMD(matCol1); - matSplat13 = SplatWSIMD(matCol1); - - matSplat20 = SplatXSIMD(matCol2); - matSplat21 = SplatYSIMD(matCol2); - matSplat22 = SplatZSIMD(matCol2); - matSplat23 = SplatWSIMD(matCol2); - } - - // this macro defines how to compute a specific row from an input and certain splat columns -#define COMPUTE(res, invec, xterm, yterm, zterm, transterm) res = AddSIMD( AddSIMD( MulSIMD((invec)->z, zterm), AddSIMD( MulSIMD( (invec)->x, xterm ), MulSIMD( (invec)->y, yterm ) ) ), transterm ) -#define WRITE(term, reg, toptr) toptr->term = reg - - // define result groups (we're going to have an eight-way unroll) - - fltx4 res0X, res0Y, res0Z, res0XTemp, res0YTemp, res0ZTemp; // 48 REGISTERS - fltx4 res1X, res1Y, res1Z, res1XTemp, res1YTemp, res1ZTemp; - fltx4 res2X, res2Y, res2Z, res2XTemp, res2YTemp, res2ZTemp; - fltx4 res3X, res3Y, res3Z, res3XTemp, res3YTemp, res3ZTemp; - fltx4 res4X, res4Y, res4Z, res4XTemp, res4YTemp, res4ZTemp; - fltx4 res5X, res5Y, res5Z, res5XTemp, res5YTemp, res5ZTemp; - fltx4 res6X, res6Y, res6Z, res6XTemp, res6YTemp, res6ZTemp; - fltx4 res7X, res7Y, res7Z, res7XTemp, res7YTemp, res7ZTemp; - - -// #define FROZ(out,in,offset) COMPUTE((out+offset)->x, (in + offset), matSplat00, matSplat01, matSplat02, matSplat03); COMPUTE((out + offset )->y, (in + offset), matSplat10, matSplat11, matSplat12, matSplat13); COMPUTE((out + offset)->z, (in + offset), matSplat20, matSplat21, matSplat22, matSplat23) -#define COMPUTE_GROUP(resgroup,dataptr) COMPUTE(resgroup ## X, (dataptr), matSplat00, matSplat01, matSplat02, matSplat03); COMPUTE(resgroup ## Y, 
(dataptr), matSplat10, matSplat11, matSplat12, matSplat13); COMPUTE(resgroup ## Z, (dataptr), matSplat20, matSplat21, matSplat22, matSplat23) -#define WRITE_GROUP(ptr, resgroup) (ptr)->x = resgroup ## X; (ptr)->y = resgroup ## Y; (ptr)->z = resgroup ## Z - - /* - // stage 1 -- 6 ops for xyz, each w 12 cycle latency - res0X = MulSIMD( (invec)->y, matSplat01 ); - res0Temp = MaddSIMD((invec)->z, matSplat02, matSplat03); - // stage 2 -- 3 clocks for xyz - res0X = MaddSIMD( (invec)->x, matSplat00, res0X ); - // stage 3 -- 3 clocks for xyz - res0X = AddSIMD(res0X, res0Temp); - */ -#define COMPUTE_STAGE1_ROW(res, tempvar, invec, xsplat, ysplat, zsplat, transplat) res = MulSIMD( (invec)->y, ysplat ); tempvar = MaddSIMD((invec)->z, zsplat, transplat) -#define COMPUTE_STAGE2_ROW(res, tempvar, invec, xsplat, ysplat, zsplat, transplat) res = MaddSIMD( (invec)->x, xsplat, res ) -#define COMPUTE_STAGE3_ROW(res, tempvar, invec, xsplat, ysplat, zsplat, transplat) res = AddSIMD(res, tempvar) // frees up the tempvar - -#define COMPUTE_STAGE1_GROUP(resgroup, invec) COMPUTE_STAGE1_ROW(resgroup ## X, resgroup ## X ## Temp, invec, matSplat00, matSplat01, matSplat02, matSplat03);\ - COMPUTE_STAGE1_ROW(resgroup ## Y, resgroup ## Y ## Temp, invec, matSplat10, matSplat11, matSplat12, matSplat13);\ - COMPUTE_STAGE1_ROW(resgroup ## Z, resgroup ## Z ## Temp, invec, matSplat20, matSplat21, matSplat22, matSplat23) - -#define COMPUTE_STAGE2_GROUP(resgroup, invec) COMPUTE_STAGE2_ROW(resgroup ## X, resgroup ## X ## Temp, invec, matSplat00, matSplat01, matSplat02, matSplat03);\ - COMPUTE_STAGE2_ROW(resgroup ## Y, resgroup ## Y ## Temp, invec, matSplat10, matSplat11, matSplat12, matSplat13);\ - COMPUTE_STAGE2_ROW(resgroup ## Z, resgroup ## Z ## Temp, invec, matSplat20, matSplat21, matSplat22, matSplat23) - -#define COMPUTE_STAGE3_GROUP(resgroup, invec) COMPUTE_STAGE3_ROW(resgroup ## X, resgroup ## X ## Temp, invec, matSplat00, matSplat01, matSplat02, matSplat03);\ - COMPUTE_STAGE3_ROW(resgroup ## Y, 
resgroup ## Y ## Temp, invec, matSplat10, matSplat11, matSplat12, matSplat13);\ - COMPUTE_STAGE3_ROW(resgroup ## Z, resgroup ## Z ## Temp, invec, matSplat20, matSplat21, matSplat22, matSplat23) - - FourVectors * RESTRICT inData = pVectors; - FourVectors * RESTRICT outData = pOut; - const FourVectors * const RESTRICT STOP = pVectors + numVectors; - - // Use techniques of loop scheduling to eliminate data hazards; process - // eight groups simultaneously so that we never have any operations stalling - // waiting for data. - // Note: this loop, while pretty fast, could be faster still -- you'll notice - // that it does all of its loads, then all computation, then writes everything - // out. If made truly cyclic, such that every line interleaved a stage 1, stage 2, - // stage 3, and write, then throughput could be higher (probably by about 50%). - while (inData < STOP) - { - // start prefetching the three cache lines - // we'll hit two iterations from now - __dcbt( sizeof(FourVectors) * 16, inData ); - __dcbt( sizeof(FourVectors) * 16 + 128, inData ); - __dcbt( sizeof(FourVectors) * 16 + 256, inData ); - - // synchro - COMPUTE_STAGE1_GROUP(res0, inData + 0); - COMPUTE_STAGE1_GROUP(res1, inData + 1); - COMPUTE_STAGE1_GROUP(res2, inData + 2); - COMPUTE_STAGE1_GROUP(res3, inData + 3); - - COMPUTE_STAGE2_GROUP(res0, inData + 0); - COMPUTE_STAGE1_GROUP(res4, inData + 4); - COMPUTE_STAGE2_GROUP(res1, inData + 1); - COMPUTE_STAGE1_GROUP(res5, inData + 5); - COMPUTE_STAGE2_GROUP(res2, inData + 2); - COMPUTE_STAGE1_GROUP(res6, inData + 6); - COMPUTE_STAGE2_GROUP(res3, inData + 3); - COMPUTE_STAGE1_GROUP(res7, inData + 7); - - COMPUTE_STAGE3_GROUP(res0, inData + 0); - COMPUTE_STAGE2_GROUP(res4, inData + 4); - COMPUTE_STAGE3_GROUP(res1, inData + 1); - COMPUTE_STAGE2_GROUP(res5, inData + 5); - COMPUTE_STAGE3_GROUP(res2, inData + 2); - COMPUTE_STAGE2_GROUP(res6, inData + 6); - COMPUTE_STAGE3_GROUP(res3, inData + 3); - COMPUTE_STAGE2_GROUP(res7, inData + 7); - - 
COMPUTE_STAGE3_GROUP(res4, inData + 4); - WRITE_GROUP( outData + 0, res0 ); - COMPUTE_STAGE3_GROUP(res5, inData + 5); - WRITE_GROUP( outData + 1, res1 ); - COMPUTE_STAGE3_GROUP(res6, inData + 6); - WRITE_GROUP( outData + 2, res2 ); - COMPUTE_STAGE3_GROUP(res7, inData + 7); - WRITE_GROUP( outData + 3, res3 ); - - - WRITE_GROUP( outData + 4, res4 ); - WRITE_GROUP( outData + 5, res5 ); - WRITE_GROUP( outData + 6, res6 ); - WRITE_GROUP( outData + 7, res7 ); - - inData += 8; - outData += 8; - } - - -#undef COMPUTE -#undef WRITE -#undef COMPUTE_STAGE1_ROW -#undef COMPUTE_STAGE2_ROW -#undef COMPUTE_STAGE3_ROW -#undef COMPUTE_STAGE1_GROUP -#undef COMPUTE_STAGE2_GROUP -#undef COMPUTE_STAGE3_GROUP -#undef COMPUTE_GROUP -#undef WRITE_GROUP -} -

#ifdef _X360
// Loop-scheduled code to process FourVectors in groups of eight quite efficiently. This is the
// version to call when both buffers start on a 128-byte-aligned address.
//
// pVectors       - input array, 128-byte aligned.
// numVectors     - number of FourVectors to transform; must be a non-zero multiple of eight.
//                  Anything else asserts; in builds without asserts the count is rounded down
//                  to a multiple of eight so the unrolled loops cannot run past the buffers.
// rotationMatrix - 3x4 transform. The fourth lane of each row is added to the corresponding
//                  output component as the translation term.
// pOut           - output array, 128-byte aligned, must not overlap the input. Its cache lines
//                  are zero-flushed (__dcbz128) before being written.
void FourVectors_TransformManyGroupsOfEightBy_128byteAligned(FourVectors * RESTRICT pVectors, unsigned int numVectors, const matrix3x4_t& rotationMatrix, FourVectors * RESTRICT pOut )
{
	/* If this has changed, you will need to change all the prefetches, *
	 * and groups of eight are no longer the ideal unit for iterating   *
	 * on many vectors.													*/
	COMPILE_TIME_ASSERT( sizeof(FourVectors) == 48 ) ;

	Assert(numVectors > 0);
	AssertMsg((numVectors & 0x07) == 0, "FourVectors_TransformManyGroupsOfEight called with numVectors % 8 != 0!");

	// Every loop below consumes exactly eight elements per trip, so never let
	// a stray remainder through: drop it here rather than overrun the buffers.
	numVectors &= ~0x07u;
	if ( numVectors == 0 )
		return;

	// Assert alignment
	AssertMsg( ( ( reinterpret_cast<size_t>( pVectors ) & 127 ) == 0) &&
			   ( ( reinterpret_cast<size_t>( pOut ) & 127 ) == 0),
		"FourVectors_Transform..aligned called with non-128-byte-aligned buffers." );

	// Assert non overlap
	AssertMsg( (pOut < pVectors && pOut+numVectors <= pVectors) ||
			   (pOut > pVectors && pVectors+numVectors <= pOut), "FourVectors::TransformManyBy called with overlapping buffer pointers." );

	// Here's the plan. 8 four-vecs = 3 cache lines exactly. By the original
	// author's estimate it takes about 400 cycles to process a group of eight
	// and cache latency is 600 cycles, so we prefetch two iterations ahead
	// (eg fetch iteration 3 while working on iteration 1). For the output we
	// can simply zero-flush the cache lines since we are sure to write into
	// them. Because we read and flush two groups ahead, the main loop stops
	// two groups short of the end and a plain loop finishes the job.

	// No matter what, we will need to prefetch the first two groups of eight
	// of input (that's the first six cache lines).
	__dcbt( 0, pVectors );
	__dcbt( 128, pVectors );
	__dcbt( 256, pVectors );
	__dcbt( 384, pVectors );
	__dcbt( 512, pVectors );
	__dcbt( 640, pVectors );

	// Splat out each of the entries in the matrix to a fltx4. Do this
	// in the order that we will need them, to hide latency. Named variables
	// are used instead of an array so that they'll remain in registers.
	fltx4 matSplat00, matSplat01, matSplat02, matSplat03,	// TWELVE REGISTERS
		  matSplat10, matSplat11, matSplat12, matSplat13,
		  matSplat20, matSplat21, matSplat22, matSplat23;

	{
		// Load the matrix into local vectors. matrix3x4_ts are often
		// unaligned, hence the unaligned loads. The w lane of each row is
		// the translation term for that output component.
		fltx4 matCol0 = LoadUnalignedSIMD(rotationMatrix[0]);
		fltx4 matCol1 = LoadUnalignedSIMD(rotationMatrix[1]);
		fltx4 matCol2 = LoadUnalignedSIMD(rotationMatrix[2]);

		matSplat00 = SplatXSIMD(matCol0);
		matSplat01 = SplatYSIMD(matCol0);
		matSplat02 = SplatZSIMD(matCol0);
		matSplat03 = SplatWSIMD(matCol0);

		matSplat10 = SplatXSIMD(matCol1);
		matSplat11 = SplatYSIMD(matCol1);
		matSplat12 = SplatZSIMD(matCol1);
		matSplat13 = SplatWSIMD(matCol1);

		matSplat20 = SplatXSIMD(matCol2);
		matSplat21 = SplatYSIMD(matCol2);
		matSplat22 = SplatZSIMD(matCol2);
		matSplat23 = SplatWSIMD(matCol2);
	}

	// define result groups (we're going to have an eight-way unroll)

	fltx4 res0X, res0Y, res0Z, res0XTemp, res0YTemp, res0ZTemp;	// 48 REGISTERS
	fltx4 res1X, res1Y, res1Z, res1XTemp, res1YTemp, res1ZTemp;
	fltx4 res2X, res2Y, res2Z, res2XTemp, res2YTemp, res2ZTemp;
	fltx4 res3X, res3Y, res3Z, res3XTemp, res3YTemp, res3ZTemp;
	fltx4 res4X, res4Y, res4Z, res4XTemp, res4YTemp, res4ZTemp;
	fltx4 res5X, res5Y, res5Z, res5XTemp, res5YTemp, res5ZTemp;
	fltx4 res6X, res6Y, res6Z, res6XTemp, res6YTemp, res6ZTemp;
	fltx4 res7X, res7Y, res7Z, res7XTemp, res7YTemp, res7ZTemp;

	// Store one finished result group.
#define WRITE_GROUP(ptr, resgroup) (ptr)->x = resgroup ## X; (ptr)->y = resgroup ## Y; (ptr)->z = resgroup ## Z

	/*
	Each output row is computed in three dependent stages so that stages of
	different groups can be interleaved:
	// stage 1
	res0X    = MulSIMD( (invec)->y, matSplat01 );
	res0Temp = MaddSIMD((invec)->z, matSplat02, matSplat03);
	// stage 2
	res0X    = MaddSIMD( (invec)->x, matSplat00, res0X );
	// stage 3
	res0X    = AddSIMD(res0X, res0Temp);
	*/
#define COMPUTE_STAGE1_ROW(res, tempvar, invec, xsplat, ysplat, zsplat, transplat) res = MulSIMD( (invec)->y, ysplat ); tempvar = MaddSIMD((invec)->z, zsplat, transplat)
#define COMPUTE_STAGE2_ROW(res, tempvar, invec, xsplat, ysplat, zsplat, transplat) res = MaddSIMD( (invec)->x, xsplat, res )
#define COMPUTE_STAGE3_ROW(res, tempvar, invec, xsplat, ysplat, zsplat, transplat) res = AddSIMD(res, tempvar) // frees up the tempvar

#define COMPUTE_STAGE1_GROUP(resgroup, invec) COMPUTE_STAGE1_ROW(resgroup ## X, resgroup ## X ## Temp, invec, matSplat00, matSplat01, matSplat02, matSplat03);\
	COMPUTE_STAGE1_ROW(resgroup ## Y, resgroup ## Y ## Temp, invec, matSplat10, matSplat11, matSplat12, matSplat13);\
	COMPUTE_STAGE1_ROW(resgroup ## Z, resgroup ## Z ## Temp, invec, matSplat20, matSplat21, matSplat22, matSplat23)

#define COMPUTE_STAGE2_GROUP(resgroup, invec) COMPUTE_STAGE2_ROW(resgroup ## X, resgroup ## X ## Temp, invec, matSplat00, matSplat01, matSplat02, matSplat03);\
	COMPUTE_STAGE2_ROW(resgroup ## Y, resgroup ## Y ## Temp, invec, matSplat10, matSplat11, matSplat12, matSplat13);\
	COMPUTE_STAGE2_ROW(resgroup ## Z, resgroup ## Z ## Temp, invec, matSplat20, matSplat21, matSplat22, matSplat23)

#define COMPUTE_STAGE3_GROUP(resgroup, invec) COMPUTE_STAGE3_ROW(resgroup ## X, resgroup ## X ## Temp, invec, matSplat00, matSplat01, matSplat02, matSplat03);\
	COMPUTE_STAGE3_ROW(resgroup ## Y, resgroup ## Y ## Temp, invec, matSplat10, matSplat11, matSplat12, matSplat13);\
	COMPUTE_STAGE3_ROW(resgroup ## Z, resgroup ## Z ## Temp, invec, matSplat20, matSplat21, matSplat22, matSplat23)


	// First do all but the last two turns of the crank; we don't want to
	// overshoot the buffers with the prefetch or the flush-to-zero.
	FourVectors * RESTRICT inData = pVectors;
	FourVectors * RESTRICT outData = pOut;
	const FourVectors * RESTRICT STOP;
	if (numVectors > 16)
	{
		STOP = pVectors + numVectors - 16;
		// zero-flush the first block we'll write into
		__dcbz128( 0, outData );
		__dcbz128( 128, outData );
		__dcbz128( 256, outData );

		while (inData < STOP)
		{
			// start prefetching the three cache lines
			// we'll hit two iterations from now
			__dcbt( sizeof(FourVectors) * 16, inData );
			__dcbt( sizeof(FourVectors) * 16 + 128, inData );
			__dcbt( sizeof(FourVectors) * 16 + 256, inData );

			// synchro
			COMPUTE_STAGE1_GROUP(res0, inData + 0);
			COMPUTE_STAGE1_GROUP(res1, inData + 1);
			COMPUTE_STAGE1_GROUP(res2, inData + 2);
			COMPUTE_STAGE1_GROUP(res3, inData + 3);

			// pre-zero the three cache lines we'll overwrite
			// in the next iteration
			__dcbz128( 384, outData );
			__dcbz128( 512, outData );
			__dcbz128( 640, outData );


			COMPUTE_STAGE2_GROUP(res0, inData + 0);
			COMPUTE_STAGE1_GROUP(res4, inData + 4);
			COMPUTE_STAGE2_GROUP(res1, inData + 1);
			COMPUTE_STAGE1_GROUP(res5, inData + 5);
			COMPUTE_STAGE2_GROUP(res2, inData + 2);
			COMPUTE_STAGE1_GROUP(res6, inData + 6);
			COMPUTE_STAGE2_GROUP(res3, inData + 3);
			COMPUTE_STAGE1_GROUP(res7, inData + 7);

			COMPUTE_STAGE3_GROUP(res0, inData + 0);
			COMPUTE_STAGE2_GROUP(res4, inData + 4);
			COMPUTE_STAGE3_GROUP(res1, inData + 1);
			COMPUTE_STAGE2_GROUP(res5, inData + 5);
			COMPUTE_STAGE3_GROUP(res2, inData + 2);
			COMPUTE_STAGE2_GROUP(res6, inData + 6);
			COMPUTE_STAGE3_GROUP(res3, inData + 3);
			COMPUTE_STAGE2_GROUP(res7, inData + 7);

			COMPUTE_STAGE3_GROUP(res4, inData + 4);
			WRITE_GROUP( outData + 0, res0 );
			COMPUTE_STAGE3_GROUP(res5, inData + 5);
			WRITE_GROUP( outData + 1, res1 );
			COMPUTE_STAGE3_GROUP(res6, inData + 6);
			WRITE_GROUP( outData + 2, res2 );
			COMPUTE_STAGE3_GROUP(res7, inData + 7);
			WRITE_GROUP( outData + 3, res3 );


			WRITE_GROUP( outData + 4, res4 );
			WRITE_GROUP( outData + 5, res5 );
			WRITE_GROUP( outData + 6, res6 );
			WRITE_GROUP( outData + 7, res7 );

			inData += 8;
			outData += 8;
		}
	}
	else if (numVectors == 16)
	{
		// zero out the exactly six cache lines we will write into
		__dcbz128( 0, outData );
		__dcbz128( 128, outData );
		__dcbz128( 256, outData );
		__dcbz128( 384, outData );
		__dcbz128( 512, outData );
		__dcbz128( 640, outData );
	}
	else if (numVectors == 8)
	{
		// zero out the exactly three cache lines we will write into
		__dcbz128( 0, outData );
		__dcbz128( 128, outData );
		__dcbz128( 256, outData );
	}
	else
	{
		// numVectors is a non-zero multiple of eight by this point.
		AssertMsg(false, "Can't happen!");
	}

	// Deal with the ultimate two groups (or, if we were fed sixteen or fewer
	// vectors, the whole shebang). The bound is the true end of the input:
	// the main loop above deliberately stopped sixteen elements short.
	STOP = pVectors + numVectors;


	// Same eight-way schedule as the main loop, minus the prefetch and the
	// zero-flush, both of which would reach past the end of the buffers here.
	while (inData < STOP)
	{
		// synchro
		COMPUTE_STAGE1_GROUP(res0, inData + 0);
		COMPUTE_STAGE1_GROUP(res1, inData + 1);
		COMPUTE_STAGE1_GROUP(res2, inData + 2);
		COMPUTE_STAGE1_GROUP(res3, inData + 3);

		COMPUTE_STAGE2_GROUP(res0, inData + 0);
		COMPUTE_STAGE1_GROUP(res4, inData + 4);
		COMPUTE_STAGE2_GROUP(res1, inData + 1);
		COMPUTE_STAGE1_GROUP(res5, inData + 5);
		COMPUTE_STAGE2_GROUP(res2, inData + 2);
		COMPUTE_STAGE1_GROUP(res6, inData + 6);
		COMPUTE_STAGE2_GROUP(res3, inData + 3);
		COMPUTE_STAGE1_GROUP(res7, inData + 7);

		COMPUTE_STAGE3_GROUP(res0, inData + 0);
		COMPUTE_STAGE2_GROUP(res4, inData + 4);
		COMPUTE_STAGE3_GROUP(res1, inData + 1);
		COMPUTE_STAGE2_GROUP(res5, inData + 5);
		COMPUTE_STAGE3_GROUP(res2, inData + 2);
		COMPUTE_STAGE2_GROUP(res6, inData + 6);
		COMPUTE_STAGE3_GROUP(res3, inData + 3);
		COMPUTE_STAGE2_GROUP(res7, inData + 7);

		COMPUTE_STAGE3_GROUP(res4, inData + 4);
		WRITE_GROUP( outData + 0, res0 );
		COMPUTE_STAGE3_GROUP(res5, inData + 5);
		WRITE_GROUP( outData + 1, res1 );
		COMPUTE_STAGE3_GROUP(res6, inData + 6);
		WRITE_GROUP( outData + 2, res2 );
		COMPUTE_STAGE3_GROUP(res7, inData + 7);
		WRITE_GROUP( outData + 3, res3 );


		WRITE_GROUP( outData + 4, res4 );
		WRITE_GROUP( outData + 5, res5 );
		WRITE_GROUP( outData + 6, res6 );
		WRITE_GROUP( outData + 7, res7 );

		inData += 8;
		outData += 8;
	}


#undef COMPUTE_STAGE1_ROW
#undef COMPUTE_STAGE2_ROW
#undef COMPUTE_STAGE3_ROW
#undef COMPUTE_STAGE1_GROUP
#undef COMPUTE_STAGE2_GROUP
#undef COMPUTE_STAGE3_GROUP
#undef WRITE_GROUP
}
#endif

// Transform a long array of FourVectors by a given matrix.
-void FourVectors::TransformManyBy(FourVectors * RESTRICT pVectors, unsigned int numVectors, const matrix3x4_t& rotationMatrix, FourVectors * RESTRICT pOut ) -{ - Assert(numVectors > 0); - - AssertMsg( (pOut < pVectors && pOut+numVectors <= pVectors) || - (pOut > pVectors && pVectors+numVectors <= pOut), "FourVectors::TransformManyBy called with overlapping buffer pointers." ); - -#ifdef _X360 - // The really fast version of this function likes to operate on blocks of eight. So, chug through - // groups of eight, then deal with any leftovers. - int numVectorsRoundedToNearestEight = numVectors & (~0x07); - if (numVectors >= 8) - { - // aligned? - if ((reinterpret_cast(pVectors) & 127) == 0 && (reinterpret_cast(pOut) & 127) == 0) - { - FourVectors_TransformManyGroupsOfEightBy_128byteAligned(pVectors, numVectorsRoundedToNearestEight, rotationMatrix, pOut); - } - else - { - FourVectors_TransformManyGroupsOfEightBy(pVectors, numVectorsRoundedToNearestEight, rotationMatrix, pOut); - } - numVectors -= numVectorsRoundedToNearestEight; - pVectors += numVectorsRoundedToNearestEight; - pOut += numVectorsRoundedToNearestEight; - } -#endif - - // any left over? - if (numVectors > 0) - { - - // Splat out each of the entries in the matrix to a fltx4. Do this - // in the order that we will need them, to hide latency. I'm - // avoiding making an array of them, so that they'll remain in - // registers. - fltx4 matSplat00, matSplat01, matSplat02, matSplat03, // TWELVE REGISTERS - matSplat10, matSplat11, matSplat12, matSplat13, - matSplat20, matSplat21, matSplat22, matSplat23; - - { - // Load the matrix into local vectors. Sadly, matrix3x4_ts are - // often unaligned. The w components will be the transpose row of - // the matrix. 
- fltx4 matCol0 = LoadUnalignedSIMD(rotationMatrix[0]); - fltx4 matCol1 = LoadUnalignedSIMD(rotationMatrix[1]); - fltx4 matCol2 = LoadUnalignedSIMD(rotationMatrix[2]); - - matSplat00 = SplatXSIMD(matCol0); - matSplat01 = SplatYSIMD(matCol0); - matSplat02 = SplatZSIMD(matCol0); - matSplat03 = SplatWSIMD(matCol0); - - matSplat10 = SplatXSIMD(matCol1); - matSplat11 = SplatYSIMD(matCol1); - matSplat12 = SplatZSIMD(matCol1); - matSplat13 = SplatWSIMD(matCol1); - - matSplat20 = SplatXSIMD(matCol2); - matSplat21 = SplatYSIMD(matCol2); - matSplat22 = SplatZSIMD(matCol2); - matSplat23 = SplatWSIMD(matCol2); - } - - do - { - // Trust in the compiler to schedule these operations correctly: - pOut->x = MaddSIMD(pVectors->z, matSplat02, MaddSIMD(pVectors->y, matSplat01, MaddSIMD(pVectors->x, matSplat00, matSplat03))); - pOut->y = MaddSIMD(pVectors->z, matSplat12, MaddSIMD(pVectors->y, matSplat11, MaddSIMD(pVectors->x, matSplat00, matSplat13))); - pOut->z = MaddSIMD(pVectors->z, matSplat22, MaddSIMD(pVectors->y, matSplat21, MaddSIMD(pVectors->x, matSplat00, matSplat23))); - - ++pOut; - ++pVectors; - --numVectors; - } while(numVectors > 0); - } -} - -#ifdef _X360 -// Loop-scheduled code to process FourVectors in groups of eight quite efficiently. -static void FourVectors_TransformManyGroupsOfEightBy_InPlace(FourVectors * RESTRICT pVectors, unsigned int numVectors, const matrix3x4_t& rotationMatrix ) -{ - Assert(numVectors > 0); - if ( numVectors == 0 ) - return; - - // Prefetch line 1 and 2 - __dcbt(0,pVectors); - __dcbt(128,pVectors); - - // Splat out each of the entries in the matrix to a fltx4. Do this - // in the order that we will need them, to hide latency. I'm - // avoiding making an array of them, so that they'll remain in - // registers. - fltx4 matSplat00, matSplat01, matSplat02, matSplat03, // TWELVE REGISTERS - matSplat10, matSplat11, matSplat12, matSplat13, - matSplat20, matSplat21, matSplat22, matSplat23; - - { - // Load the matrix into local vectors. 
Sadly, matrix3x4_ts are - // often unaligned. The w components will be the tranpose row of - // the matrix. - fltx4 matCol0 = LoadUnalignedSIMD(rotationMatrix[0]); - fltx4 matCol1 = LoadUnalignedSIMD(rotationMatrix[1]); - fltx4 matCol2 = LoadUnalignedSIMD(rotationMatrix[2]); - - matSplat00 = SplatXSIMD(matCol0); - matSplat01 = SplatYSIMD(matCol0); - matSplat02 = SplatZSIMD(matCol0); - matSplat03 = SplatWSIMD(matCol0); - - matSplat10 = SplatXSIMD(matCol1); - matSplat11 = SplatYSIMD(matCol1); - matSplat12 = SplatZSIMD(matCol1); - matSplat13 = SplatWSIMD(matCol1); - - matSplat20 = SplatXSIMD(matCol2); - matSplat21 = SplatYSIMD(matCol2); - matSplat22 = SplatZSIMD(matCol2); - matSplat23 = SplatWSIMD(matCol2); - } - - // this macro defines how to compute a specific row from an input and certain splat columns -#define COMPUTE(res, invec, xterm, yterm, zterm, transterm) res = AddSIMD( AddSIMD( MulSIMD((invec)->z, zterm), AddSIMD( MulSIMD( (invec)->x, xterm ), MulSIMD( (invec)->y, yterm ) ) ), transterm ) -#define WRITE(term, reg, toptr) toptr->term = reg - - // define result groups (we're going to have an eight-way unroll) - - fltx4 res0X, res0Y, res0Z, res0XTemp, res0YTemp, res0ZTemp; // 48 REGISTERS - fltx4 res1X, res1Y, res1Z, res1XTemp, res1YTemp, res1ZTemp; - fltx4 res2X, res2Y, res2Z, res2XTemp, res2YTemp, res2ZTemp; - fltx4 res3X, res3Y, res3Z, res3XTemp, res3YTemp, res3ZTemp; - fltx4 res4X, res4Y, res4Z, res4XTemp, res4YTemp, res4ZTemp; - fltx4 res5X, res5Y, res5Z, res5XTemp, res5YTemp, res5ZTemp; - fltx4 res6X, res6Y, res6Z, res6XTemp, res6YTemp, res6ZTemp; - fltx4 res7X, res7Y, res7Z, res7XTemp, res7YTemp, res7ZTemp; - - - // #define FROZ(out,in,offset) COMPUTE((out+offset)->x, (in + offset), matSplat00, matSplat01, matSplat02, matSplat03); COMPUTE((out + offset )->y, (in + offset), matSplat10, matSplat11, matSplat12, matSplat13); COMPUTE((out + offset)->z, (in + offset), matSplat20, matSplat21, matSplat22, matSplat23) -#define COMPUTE_GROUP(resgroup,dataptr) 
COMPUTE(resgroup ## X, (dataptr), matSplat00, matSplat01, matSplat02, matSplat03); COMPUTE(resgroup ## Y, (dataptr), matSplat10, matSplat11, matSplat12, matSplat13); COMPUTE(resgroup ## Z, (dataptr), matSplat20, matSplat21, matSplat22, matSplat23) -#define WRITE_GROUP(ptr, resgroup) (ptr)->x = resgroup ## X; (ptr)->y = resgroup ## Y; (ptr)->z = resgroup ## Z - - /* - // stage 1 -- 6 ops for xyz, each w 12 cycle latency - res0X = MulSIMD( (invec)->y, matSplat01 ); - res0Temp = MaddSIMD((invec)->z, matSplat02, matSplat03); - // stage 2 -- 3 clocks for xyz - res0X = MaddSIMD( (invec)->x, matSplat00, res0X ); - // stage 3 -- 3 clocks for xyz - res0X = AddSIMD(res0X, res0Temp); - */ -#define COMPUTE_STAGE1_ROW(res, tempvar, invec, xsplat, ysplat, zsplat, transplat) res = MulSIMD( (invec)->y, ysplat ); tempvar = MaddSIMD((invec)->z, zsplat, transplat) -#define COMPUTE_STAGE2_ROW(res, tempvar, invec, xsplat, ysplat, zsplat, transplat) res = MaddSIMD( (invec)->x, xsplat, res ) -#define COMPUTE_STAGE3_ROW(res, tempvar, invec, xsplat, ysplat, zsplat, transplat) res = AddSIMD(res, tempvar) // frees up the tempvar - -#define COMPUTE_STAGE1_GROUP(resgroup, invec) COMPUTE_STAGE1_ROW(resgroup ## X, resgroup ## X ## Temp, invec, matSplat00, matSplat01, matSplat02, matSplat03);\ - COMPUTE_STAGE1_ROW(resgroup ## Y, resgroup ## Y ## Temp, invec, matSplat10, matSplat11, matSplat12, matSplat13);\ - COMPUTE_STAGE1_ROW(resgroup ## Z, resgroup ## Z ## Temp, invec, matSplat20, matSplat21, matSplat22, matSplat23) - -#define COMPUTE_STAGE2_GROUP(resgroup, invec) COMPUTE_STAGE2_ROW(resgroup ## X, resgroup ## X ## Temp, invec, matSplat00, matSplat01, matSplat02, matSplat03);\ - COMPUTE_STAGE2_ROW(resgroup ## Y, resgroup ## Y ## Temp, invec, matSplat10, matSplat11, matSplat12, matSplat13);\ - COMPUTE_STAGE2_ROW(resgroup ## Z, resgroup ## Z ## Temp, invec, matSplat20, matSplat21, matSplat22, matSplat23) - -#define COMPUTE_STAGE3_GROUP(resgroup, invec) COMPUTE_STAGE3_ROW(resgroup ## X, resgroup 
## X ## Temp, invec, matSplat00, matSplat01, matSplat02, matSplat03);\ - COMPUTE_STAGE3_ROW(resgroup ## Y, resgroup ## Y ## Temp, invec, matSplat10, matSplat11, matSplat12, matSplat13);\ - COMPUTE_STAGE3_ROW(resgroup ## Z, resgroup ## Z ## Temp, invec, matSplat20, matSplat21, matSplat22, matSplat23) - - const FourVectors * const RESTRICT STOP = pVectors + numVectors; - - // Use techniques of loop scheduling to eliminate data hazards; process - // eight groups simultaneously so that we never have any operations stalling - // waiting for data. - // Note: this loop, while pretty fast, could be faster still -- you'll notice - // that it does all of its loads, then all computation, then writes everything - // out. If made truly cyclic, such that every line interleaved a stage 1, stage 2, - // stage 3, and write, then throughput could be higher (probably by about 50%). - while (pVectors < STOP) - { - // start prefetching the three cache lines - // we'll hit two iterations from now - __dcbt( sizeof(FourVectors) * 16, pVectors ); - __dcbt( sizeof(FourVectors) * 16 + 128, pVectors ); - __dcbt( sizeof(FourVectors) * 16 + 256, pVectors ); - - // synchro - COMPUTE_STAGE1_GROUP(res0, pVectors + 0); - COMPUTE_STAGE1_GROUP(res1, pVectors + 1); - COMPUTE_STAGE1_GROUP(res2, pVectors + 2); - COMPUTE_STAGE1_GROUP(res3, pVectors + 3); - - COMPUTE_STAGE2_GROUP(res0, pVectors + 0); - COMPUTE_STAGE1_GROUP(res4, pVectors + 4); - COMPUTE_STAGE2_GROUP(res1, pVectors + 1); - COMPUTE_STAGE1_GROUP(res5, pVectors + 5); - COMPUTE_STAGE2_GROUP(res2, pVectors + 2); - COMPUTE_STAGE1_GROUP(res6, pVectors + 6); - COMPUTE_STAGE2_GROUP(res3, pVectors + 3); - COMPUTE_STAGE1_GROUP(res7, pVectors + 7); - - COMPUTE_STAGE3_GROUP(res0, pVectors + 0); - COMPUTE_STAGE2_GROUP(res4, pVectors + 4); - COMPUTE_STAGE3_GROUP(res1, pVectors + 1); - COMPUTE_STAGE2_GROUP(res5, pVectors + 5); - COMPUTE_STAGE3_GROUP(res2, pVectors + 2); - COMPUTE_STAGE2_GROUP(res6, pVectors + 6); - COMPUTE_STAGE3_GROUP(res3, pVectors + 
3); - COMPUTE_STAGE2_GROUP(res7, pVectors + 7); - - COMPUTE_STAGE3_GROUP(res4, pVectors + 4); - WRITE_GROUP( pVectors + 0, res0 ); - COMPUTE_STAGE3_GROUP(res5, pVectors + 5); - WRITE_GROUP( pVectors + 1, res1 ); - COMPUTE_STAGE3_GROUP(res6, pVectors + 6); - WRITE_GROUP( pVectors + 2, res2 ); - COMPUTE_STAGE3_GROUP(res7, pVectors + 7); - WRITE_GROUP( pVectors + 3, res3 ); - - WRITE_GROUP( pVectors + 4, res4 ); - WRITE_GROUP( pVectors + 5, res5 ); - WRITE_GROUP( pVectors + 6, res6 ); - WRITE_GROUP( pVectors + 7, res7 ); - - pVectors += 8; - } - - -#undef COMPUTE -#undef WRITE -#undef COMPUTE_STAGE1_ROW -#undef COMPUTE_STAGE2_ROW -#undef COMPUTE_STAGE3_ROW -#undef COMPUTE_STAGE1_GROUP -#undef COMPUTE_STAGE2_GROUP -#undef COMPUTE_STAGE3_GROUP -#undef COMPUTE_GROUP -#undef WRITE_GROUP -} -#endif - -// In-place version of above. It's necessary to have this, rather than just allowing pOut and pVectors -// to equal each other, because of the semantics of RESTRICT: pVectors and pOut must not be allowed -// to alias. (Simply un-restricting the pointers results in very poor scheduling.) -void FourVectors::TransformManyBy(FourVectors * RESTRICT pVectors, unsigned int numVectors, const matrix3x4_t& rotationMatrix ) -{ - Assert(numVectors > 0); - -#ifdef _X360 - // The really fast version of this function likes to operate on blocks of eight. So, chug through - // groups of eight, then deal with any leftovers. - int numVectorsRoundedToNearestEight = numVectors & (~0x07); - if (numVectors >= 8) - { - FourVectors_TransformManyGroupsOfEightBy_InPlace(pVectors, numVectorsRoundedToNearestEight, rotationMatrix); - numVectors -= numVectorsRoundedToNearestEight; - pVectors += numVectorsRoundedToNearestEight; - } -#endif - - // any left over? - if (numVectors > 0) - { - - // Splat out each of the entries in the matrix to a fltx4. Do this - // in the order that we will need them, to hide latency. I'm - // avoiding making an array of them, so that they'll remain in - // registers. 
- fltx4 matSplat00, matSplat01, matSplat02, matSplat03, // TWELVE REGISTERS - matSplat10, matSplat11, matSplat12, matSplat13, - matSplat20, matSplat21, matSplat22, matSplat23; - - { - // Load the matrix into local vectors. Sadly, matrix3x4_ts are - // often unaligned. The w components will be the transpose row of - // the matrix. - fltx4 matCol0 = LoadUnalignedSIMD(rotationMatrix[0]); - fltx4 matCol1 = LoadUnalignedSIMD(rotationMatrix[1]); - fltx4 matCol2 = LoadUnalignedSIMD(rotationMatrix[2]); - - matSplat00 = SplatXSIMD(matCol0); - matSplat01 = SplatYSIMD(matCol0); - matSplat02 = SplatZSIMD(matCol0); - matSplat03 = SplatWSIMD(matCol0); - - matSplat10 = SplatXSIMD(matCol1); - matSplat11 = SplatYSIMD(matCol1); - matSplat12 = SplatZSIMD(matCol1); - matSplat13 = SplatWSIMD(matCol1); - - matSplat20 = SplatXSIMD(matCol2); - matSplat21 = SplatYSIMD(matCol2); - matSplat22 = SplatZSIMD(matCol2); - matSplat23 = SplatWSIMD(matCol2); - } - - do - { - fltx4 resultX, resultY, resultZ; - // Trust in the compiler to schedule these operations correctly: - resultX = MaddSIMD(pVectors->z, matSplat02, MaddSIMD(pVectors->y, matSplat01, MaddSIMD(pVectors->x, matSplat00, matSplat03))); - resultY = MaddSIMD(pVectors->z, matSplat12, MaddSIMD(pVectors->y, matSplat11, MaddSIMD(pVectors->x, matSplat00, matSplat13))); - resultZ = MaddSIMD(pVectors->z, matSplat22, MaddSIMD(pVectors->y, matSplat21, MaddSIMD(pVectors->x, matSplat00, matSplat23))); - - pVectors->x = resultX; - pVectors->y = resultY; - pVectors->z = resultZ; - - ++pVectors; - --numVectors; - } while(numVectors > 0); - } -} - - -#endif - -// Transform many (horizontal) points in-place by a 3x4 matrix, -// here already loaded onto three fltx4 registers but not transposed. -// The points must be stored as 16-byte aligned. They are points -// and not vectors because we assume the w-component to be 1. 
-#ifdef _X360 -void TransformManyPointsBy(VectorAligned * RESTRICT pVectors, unsigned int numVectors, FLTX4 mRow0, FLTX4 mRow1, FLTX4 mRow2) -{ - /************************************************** - * Here is an elaborate and carefully scheduled * - * algorithm nicked from xboxmath.inl and hacked * - * up for 3x4 matrices. * - **************************************************/ - - COMPILE_TIME_ASSERT(sizeof(VectorAligned) == sizeof(XMFLOAT4)); // VectorAligned's need to be 16 bytes - - XMVECTOR R0[8], R1[8], R2[8]; - XMVECTOR vIn[8]; - - // C_ASSERT(UnrollCount == 8); - // C_ASSERT(sizeof(XMFLOAT4) == 16); - Assert(pVectors); - Assert(((UINT_PTR)pVectors & 3) == 0); // assert alignment - - UINT GroupIndex; - - VectorAligned * RESTRICT vCurrent = pVectors; - // sentinel pointers - VectorAligned * vStreamEnd, *vStreamGroupBase, *vStreamGroupEnd; - - { - // cook up the pointers from integer math. Necessary because otherwise we LHS all over - // the place. (Odd that this doesn't happen to the xbox math.) 
- - UINT_PTR InputVector = (UINT_PTR)pVectors; - UINT_PTR InputStreamEnd = InputVector + numVectors * sizeof(XMFLOAT4); - // compute start and end points on 128-byte alignment - UINT_PTR InputStreamCGroupBase = XMMin(InputVector + (XM_CACHE_LINE_SIZE - 1), InputStreamEnd) & ~(XM_CACHE_LINE_SIZE - 1); - UINT_PTR InputStreamCGroupEnd = InputStreamCGroupBase + ((InputStreamEnd - InputStreamCGroupBase) & ~(4 * XM_CACHE_LINE_SIZE - 1)); - - vStreamEnd = (VectorAligned *)InputStreamEnd; - vStreamGroupBase = (VectorAligned *)InputStreamCGroupBase; - vStreamGroupEnd = (VectorAligned *)InputStreamCGroupEnd; - } - - - __dcbt(0, vStreamGroupBase); - __dcbt(XM_CACHE_LINE_SIZE, vStreamGroupBase); - __dcbt(XM_CACHE_LINE_SIZE * 2, vStreamGroupBase); - __dcbt(XM_CACHE_LINE_SIZE * 3, vStreamGroupBase); - - while (vCurrent < vStreamGroupBase) - { - fltx4 vec = __lvx(vCurrent->Base(), 0); - - R0[0] = __vmsum4fp(vec, mRow0); - R1[0] = __vmsum4fp(vec, mRow1); - R2[0] = __vmsum4fp(vec, mRow2); - - __stvewx(R0[0], vCurrent->Base(), 0); - __stvewx(R1[0], vCurrent->Base(), 4); - __stvewx(R2[0], vCurrent->Base(), 8); - - vCurrent++; - } - - while (vCurrent < vStreamGroupEnd) - { - __dcbt(XM_CACHE_LINE_SIZE * 4, vCurrent); - __dcbt(XM_CACHE_LINE_SIZE * 5, vCurrent); - __dcbt(XM_CACHE_LINE_SIZE * 6, vCurrent); - __dcbt(XM_CACHE_LINE_SIZE * 7, vCurrent); - - for (GroupIndex = 0; GroupIndex < 4; GroupIndex++) - { - // all kinds of LHS on this pointer. Why? 
- VectorAligned* OutputVector = vCurrent; - - vIn[0] = __lvx(vCurrent->Base(), 0); - vCurrent++; - vIn[1] = __lvx(vCurrent->Base(), 0); - vCurrent++; - vIn[2] = __lvx(vCurrent->Base(), 0); - vCurrent++; - vIn[3] = __lvx(vCurrent->Base(), 0); - vCurrent++; - vIn[4] = __lvx(vCurrent->Base(), 0); - vCurrent++; - vIn[5] = __lvx(vCurrent->Base(), 0); - vCurrent++; - vIn[6] = __lvx(vCurrent->Base(), 0); - vCurrent++; - vIn[7] = __lvx(vCurrent->Base(), 0); - vCurrent++; - - R0[0] = __vmsum4fp(vIn[0], mRow0); - R1[0] = __vmsum4fp(vIn[0], mRow1); - R2[0] = __vmsum4fp(vIn[0], mRow2); - - R0[1] = __vmsum4fp(vIn[1], mRow0); - R1[1] = __vmsum4fp(vIn[1], mRow1); - R2[1] = __vmsum4fp(vIn[1], mRow2); - - R0[2] = __vmsum4fp(vIn[2], mRow0); - R1[2] = __vmsum4fp(vIn[2], mRow1); - R2[2] = __vmsum4fp(vIn[2], mRow2); - - R0[3] = __vmsum4fp(vIn[3], mRow0); - R1[3] = __vmsum4fp(vIn[3], mRow1); - R2[3] = __vmsum4fp(vIn[3], mRow2); - - R0[4] = __vmsum4fp(vIn[4], mRow0); - R1[4] = __vmsum4fp(vIn[4], mRow1); - R2[4] = __vmsum4fp(vIn[4], mRow2); - - R0[5] = __vmsum4fp(vIn[5], mRow0); - R1[5] = __vmsum4fp(vIn[5], mRow1); - R2[5] = __vmsum4fp(vIn[5], mRow2); - - R0[6] = __vmsum4fp(vIn[6], mRow0); - R1[6] = __vmsum4fp(vIn[6], mRow1); - R2[6] = __vmsum4fp(vIn[6], mRow2); - - R0[7] = __vmsum4fp(vIn[7], mRow0); - R1[7] = __vmsum4fp(vIn[7], mRow1); - R2[7] = __vmsum4fp(vIn[7], mRow2); - - __stvewx(R0[0], OutputVector, 0); - __stvewx(R1[0], OutputVector, 4); - __stvewx(R2[0], OutputVector, 8); - OutputVector++; - - __stvewx(R0[1], OutputVector, 0); - __stvewx(R1[1], OutputVector, 4); - __stvewx(R2[1], OutputVector, 8); - OutputVector++; - - __stvewx(R0[2], OutputVector, 0); - __stvewx(R1[2], OutputVector, 4); - __stvewx(R2[2], OutputVector, 8); - OutputVector++; - - __stvewx(R0[3], OutputVector, 0); - __stvewx(R1[3], OutputVector, 4); - __stvewx(R2[3], OutputVector, 8); - OutputVector++; - - __stvewx(R0[4], OutputVector, 0); - __stvewx(R1[4], OutputVector, 4); - __stvewx(R2[4], OutputVector, 8); - 
OutputVector++; - - __stvewx(R0[5], OutputVector, 0); - __stvewx(R1[5], OutputVector, 4); - __stvewx(R2[5], OutputVector, 8); - OutputVector++; - - __stvewx(R0[6], OutputVector, 0); - __stvewx(R1[6], OutputVector, 4); - __stvewx(R2[6], OutputVector, 8); - OutputVector++; - - __stvewx(R0[7], OutputVector, 0); - __stvewx(R1[7], OutputVector, 4); - __stvewx(R2[7], OutputVector, 8); - OutputVector++; - } - } - - while (vCurrent < vStreamEnd) - { - vIn[0] = __lvx(vCurrent->Base(), 0); - - R0[0] = __vmsum4fp(vIn[0], mRow0); - R1[0] = __vmsum4fp(vIn[0], mRow1); - R2[0] = __vmsum4fp(vIn[0], mRow2); - - __stvewx(R0[0], vCurrent->Base(), 0); - __stvewx(R1[0], vCurrent->Base(), 4); - __stvewx(R2[0], vCurrent->Base(), 8); - - vCurrent++; - } - - -} -#endif +//========= Copyright Valve Corporation, All rights reserved. ============// +// +// Purpose: +// +//===========================================================================// + +#include "mathlib/ssemath.h" +#include "mathlib/ssequaternion.h" + +const fltx4 Four_PointFives={0.5,0.5,0.5,0.5}; +#ifndef _X360 +const fltx4 Four_Zeros={0.0,0.0,0.0,0.0}; +const fltx4 Four_Ones={1.0,1.0,1.0,1.0}; +#endif +const fltx4 Four_Twos={2.0,2.0,2.0,2.0}; +const fltx4 Four_Threes={3.0,3.0,3.0,3.0}; +const fltx4 Four_Fours={4.0,4.0,4.0,4.0}; +const fltx4 Four_Origin={0,0,0,1}; +const fltx4 Four_NegativeOnes={-1,-1,-1,-1}; + +const fltx4 Four_2ToThe21s={ (float) (1<<21), (float) (1<<21), (float) (1<<21), (float)(1<<21) }; +const fltx4 Four_2ToThe22s={ (float) (1<<22), (float) (1<<22), (float) (1<<22), (float)(1<<22) }; +const fltx4 Four_2ToThe23s={ (float) (1<<23), (float) (1<<23), (float) (1<<23), (float)(1<<23) }; +const fltx4 Four_2ToThe24s={ (float) (1<<24), (float) (1<<24), (float) (1<<24), (float)(1<<24) }; + +const fltx4 Four_Point225s={ .225, .225, .225, .225 }; +const fltx4 Four_Epsilons={FLT_EPSILON,FLT_EPSILON,FLT_EPSILON,FLT_EPSILON}; + +const fltx4 Four_FLT_MAX={FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX}; +const fltx4 
Four_Negative_FLT_MAX={-FLT_MAX,-FLT_MAX,-FLT_MAX,-FLT_MAX}; +const fltx4 g_SIMD_0123 = { 0., 1., 2., 3. }; + +const fltx4 g_QuatMultRowSign[4] = +{ + { 1.0f, 1.0f, -1.0f, 1.0f }, + { -1.0f, 1.0f, 1.0f, 1.0f }, + { 1.0f, -1.0f, 1.0f, 1.0f }, + { -1.0f, -1.0f, -1.0f, 1.0f } +}; + +const int32 ALIGN16 g_SIMD_clear_signmask[4] ALIGN16_POST = {0x7fffffff,0x7fffffff,0x7fffffff,0x7fffffff}; +const int32 ALIGN16 g_SIMD_signmask[4] ALIGN16_POST = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 }; +const int32 ALIGN16 g_SIMD_lsbmask[4] ALIGN16_POST = { 0xfffffffe, 0xfffffffe, 0xfffffffe, 0xfffffffe }; +const int32 ALIGN16 g_SIMD_clear_wmask[4] ALIGN16_POST = { 0xffffffff, 0xffffffff, 0xffffffff, 0 }; +const int32 ALIGN16 g_SIMD_AllOnesMask[4] ALIGN16_POST = { 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff }; // ~0,~0,~0,~0 +const int32 ALIGN16 g_SIMD_Low16BitsMask[4] ALIGN16_POST = { 0xffff, 0xffff, 0xffff, 0xffff }; // 0xffff x 4 + +const int32 ALIGN16 g_SIMD_ComponentMask[4][4] ALIGN16_POST = +{ + { 0xFFFFFFFF, 0, 0, 0 }, { 0, 0xFFFFFFFF, 0, 0 }, { 0, 0, 0xFFFFFFFF, 0 }, { 0, 0, 0, 0xFFFFFFFF } +}; + +const int32 ALIGN16 g_SIMD_SkipTailMask[4][4] ALIGN16_POST = +{ + { 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff }, + { 0xffffffff, 0x00000000, 0x00000000, 0x00000000 }, + { 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 }, + { 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000 }, +}; + + + // FUNCTIONS + // NOTE: WHY YOU **DO NOT** WANT TO PUT FUNCTIONS HERE +// Generally speaking, you want to make sure SIMD math functions +// are inlined, because that gives the compiler much more latitude +// in instruction scheduling. It's not that the overhead of calling +// the function is particularly great; rather, many of the SIMD +// opcodes have long latencies, and if you have a sequence of +// several dependent ones inside a function call, the latencies +// stack up to create a big penalty. 
If the function is inlined, +// the compiler can interleave its operations with ones from the +// caller to better hide those latencies. Finally, on the 360, +// putting parameters or return values on the stack, and then +// reading them back within the next forty cycles, is a very +// severe penalty. So, as much as possible, you want to leave your +// data on the registers. + +// That said, there are certain occasions where it is appropriate +// to call into functions -- particularly for very large blocks +// of code that will spill most of the registers anyway. Unless your +// function is more than one screen long, yours is probably not one +// of those occasions. + + + +/// You can use this to rotate a long array of FourVectors all by the same +/// matrix. The first parameter is the head of the array. The second is the +/// number of vectors to rotate. The third is the matrix. +void FourVectors::RotateManyBy(FourVectors * RESTRICT pVectors, unsigned int numVectors, const matrix3x4_t& rotationMatrix ) +{ + Assert(numVectors > 0); + if ( numVectors == 0 ) + return; + + // Splat out each of the entries in the matrix to a fltx4. Do this + // in the order that we will need them, to hide latency. I'm + // avoiding making an array of them, so that they'll remain in + // registers. + fltx4 matSplat00, matSplat01, matSplat02, + matSplat10, matSplat11, matSplat12, + matSplat20, matSplat21, matSplat22; + + { + // Load the matrix into local vectors. Sadly, matrix3x4_ts are + // often unaligned. The w components will be the tranpose row of + // the matrix, but we don't really care about that. 
+ fltx4 matCol0 = LoadUnalignedSIMD(rotationMatrix[0]); + fltx4 matCol1 = LoadUnalignedSIMD(rotationMatrix[1]); + fltx4 matCol2 = LoadUnalignedSIMD(rotationMatrix[2]); + + matSplat00 = SplatXSIMD(matCol0); + matSplat01 = SplatYSIMD(matCol0); + matSplat02 = SplatZSIMD(matCol0); + + matSplat10 = SplatXSIMD(matCol1); + matSplat11 = SplatYSIMD(matCol1); + matSplat12 = SplatZSIMD(matCol1); + + matSplat20 = SplatXSIMD(matCol2); + matSplat21 = SplatYSIMD(matCol2); + matSplat22 = SplatZSIMD(matCol2); + } + +#ifdef _X360 + // Same algorithm as above, but the loop is unrolled to eliminate data hazard latencies + // and simplify prefetching. Named variables are deliberately used instead of arrays to + // ensure that the variables live on the registers instead of the stack (stack load/store + // is a serious penalty on 360). Nb: for prefetching to be most efficient here, the + // loop should be unrolled to 8 FourVectors per iteration; because each FourVectors is + // 48 bytes long, 48 * 8 = 384, its least common multiple with the 128-byte cache line. + // That way you can fetch the next 3 cache lines while you work on these three. + // If you do go this route, be sure to dissassemble and make sure it doesn't spill + // registers to stack as you do this; the cost of that will be excessive. Unroll the loop + // a little and just live with the fact that you'll be doing a couple of redundant dbcts + // (they don't cost you anything). Be aware that all three cores share L2 and it can only + // have eight cache lines fetching at a time. + fltx4 outX0, outY0, outZ0; // bank one of outputs + fltx4 outX1, outY1, outZ1; // bank two of outputs + + + // Because of instruction latencies and scheduling, it's actually faster to use adds and muls + // rather than madds. (Empirically determined by timing.) + const FourVectors * stop = pVectors + numVectors; + FourVectors * RESTRICT pVectNext; + // prime the pump. 
+ if (numVectors & 0x01) + { + // odd number of vectors to process + // prime the 1 group of registers + pVectNext = pVectors++; + outX1 = AddSIMD( AddSIMD( MulSIMD( pVectNext->x, matSplat00 ), MulSIMD( pVectNext->y, matSplat01 ) ), MulSIMD( pVectNext->z, matSplat02 ) ); + outY1 = AddSIMD( AddSIMD( MulSIMD( pVectNext->x, matSplat10 ), MulSIMD( pVectNext->y, matSplat11 ) ), MulSIMD( pVectNext->z, matSplat12 ) ); + outZ1 = AddSIMD( AddSIMD( MulSIMD( pVectNext->x, matSplat20 ), MulSIMD( pVectNext->y, matSplat21 ) ), MulSIMD( pVectNext->z, matSplat22 ) ); + } + else + { + // even number of total vectors to process; + // prime the zero group and jump into the middle of the loop + outX0 = AddSIMD( AddSIMD( MulSIMD( pVectors->x, matSplat00 ), MulSIMD( pVectors->y, matSplat01 ) ), MulSIMD( pVectors->z, matSplat02 ) ); + outY0 = AddSIMD( AddSIMD( MulSIMD( pVectors->x, matSplat10 ), MulSIMD( pVectors->y, matSplat11 ) ), MulSIMD( pVectors->z, matSplat12 ) ); + outZ0 = AddSIMD( AddSIMD( MulSIMD( pVectors->x, matSplat20 ), MulSIMD( pVectors->y, matSplat21 ) ), MulSIMD( pVectors->z, matSplat22 ) ); + goto EVEN_CASE; + } + + // perform an even number of iterations through this loop. 
+ while (pVectors < stop) + { + outX0 = MaddSIMD( pVectors->z, matSplat02, AddSIMD( MulSIMD( pVectors->x, matSplat00 ), MulSIMD( pVectors->y, matSplat01 ) ) ); + outY0 = MaddSIMD( pVectors->z, matSplat12, AddSIMD( MulSIMD( pVectors->x, matSplat10 ), MulSIMD( pVectors->y, matSplat11 ) ) ); + outZ0 = MaddSIMD( pVectors->z, matSplat22, AddSIMD( MulSIMD( pVectors->x, matSplat20 ), MulSIMD( pVectors->y, matSplat21 ) ) ); + + pVectNext->x = outX1; + pVectNext->y = outY1; + pVectNext->z = outZ1; + +EVEN_CASE: + pVectNext = pVectors+1; + + outX1 = MaddSIMD( pVectNext->z, matSplat02, AddSIMD( MulSIMD( pVectNext->x, matSplat00 ), MulSIMD( pVectNext->y, matSplat01 ) ) ); + outY1 = MaddSIMD( pVectNext->z, matSplat12, AddSIMD( MulSIMD( pVectNext->x, matSplat10 ), MulSIMD( pVectNext->y, matSplat11 ) ) ); + outZ1 = MaddSIMD( pVectNext->z, matSplat22, AddSIMD( MulSIMD( pVectNext->x, matSplat20 ), MulSIMD( pVectNext->y, matSplat21 ) ) ); + + pVectors->x = outX0; + pVectors->y = outY0; + pVectors->z = outZ0; + + pVectors += 2; + } + + // flush the last round of output + pVectNext->x = outX1; + pVectNext->y = outY1; + pVectNext->z = outZ1; +#else + // PC does not benefit from the unroll/scheduling above + fltx4 outX0, outY0, outZ0; // bank one of outputs + + + // Because of instruction latencies and scheduling, it's actually faster to use adds and muls + // rather than madds. (Empirically determined by timing.) + const FourVectors * stop = pVectors + numVectors; + + // perform an even number of iterations through this loop. 
+ while (pVectors < stop) + { + outX0 = MaddSIMD( pVectors->z, matSplat02, AddSIMD( MulSIMD( pVectors->x, matSplat00 ), MulSIMD( pVectors->y, matSplat01 ) ) ); + outY0 = MaddSIMD( pVectors->z, matSplat12, AddSIMD( MulSIMD( pVectors->x, matSplat10 ), MulSIMD( pVectors->y, matSplat11 ) ) ); + outZ0 = MaddSIMD( pVectors->z, matSplat22, AddSIMD( MulSIMD( pVectors->x, matSplat20 ), MulSIMD( pVectors->y, matSplat21 ) ) ); + + pVectors->x = outX0; + pVectors->y = outY0; + pVectors->z = outZ0; + pVectors++; + } +#endif +} + +#ifdef _X360 +// Loop-scheduled code to process FourVectors in groups of eight quite efficiently. +void FourVectors_TransformManyGroupsOfEightBy(FourVectors * RESTRICT pVectors, unsigned int numVectors, const matrix3x4_t& rotationMatrix, FourVectors * RESTRICT pOut ) +{ + Assert(numVectors > 0); + if ( numVectors == 0 ) + return; + + AssertMsg( (pOut < pVectors && pOut+numVectors <= pVectors) || + (pOut > pVectors && pVectors+numVectors <= pOut), "FourVectors::TransformManyBy called with overlapping buffer pointers." ); + + // Splat out each of the entries in the matrix to a fltx4. Do this + // in the order that we will need them, to hide latency. I'm + // avoiding making an array of them, so that they'll remain in + // registers. + fltx4 matSplat00, matSplat01, matSplat02, matSplat03, // TWELVE REGISTERS + matSplat10, matSplat11, matSplat12, matSplat13, + matSplat20, matSplat21, matSplat22, matSplat23; + + { + // Load the matrix into local vectors. Sadly, matrix3x4_ts are + // often unaligned. The w components will be the tranpose row of + // the matrix. 
+ fltx4 matCol0 = LoadUnalignedSIMD(rotationMatrix[0]); + fltx4 matCol1 = LoadUnalignedSIMD(rotationMatrix[1]); + fltx4 matCol2 = LoadUnalignedSIMD(rotationMatrix[2]); + + matSplat00 = SplatXSIMD(matCol0); + matSplat01 = SplatYSIMD(matCol0); + matSplat02 = SplatZSIMD(matCol0); + matSplat03 = SplatWSIMD(matCol0); + + matSplat10 = SplatXSIMD(matCol1); + matSplat11 = SplatYSIMD(matCol1); + matSplat12 = SplatZSIMD(matCol1); + matSplat13 = SplatWSIMD(matCol1); + + matSplat20 = SplatXSIMD(matCol2); + matSplat21 = SplatYSIMD(matCol2); + matSplat22 = SplatZSIMD(matCol2); + matSplat23 = SplatWSIMD(matCol2); + } + + // this macro defines how to compute a specific row from an input and certain splat columns +#define COMPUTE(res, invec, xterm, yterm, zterm, transterm) res = AddSIMD( AddSIMD( MulSIMD((invec)->z, zterm), AddSIMD( MulSIMD( (invec)->x, xterm ), MulSIMD( (invec)->y, yterm ) ) ), transterm ) +#define WRITE(term, reg, toptr) toptr->term = reg + + // define result groups (we're going to have an eight-way unroll) + + fltx4 res0X, res0Y, res0Z, res0XTemp, res0YTemp, res0ZTemp; // 48 REGISTERS + fltx4 res1X, res1Y, res1Z, res1XTemp, res1YTemp, res1ZTemp; + fltx4 res2X, res2Y, res2Z, res2XTemp, res2YTemp, res2ZTemp; + fltx4 res3X, res3Y, res3Z, res3XTemp, res3YTemp, res3ZTemp; + fltx4 res4X, res4Y, res4Z, res4XTemp, res4YTemp, res4ZTemp; + fltx4 res5X, res5Y, res5Z, res5XTemp, res5YTemp, res5ZTemp; + fltx4 res6X, res6Y, res6Z, res6XTemp, res6YTemp, res6ZTemp; + fltx4 res7X, res7Y, res7Z, res7XTemp, res7YTemp, res7ZTemp; + + +// #define FROZ(out,in,offset) COMPUTE((out+offset)->x, (in + offset), matSplat00, matSplat01, matSplat02, matSplat03); COMPUTE((out + offset )->y, (in + offset), matSplat10, matSplat11, matSplat12, matSplat13); COMPUTE((out + offset)->z, (in + offset), matSplat20, matSplat21, matSplat22, matSplat23) +#define COMPUTE_GROUP(resgroup,dataptr) COMPUTE(resgroup ## X, (dataptr), matSplat00, matSplat01, matSplat02, matSplat03); COMPUTE(resgroup ## Y, 
(dataptr), matSplat10, matSplat11, matSplat12, matSplat13); COMPUTE(resgroup ## Z, (dataptr), matSplat20, matSplat21, matSplat22, matSplat23) +#define WRITE_GROUP(ptr, resgroup) (ptr)->x = resgroup ## X; (ptr)->y = resgroup ## Y; (ptr)->z = resgroup ## Z + + /* + // stage 1 -- 6 ops for xyz, each w 12 cycle latency + res0X = MulSIMD( (invec)->y, matSplat01 ); + res0Temp = MaddSIMD((invec)->z, matSplat02, matSplat03); + // stage 2 -- 3 clocks for xyz + res0X = MaddSIMD( (invec)->x, matSplat00, res0X ); + // stage 3 -- 3 clocks for xyz + res0X = AddSIMD(res0X, res0Temp); + */ +#define COMPUTE_STAGE1_ROW(res, tempvar, invec, xsplat, ysplat, zsplat, transplat) res = MulSIMD( (invec)->y, ysplat ); tempvar = MaddSIMD((invec)->z, zsplat, transplat) +#define COMPUTE_STAGE2_ROW(res, tempvar, invec, xsplat, ysplat, zsplat, transplat) res = MaddSIMD( (invec)->x, xsplat, res ) +#define COMPUTE_STAGE3_ROW(res, tempvar, invec, xsplat, ysplat, zsplat, transplat) res = AddSIMD(res, tempvar) // frees up the tempvar + +#define COMPUTE_STAGE1_GROUP(resgroup, invec) COMPUTE_STAGE1_ROW(resgroup ## X, resgroup ## X ## Temp, invec, matSplat00, matSplat01, matSplat02, matSplat03);\ + COMPUTE_STAGE1_ROW(resgroup ## Y, resgroup ## Y ## Temp, invec, matSplat10, matSplat11, matSplat12, matSplat13);\ + COMPUTE_STAGE1_ROW(resgroup ## Z, resgroup ## Z ## Temp, invec, matSplat20, matSplat21, matSplat22, matSplat23) + +#define COMPUTE_STAGE2_GROUP(resgroup, invec) COMPUTE_STAGE2_ROW(resgroup ## X, resgroup ## X ## Temp, invec, matSplat00, matSplat01, matSplat02, matSplat03);\ + COMPUTE_STAGE2_ROW(resgroup ## Y, resgroup ## Y ## Temp, invec, matSplat10, matSplat11, matSplat12, matSplat13);\ + COMPUTE_STAGE2_ROW(resgroup ## Z, resgroup ## Z ## Temp, invec, matSplat20, matSplat21, matSplat22, matSplat23) + +#define COMPUTE_STAGE3_GROUP(resgroup, invec) COMPUTE_STAGE3_ROW(resgroup ## X, resgroup ## X ## Temp, invec, matSplat00, matSplat01, matSplat02, matSplat03);\ + COMPUTE_STAGE3_ROW(resgroup ## Y, 
resgroup ## Y ## Temp, invec, matSplat10, matSplat11, matSplat12, matSplat13);\ + COMPUTE_STAGE3_ROW(resgroup ## Z, resgroup ## Z ## Temp, invec, matSplat20, matSplat21, matSplat22, matSplat23) + + FourVectors * RESTRICT inData = pVectors; + FourVectors * RESTRICT outData = pOut; + const FourVectors * const RESTRICT STOP = pVectors + numVectors; + + // Use techniques of loop scheduling to eliminate data hazards; process + // eight groups simultaneously so that we never have any operations stalling + // waiting for data. + // Note: this loop, while pretty fast, could be faster still -- you'll notice + // that it does all of its loads, then all computation, then writes everything + // out. If made truly cyclic, such that every line interleaved a stage 1, stage 2, + // stage 3, and write, then throughput could be higher (probably by about 50%). + while (inData < STOP) + { + // start prefetching the three cache lines + // we'll hit two iterations from now + __dcbt( sizeof(FourVectors) * 16, inData ); + __dcbt( sizeof(FourVectors) * 16 + 128, inData ); + __dcbt( sizeof(FourVectors) * 16 + 256, inData ); + + // synchro + COMPUTE_STAGE1_GROUP(res0, inData + 0); + COMPUTE_STAGE1_GROUP(res1, inData + 1); + COMPUTE_STAGE1_GROUP(res2, inData + 2); + COMPUTE_STAGE1_GROUP(res3, inData + 3); + + COMPUTE_STAGE2_GROUP(res0, inData + 0); + COMPUTE_STAGE1_GROUP(res4, inData + 4); + COMPUTE_STAGE2_GROUP(res1, inData + 1); + COMPUTE_STAGE1_GROUP(res5, inData + 5); + COMPUTE_STAGE2_GROUP(res2, inData + 2); + COMPUTE_STAGE1_GROUP(res6, inData + 6); + COMPUTE_STAGE2_GROUP(res3, inData + 3); + COMPUTE_STAGE1_GROUP(res7, inData + 7); + + COMPUTE_STAGE3_GROUP(res0, inData + 0); + COMPUTE_STAGE2_GROUP(res4, inData + 4); + COMPUTE_STAGE3_GROUP(res1, inData + 1); + COMPUTE_STAGE2_GROUP(res5, inData + 5); + COMPUTE_STAGE3_GROUP(res2, inData + 2); + COMPUTE_STAGE2_GROUP(res6, inData + 6); + COMPUTE_STAGE3_GROUP(res3, inData + 3); + COMPUTE_STAGE2_GROUP(res7, inData + 7); + + 
COMPUTE_STAGE3_GROUP(res4, inData + 4); + WRITE_GROUP( outData + 0, res0 ); + COMPUTE_STAGE3_GROUP(res5, inData + 5); + WRITE_GROUP( outData + 1, res1 ); + COMPUTE_STAGE3_GROUP(res6, inData + 6); + WRITE_GROUP( outData + 2, res2 ); + COMPUTE_STAGE3_GROUP(res7, inData + 7); + WRITE_GROUP( outData + 3, res3 ); + + + WRITE_GROUP( outData + 4, res4 ); + WRITE_GROUP( outData + 5, res5 ); + WRITE_GROUP( outData + 6, res6 ); + WRITE_GROUP( outData + 7, res7 ); + + inData += 8; + outData += 8; + } + + +#undef COMPUTE +#undef WRITE +#undef COMPUTE_STAGE1_ROW +#undef COMPUTE_STAGE2_ROW +#undef COMPUTE_STAGE3_ROW +#undef COMPUTE_STAGE1_GROUP +#undef COMPUTE_STAGE2_GROUP +#undef COMPUTE_STAGE3_GROUP +#undef COMPUTE_GROUP +#undef WRITE_GROUP +} + +#ifdef _X360 +// Loop-scheduled code to process FourVectors in groups of eight quite efficiently. This is the version +// to call when starting on a 128-byte-aligned address. +void FourVectors_TransformManyGroupsOfEightBy_128byteAligned(FourVectors * RESTRICT pVectors, unsigned int numVectors, const matrix3x4_t& rotationMatrix, FourVectors * RESTRICT pOut ) +{ + /* If this has changed, you will need to change all the prefetches, * + * and groups of eight are no longer the ideal unit for iterating * + * on many vectors. */ + COMPILE_TIME_ASSERT( sizeof(FourVectors) == 48 ) ; + + Assert(numVectors > 0); + if ( numVectors == 0 ) + return; + + AssertMsg((numVectors & 0x07) == 0, "FourVectors_TransformManyGroupsOfEight called with numVectors % 8 != 0!"); + + // Assert alignment + AssertMsg( ( ( reinterpret_cast( pVectors ) & 127 ) == 0) && + ( ( reinterpret_cast(pOut) & 127 ) == 0), + "FourVectors_Transform..aligned called with non-128-byte-aligned buffers." ); + + // Assert non overlap + AssertMsg( (pOut < pVectors && pOut+numVectors <= pVectors) || + (pOut > pVectors && pVectors+numVectors <= pOut), "FourVectors::TransformManyBy called with overlapping buffer pointers." ); + + // Here's the plan. 8 four-vecs = 3 cache lines exactly. 
It takes about 400 cycles to process a group + // of eight, and cache latency is 600 cycles, so we try to prefetch two iterations ahead (eg fetch + // iteration 3 while working on iteration 1). In the case of the output, we can simply zero-flush + // the cache lines since we are sure to write into them. Because we're reading and fetching two ahead, + // we want to stop two away from the last iteration. + + // No matter what, we will need to prefetch the first two groups of eight of input (that's the + // first six cache lines) + __dcbt( 0, pVectors ); + __dcbt( 128, pVectors ); + __dcbt( 256, pVectors ); + __dcbt( 384, pVectors ); + __dcbt( 512, pVectors ); + __dcbt( 640, pVectors ); + + + // Splat out each of the entries in the matrix to a fltx4. Do this + // in the order that we will need them, to hide latency. I'm + // avoiding making an array of them, so that they'll remain in + // registers. + fltx4 matSplat00, matSplat01, matSplat02, matSplat03, // TWELVE REGISTERS + matSplat10, matSplat11, matSplat12, matSplat13, + matSplat20, matSplat21, matSplat22, matSplat23; + + { + // Load the matrix into local vectors. Sadly, matrix3x4_ts are + // often unaligned. The w components will be the tranpose row of + // the matrix. 
+ fltx4 matCol0 = LoadUnalignedSIMD(rotationMatrix[0]); + fltx4 matCol1 = LoadUnalignedSIMD(rotationMatrix[1]); + fltx4 matCol2 = LoadUnalignedSIMD(rotationMatrix[2]); + + matSplat00 = SplatXSIMD(matCol0); + matSplat01 = SplatYSIMD(matCol0); + matSplat02 = SplatZSIMD(matCol0); + matSplat03 = SplatWSIMD(matCol0); + + matSplat10 = SplatXSIMD(matCol1); + matSplat11 = SplatYSIMD(matCol1); + matSplat12 = SplatZSIMD(matCol1); + matSplat13 = SplatWSIMD(matCol1); + + matSplat20 = SplatXSIMD(matCol2); + matSplat21 = SplatYSIMD(matCol2); + matSplat22 = SplatZSIMD(matCol2); + matSplat23 = SplatWSIMD(matCol2); + } + + // this macro defines how to compute a specific row from an input and certain splat columns +#define COMPUTE(res, invec, xterm, yterm, zterm, transterm) res = AddSIMD( AddSIMD( MulSIMD((invec)->z, zterm), AddSIMD( MulSIMD( (invec)->x, xterm ), MulSIMD( (invec)->y, yterm ) ) ), transterm ) +#define WRITE(term, reg, toptr) toptr->term = reg + + // define result groups (we're going to have an eight-way unroll) + + fltx4 res0X, res0Y, res0Z, res0XTemp, res0YTemp, res0ZTemp; // 48 REGISTERS + fltx4 res1X, res1Y, res1Z, res1XTemp, res1YTemp, res1ZTemp; + fltx4 res2X, res2Y, res2Z, res2XTemp, res2YTemp, res2ZTemp; + fltx4 res3X, res3Y, res3Z, res3XTemp, res3YTemp, res3ZTemp; + fltx4 res4X, res4Y, res4Z, res4XTemp, res4YTemp, res4ZTemp; + fltx4 res5X, res5Y, res5Z, res5XTemp, res5YTemp, res5ZTemp; + fltx4 res6X, res6Y, res6Z, res6XTemp, res6YTemp, res6ZTemp; + fltx4 res7X, res7Y, res7Z, res7XTemp, res7YTemp, res7ZTemp; + + + // #define FROZ(out,in,offset) COMPUTE((out+offset)->x, (in + offset), matSplat00, matSplat01, matSplat02, matSplat03); COMPUTE((out + offset )->y, (in + offset), matSplat10, matSplat11, matSplat12, matSplat13); COMPUTE((out + offset)->z, (in + offset), matSplat20, matSplat21, matSplat22, matSplat23) +#define COMPUTE_GROUP(resgroup,dataptr) COMPUTE(resgroup ## X, (dataptr), matSplat00, matSplat01, matSplat02, matSplat03); COMPUTE(resgroup ## Y, 
(dataptr), matSplat10, matSplat11, matSplat12, matSplat13); COMPUTE(resgroup ## Z, (dataptr), matSplat20, matSplat21, matSplat22, matSplat23) +#define WRITE_GROUP(ptr, resgroup) (ptr)->x = resgroup ## X; (ptr)->y = resgroup ## Y; (ptr)->z = resgroup ## Z + + /* + // stage 1 -- 6 ops for xyz, each w 12 cycle latency + res0X = MulSIMD( (invec)->y, matSplat01 ); + res0Temp = MaddSIMD((invec)->z, matSplat02, matSplat03); + // stage 2 -- 3 clocks for xyz + res0X = MaddSIMD( (invec)->x, matSplat00, res0X ); + // stage 3 -- 3 clocks for xyz + res0X = AddSIMD(res0X, res0Temp); + */ +#define COMPUTE_STAGE1_ROW(res, tempvar, invec, xsplat, ysplat, zsplat, transplat) res = MulSIMD( (invec)->y, ysplat ); tempvar = MaddSIMD((invec)->z, zsplat, transplat) +#define COMPUTE_STAGE2_ROW(res, tempvar, invec, xsplat, ysplat, zsplat, transplat) res = MaddSIMD( (invec)->x, xsplat, res ) +#define COMPUTE_STAGE3_ROW(res, tempvar, invec, xsplat, ysplat, zsplat, transplat) res = AddSIMD(res, tempvar) // frees up the tempvar + +#define COMPUTE_STAGE1_GROUP(resgroup, invec) COMPUTE_STAGE1_ROW(resgroup ## X, resgroup ## X ## Temp, invec, matSplat00, matSplat01, matSplat02, matSplat03);\ + COMPUTE_STAGE1_ROW(resgroup ## Y, resgroup ## Y ## Temp, invec, matSplat10, matSplat11, matSplat12, matSplat13);\ + COMPUTE_STAGE1_ROW(resgroup ## Z, resgroup ## Z ## Temp, invec, matSplat20, matSplat21, matSplat22, matSplat23) + +#define COMPUTE_STAGE2_GROUP(resgroup, invec) COMPUTE_STAGE2_ROW(resgroup ## X, resgroup ## X ## Temp, invec, matSplat00, matSplat01, matSplat02, matSplat03);\ + COMPUTE_STAGE2_ROW(resgroup ## Y, resgroup ## Y ## Temp, invec, matSplat10, matSplat11, matSplat12, matSplat13);\ + COMPUTE_STAGE2_ROW(resgroup ## Z, resgroup ## Z ## Temp, invec, matSplat20, matSplat21, matSplat22, matSplat23) + +#define COMPUTE_STAGE3_GROUP(resgroup, invec) COMPUTE_STAGE3_ROW(resgroup ## X, resgroup ## X ## Temp, invec, matSplat00, matSplat01, matSplat02, matSplat03);\ + COMPUTE_STAGE3_ROW(resgroup ## Y, 
resgroup ## Y ## Temp, invec, matSplat10, matSplat11, matSplat12, matSplat13);\ + COMPUTE_STAGE3_ROW(resgroup ## Z, resgroup ## Z ## Temp, invec, matSplat20, matSplat21, matSplat22, matSplat23) + + + // Okay. First do all but the last two turns of the crank; we don't want to overshoot with the flush-to-zero. + FourVectors * RESTRICT inData = pVectors; + FourVectors * RESTRICT outData = pOut; + const FourVectors * RESTRICT STOP; + if (numVectors > 16) + { + STOP = pVectors + numVectors - 16; + // flush the first two blocks we'll write into + __dcbz128( 0, outData ); + __dcbz128( 128, outData ); + __dcbz128( 256, outData ); + + while (inData < STOP) + { + // start prefetching the three cache lines + // we'll hit two iterations from now + __dcbt( sizeof(FourVectors) * 16, inData ); + __dcbt( sizeof(FourVectors) * 16 + 128, inData ); + __dcbt( sizeof(FourVectors) * 16 + 256, inData ); + + // synchro + COMPUTE_STAGE1_GROUP(res0, inData + 0); + COMPUTE_STAGE1_GROUP(res1, inData + 1); + COMPUTE_STAGE1_GROUP(res2, inData + 2); + COMPUTE_STAGE1_GROUP(res3, inData + 3); + + // pre-zero the three cache lines we'll overwrite + // in the next iteration + __dcbz128( 384, outData ); + __dcbz128( 512, outData ); + __dcbz128( 640, outData ); + + + COMPUTE_STAGE2_GROUP(res0, inData + 0); + COMPUTE_STAGE1_GROUP(res4, inData + 4); + COMPUTE_STAGE2_GROUP(res1, inData + 1); + COMPUTE_STAGE1_GROUP(res5, inData + 5); + COMPUTE_STAGE2_GROUP(res2, inData + 2); + COMPUTE_STAGE1_GROUP(res6, inData + 6); + COMPUTE_STAGE2_GROUP(res3, inData + 3); + COMPUTE_STAGE1_GROUP(res7, inData + 7); + + COMPUTE_STAGE3_GROUP(res0, inData + 0); + COMPUTE_STAGE2_GROUP(res4, inData + 4); + COMPUTE_STAGE3_GROUP(res1, inData + 1); + COMPUTE_STAGE2_GROUP(res5, inData + 5); + COMPUTE_STAGE3_GROUP(res2, inData + 2); + COMPUTE_STAGE2_GROUP(res6, inData + 6); + COMPUTE_STAGE3_GROUP(res3, inData + 3); + COMPUTE_STAGE2_GROUP(res7, inData + 7); + + COMPUTE_STAGE3_GROUP(res4, inData + 4); + WRITE_GROUP( outData + 0, res0 
); + COMPUTE_STAGE3_GROUP(res5, inData + 5); + WRITE_GROUP( outData + 1, res1 ); + COMPUTE_STAGE3_GROUP(res6, inData + 6); + WRITE_GROUP( outData + 2, res2 ); + COMPUTE_STAGE3_GROUP(res7, inData + 7); + WRITE_GROUP( outData + 3, res3 ); + + + WRITE_GROUP( outData + 4, res4 ); + WRITE_GROUP( outData + 5, res5 ); + WRITE_GROUP( outData + 6, res6 ); + WRITE_GROUP( outData + 7, res7 ); + + inData += 8; + outData += 8; + } + } + else if (numVectors == 16) + { + // zero out the exactly six cache lines we will write into + __dcbz128( 0, outData ); + __dcbz128( 128, outData ); + __dcbz128( 256, outData ); + __dcbz128( 384, outData ); + __dcbz128( 512, outData ); + __dcbz128( 640, outData ); + } + else if (numVectors == 8) + { + // zero out the exactly three cache lines we will write into + __dcbz128( 0, outData ); + __dcbz128( 128, outData ); + __dcbz128( 256, outData ); + } + else + { + AssertMsg(false, "Can't happen!"); + } + + // deal with the ultimate two groups (or, if we were fed + // less than 16 groups, the whole shebang) + STOP = pVectors + numVectors - 16; + + + // Use techniques of loop scheduling to eliminate data hazards; process + // eight groups simultaneously so that we never have any operations stalling + // waiting for data. + // Note: this loop, while pretty fast, could be faster still -- you'll notice + // that it does all of its loads, then all computation, then writes everything + // out. If made truly cyclic, such that every line interleaved a stage 1, stage 2, + // stage 3, and write, then throughput could be higher (probably by about 50%). 
+ while (inData < STOP) + { + // synchro + COMPUTE_STAGE1_GROUP(res0, inData + 0); + COMPUTE_STAGE1_GROUP(res1, inData + 1); + COMPUTE_STAGE1_GROUP(res2, inData + 2); + COMPUTE_STAGE1_GROUP(res3, inData + 3); + + COMPUTE_STAGE2_GROUP(res0, inData + 0); + COMPUTE_STAGE1_GROUP(res4, inData + 4); + COMPUTE_STAGE2_GROUP(res1, inData + 1); + COMPUTE_STAGE1_GROUP(res5, inData + 5); + COMPUTE_STAGE2_GROUP(res2, inData + 2); + COMPUTE_STAGE1_GROUP(res6, inData + 6); + COMPUTE_STAGE2_GROUP(res3, inData + 3); + COMPUTE_STAGE1_GROUP(res7, inData + 7); + + COMPUTE_STAGE3_GROUP(res0, inData + 0); + COMPUTE_STAGE2_GROUP(res4, inData + 4); + COMPUTE_STAGE3_GROUP(res1, inData + 1); + COMPUTE_STAGE2_GROUP(res5, inData + 5); + COMPUTE_STAGE3_GROUP(res2, inData + 2); + COMPUTE_STAGE2_GROUP(res6, inData + 6); + COMPUTE_STAGE3_GROUP(res3, inData + 3); + COMPUTE_STAGE2_GROUP(res7, inData + 7); + + COMPUTE_STAGE3_GROUP(res4, inData + 4); + WRITE_GROUP( outData + 0, res0 ); + COMPUTE_STAGE3_GROUP(res5, inData + 5); + WRITE_GROUP( outData + 1, res1 ); + COMPUTE_STAGE3_GROUP(res6, inData + 6); + WRITE_GROUP( outData + 2, res2 ); + COMPUTE_STAGE3_GROUP(res7, inData + 7); + WRITE_GROUP( outData + 3, res3 ); + + + WRITE_GROUP( outData + 4, res4 ); + WRITE_GROUP( outData + 5, res5 ); + WRITE_GROUP( outData + 6, res6 ); + WRITE_GROUP( outData + 7, res7 ); + + inData += 8; + outData += 8; + } + + +#undef COMPUTE +#undef WRITE +#undef COMPUTE_STAGE1_ROW +#undef COMPUTE_STAGE2_ROW +#undef COMPUTE_STAGE3_ROW +#undef COMPUTE_STAGE1_GROUP +#undef COMPUTE_STAGE2_GROUP +#undef COMPUTE_STAGE3_GROUP +#undef COMPUTE_GROUP +#undef WRITE_GROUP +} +#endif + +// Transform a long array of FourVectors by a given matrix. 
+void FourVectors::TransformManyBy(FourVectors * RESTRICT pVectors, unsigned int numVectors, const matrix3x4_t& rotationMatrix, FourVectors * RESTRICT pOut ) +{ + Assert(numVectors > 0); + + AssertMsg( (pOut < pVectors && pOut+numVectors <= pVectors) || + (pOut > pVectors && pVectors+numVectors <= pOut), "FourVectors::TransformManyBy called with overlapping buffer pointers." ); + +#ifdef _X360 + // The really fast version of this function likes to operate on blocks of eight. So, chug through + // groups of eight, then deal with any leftovers. + int numVectorsRoundedToNearestEight = numVectors & (~0x07); + if (numVectors >= 8) + { + // aligned? + if ((reinterpret_cast(pVectors) & 127) == 0 && (reinterpret_cast(pOut) & 127) == 0) + { + FourVectors_TransformManyGroupsOfEightBy_128byteAligned(pVectors, numVectorsRoundedToNearestEight, rotationMatrix, pOut); + } + else + { + FourVectors_TransformManyGroupsOfEightBy(pVectors, numVectorsRoundedToNearestEight, rotationMatrix, pOut); + } + numVectors -= numVectorsRoundedToNearestEight; + pVectors += numVectorsRoundedToNearestEight; + pOut += numVectorsRoundedToNearestEight; + } +#endif + + // any left over? + if (numVectors > 0) + { + + // Splat out each of the entries in the matrix to a fltx4. Do this + // in the order that we will need them, to hide latency. I'm + // avoiding making an array of them, so that they'll remain in + // registers. + fltx4 matSplat00, matSplat01, matSplat02, matSplat03, // TWELVE REGISTERS + matSplat10, matSplat11, matSplat12, matSplat13, + matSplat20, matSplat21, matSplat22, matSplat23; + + { + // Load the matrix into local vectors. Sadly, matrix3x4_ts are + // often unaligned. The w components will be the transpose row of + // the matrix. 
+ fltx4 matCol0 = LoadUnalignedSIMD(rotationMatrix[0]); + fltx4 matCol1 = LoadUnalignedSIMD(rotationMatrix[1]); + fltx4 matCol2 = LoadUnalignedSIMD(rotationMatrix[2]); + + matSplat00 = SplatXSIMD(matCol0); + matSplat01 = SplatYSIMD(matCol0); + matSplat02 = SplatZSIMD(matCol0); + matSplat03 = SplatWSIMD(matCol0); + + matSplat10 = SplatXSIMD(matCol1); + matSplat11 = SplatYSIMD(matCol1); + matSplat12 = SplatZSIMD(matCol1); + matSplat13 = SplatWSIMD(matCol1); + + matSplat20 = SplatXSIMD(matCol2); + matSplat21 = SplatYSIMD(matCol2); + matSplat22 = SplatZSIMD(matCol2); + matSplat23 = SplatWSIMD(matCol2); + } + + do + { + // Trust in the compiler to schedule these operations correctly: + pOut->x = MaddSIMD(pVectors->z, matSplat02, MaddSIMD(pVectors->y, matSplat01, MaddSIMD(pVectors->x, matSplat00, matSplat03))); + pOut->y = MaddSIMD(pVectors->z, matSplat12, MaddSIMD(pVectors->y, matSplat11, MaddSIMD(pVectors->x, matSplat00, matSplat13))); + pOut->z = MaddSIMD(pVectors->z, matSplat22, MaddSIMD(pVectors->y, matSplat21, MaddSIMD(pVectors->x, matSplat00, matSplat23))); + + ++pOut; + ++pVectors; + --numVectors; + } while(numVectors > 0); + } +} + +#ifdef _X360 +// Loop-scheduled code to process FourVectors in groups of eight quite efficiently. +static void FourVectors_TransformManyGroupsOfEightBy_InPlace(FourVectors * RESTRICT pVectors, unsigned int numVectors, const matrix3x4_t& rotationMatrix ) +{ + Assert(numVectors > 0); + if ( numVectors == 0 ) + return; + + // Prefetch line 1 and 2 + __dcbt(0,pVectors); + __dcbt(128,pVectors); + + // Splat out each of the entries in the matrix to a fltx4. Do this + // in the order that we will need them, to hide latency. I'm + // avoiding making an array of them, so that they'll remain in + // registers. + fltx4 matSplat00, matSplat01, matSplat02, matSplat03, // TWELVE REGISTERS + matSplat10, matSplat11, matSplat12, matSplat13, + matSplat20, matSplat21, matSplat22, matSplat23; + + { + // Load the matrix into local vectors. 
Sadly, matrix3x4_ts are + // often unaligned. The w components will be the tranpose row of + // the matrix. + fltx4 matCol0 = LoadUnalignedSIMD(rotationMatrix[0]); + fltx4 matCol1 = LoadUnalignedSIMD(rotationMatrix[1]); + fltx4 matCol2 = LoadUnalignedSIMD(rotationMatrix[2]); + + matSplat00 = SplatXSIMD(matCol0); + matSplat01 = SplatYSIMD(matCol0); + matSplat02 = SplatZSIMD(matCol0); + matSplat03 = SplatWSIMD(matCol0); + + matSplat10 = SplatXSIMD(matCol1); + matSplat11 = SplatYSIMD(matCol1); + matSplat12 = SplatZSIMD(matCol1); + matSplat13 = SplatWSIMD(matCol1); + + matSplat20 = SplatXSIMD(matCol2); + matSplat21 = SplatYSIMD(matCol2); + matSplat22 = SplatZSIMD(matCol2); + matSplat23 = SplatWSIMD(matCol2); + } + + // this macro defines how to compute a specific row from an input and certain splat columns +#define COMPUTE(res, invec, xterm, yterm, zterm, transterm) res = AddSIMD( AddSIMD( MulSIMD((invec)->z, zterm), AddSIMD( MulSIMD( (invec)->x, xterm ), MulSIMD( (invec)->y, yterm ) ) ), transterm ) +#define WRITE(term, reg, toptr) toptr->term = reg + + // define result groups (we're going to have an eight-way unroll) + + fltx4 res0X, res0Y, res0Z, res0XTemp, res0YTemp, res0ZTemp; // 48 REGISTERS + fltx4 res1X, res1Y, res1Z, res1XTemp, res1YTemp, res1ZTemp; + fltx4 res2X, res2Y, res2Z, res2XTemp, res2YTemp, res2ZTemp; + fltx4 res3X, res3Y, res3Z, res3XTemp, res3YTemp, res3ZTemp; + fltx4 res4X, res4Y, res4Z, res4XTemp, res4YTemp, res4ZTemp; + fltx4 res5X, res5Y, res5Z, res5XTemp, res5YTemp, res5ZTemp; + fltx4 res6X, res6Y, res6Z, res6XTemp, res6YTemp, res6ZTemp; + fltx4 res7X, res7Y, res7Z, res7XTemp, res7YTemp, res7ZTemp; + + + // #define FROZ(out,in,offset) COMPUTE((out+offset)->x, (in + offset), matSplat00, matSplat01, matSplat02, matSplat03); COMPUTE((out + offset )->y, (in + offset), matSplat10, matSplat11, matSplat12, matSplat13); COMPUTE((out + offset)->z, (in + offset), matSplat20, matSplat21, matSplat22, matSplat23) +#define COMPUTE_GROUP(resgroup,dataptr) 
COMPUTE(resgroup ## X, (dataptr), matSplat00, matSplat01, matSplat02, matSplat03); COMPUTE(resgroup ## Y, (dataptr), matSplat10, matSplat11, matSplat12, matSplat13); COMPUTE(resgroup ## Z, (dataptr), matSplat20, matSplat21, matSplat22, matSplat23) +#define WRITE_GROUP(ptr, resgroup) (ptr)->x = resgroup ## X; (ptr)->y = resgroup ## Y; (ptr)->z = resgroup ## Z + + /* + // stage 1 -- 6 ops for xyz, each w 12 cycle latency + res0X = MulSIMD( (invec)->y, matSplat01 ); + res0Temp = MaddSIMD((invec)->z, matSplat02, matSplat03); + // stage 2 -- 3 clocks for xyz + res0X = MaddSIMD( (invec)->x, matSplat00, res0X ); + // stage 3 -- 3 clocks for xyz + res0X = AddSIMD(res0X, res0Temp); + */ +#define COMPUTE_STAGE1_ROW(res, tempvar, invec, xsplat, ysplat, zsplat, transplat) res = MulSIMD( (invec)->y, ysplat ); tempvar = MaddSIMD((invec)->z, zsplat, transplat) +#define COMPUTE_STAGE2_ROW(res, tempvar, invec, xsplat, ysplat, zsplat, transplat) res = MaddSIMD( (invec)->x, xsplat, res ) +#define COMPUTE_STAGE3_ROW(res, tempvar, invec, xsplat, ysplat, zsplat, transplat) res = AddSIMD(res, tempvar) // frees up the tempvar + +#define COMPUTE_STAGE1_GROUP(resgroup, invec) COMPUTE_STAGE1_ROW(resgroup ## X, resgroup ## X ## Temp, invec, matSplat00, matSplat01, matSplat02, matSplat03);\ + COMPUTE_STAGE1_ROW(resgroup ## Y, resgroup ## Y ## Temp, invec, matSplat10, matSplat11, matSplat12, matSplat13);\ + COMPUTE_STAGE1_ROW(resgroup ## Z, resgroup ## Z ## Temp, invec, matSplat20, matSplat21, matSplat22, matSplat23) + +#define COMPUTE_STAGE2_GROUP(resgroup, invec) COMPUTE_STAGE2_ROW(resgroup ## X, resgroup ## X ## Temp, invec, matSplat00, matSplat01, matSplat02, matSplat03);\ + COMPUTE_STAGE2_ROW(resgroup ## Y, resgroup ## Y ## Temp, invec, matSplat10, matSplat11, matSplat12, matSplat13);\ + COMPUTE_STAGE2_ROW(resgroup ## Z, resgroup ## Z ## Temp, invec, matSplat20, matSplat21, matSplat22, matSplat23) + +#define COMPUTE_STAGE3_GROUP(resgroup, invec) COMPUTE_STAGE3_ROW(resgroup ## X, resgroup 
## X ## Temp, invec, matSplat00, matSplat01, matSplat02, matSplat03);\ + COMPUTE_STAGE3_ROW(resgroup ## Y, resgroup ## Y ## Temp, invec, matSplat10, matSplat11, matSplat12, matSplat13);\ + COMPUTE_STAGE3_ROW(resgroup ## Z, resgroup ## Z ## Temp, invec, matSplat20, matSplat21, matSplat22, matSplat23) + + const FourVectors * const RESTRICT STOP = pVectors + numVectors; + + // Use techniques of loop scheduling to eliminate data hazards; process + // eight groups simultaneously so that we never have any operations stalling + // waiting for data. + // Note: this loop, while pretty fast, could be faster still -- you'll notice + // that it does all of its loads, then all computation, then writes everything + // out. If made truly cyclic, such that every line interleaved a stage 1, stage 2, + // stage 3, and write, then throughput could be higher (probably by about 50%). + while (pVectors < STOP) + { + // start prefetching the three cache lines + // we'll hit two iterations from now + __dcbt( sizeof(FourVectors) * 16, pVectors ); + __dcbt( sizeof(FourVectors) * 16 + 128, pVectors ); + __dcbt( sizeof(FourVectors) * 16 + 256, pVectors ); + + // synchro + COMPUTE_STAGE1_GROUP(res0, pVectors + 0); + COMPUTE_STAGE1_GROUP(res1, pVectors + 1); + COMPUTE_STAGE1_GROUP(res2, pVectors + 2); + COMPUTE_STAGE1_GROUP(res3, pVectors + 3); + + COMPUTE_STAGE2_GROUP(res0, pVectors + 0); + COMPUTE_STAGE1_GROUP(res4, pVectors + 4); + COMPUTE_STAGE2_GROUP(res1, pVectors + 1); + COMPUTE_STAGE1_GROUP(res5, pVectors + 5); + COMPUTE_STAGE2_GROUP(res2, pVectors + 2); + COMPUTE_STAGE1_GROUP(res6, pVectors + 6); + COMPUTE_STAGE2_GROUP(res3, pVectors + 3); + COMPUTE_STAGE1_GROUP(res7, pVectors + 7); + + COMPUTE_STAGE3_GROUP(res0, pVectors + 0); + COMPUTE_STAGE2_GROUP(res4, pVectors + 4); + COMPUTE_STAGE3_GROUP(res1, pVectors + 1); + COMPUTE_STAGE2_GROUP(res5, pVectors + 5); + COMPUTE_STAGE3_GROUP(res2, pVectors + 2); + COMPUTE_STAGE2_GROUP(res6, pVectors + 6); + COMPUTE_STAGE3_GROUP(res3, pVectors + 
3); + COMPUTE_STAGE2_GROUP(res7, pVectors + 7); + + COMPUTE_STAGE3_GROUP(res4, pVectors + 4); + WRITE_GROUP( pVectors + 0, res0 ); + COMPUTE_STAGE3_GROUP(res5, pVectors + 5); + WRITE_GROUP( pVectors + 1, res1 ); + COMPUTE_STAGE3_GROUP(res6, pVectors + 6); + WRITE_GROUP( pVectors + 2, res2 ); + COMPUTE_STAGE3_GROUP(res7, pVectors + 7); + WRITE_GROUP( pVectors + 3, res3 ); + + WRITE_GROUP( pVectors + 4, res4 ); + WRITE_GROUP( pVectors + 5, res5 ); + WRITE_GROUP( pVectors + 6, res6 ); + WRITE_GROUP( pVectors + 7, res7 ); + + pVectors += 8; + } + + +#undef COMPUTE +#undef WRITE +#undef COMPUTE_STAGE1_ROW +#undef COMPUTE_STAGE2_ROW +#undef COMPUTE_STAGE3_ROW +#undef COMPUTE_STAGE1_GROUP +#undef COMPUTE_STAGE2_GROUP +#undef COMPUTE_STAGE3_GROUP +#undef COMPUTE_GROUP +#undef WRITE_GROUP +} +#endif + +// In-place version of above. It's necessary to have this, rather than just allowing pOut and pVectors +// to equal each other, because of the semantics of RESTRICT: pVectors and pOut must not be allowed +// to alias. (Simply un-restricting the pointers results in very poor scheduling.) +void FourVectors::TransformManyBy(FourVectors * RESTRICT pVectors, unsigned int numVectors, const matrix3x4_t& rotationMatrix ) +{ + Assert(numVectors > 0); + +#ifdef _X360 + // The really fast version of this function likes to operate on blocks of eight. So, chug through + // groups of eight, then deal with any leftovers. + int numVectorsRoundedToNearestEight = numVectors & (~0x07); + if (numVectors >= 8) + { + FourVectors_TransformManyGroupsOfEightBy_InPlace(pVectors, numVectorsRoundedToNearestEight, rotationMatrix); + numVectors -= numVectorsRoundedToNearestEight; + pVectors += numVectorsRoundedToNearestEight; + } +#endif + + // any left over? + if (numVectors > 0) + { + + // Splat out each of the entries in the matrix to a fltx4. Do this + // in the order that we will need them, to hide latency. I'm + // avoiding making an array of them, so that they'll remain in + // registers. 
+ fltx4 matSplat00, matSplat01, matSplat02, matSplat03, // TWELVE REGISTERS + matSplat10, matSplat11, matSplat12, matSplat13, + matSplat20, matSplat21, matSplat22, matSplat23; + + { + // Load the matrix into local vectors. Sadly, matrix3x4_ts are + // often unaligned. The w components will be the transpose row of + // the matrix. + fltx4 matCol0 = LoadUnalignedSIMD(rotationMatrix[0]); + fltx4 matCol1 = LoadUnalignedSIMD(rotationMatrix[1]); + fltx4 matCol2 = LoadUnalignedSIMD(rotationMatrix[2]); + + matSplat00 = SplatXSIMD(matCol0); + matSplat01 = SplatYSIMD(matCol0); + matSplat02 = SplatZSIMD(matCol0); + matSplat03 = SplatWSIMD(matCol0); + + matSplat10 = SplatXSIMD(matCol1); + matSplat11 = SplatYSIMD(matCol1); + matSplat12 = SplatZSIMD(matCol1); + matSplat13 = SplatWSIMD(matCol1); + + matSplat20 = SplatXSIMD(matCol2); + matSplat21 = SplatYSIMD(matCol2); + matSplat22 = SplatZSIMD(matCol2); + matSplat23 = SplatWSIMD(matCol2); + } + + do + { + fltx4 resultX, resultY, resultZ; + // Trust in the compiler to schedule these operations correctly: + resultX = MaddSIMD(pVectors->z, matSplat02, MaddSIMD(pVectors->y, matSplat01, MaddSIMD(pVectors->x, matSplat00, matSplat03))); + resultY = MaddSIMD(pVectors->z, matSplat12, MaddSIMD(pVectors->y, matSplat11, MaddSIMD(pVectors->x, matSplat00, matSplat13))); + resultZ = MaddSIMD(pVectors->z, matSplat22, MaddSIMD(pVectors->y, matSplat21, MaddSIMD(pVectors->x, matSplat00, matSplat23))); + + pVectors->x = resultX; + pVectors->y = resultY; + pVectors->z = resultZ; + + ++pVectors; + --numVectors; + } while(numVectors > 0); + } +} + + +#endif + +// Transform many (horizontal) points in-place by a 3x4 matrix, +// here already loaded onto three fltx4 registers but not transposed. +// The points must be stored as 16-byte aligned. They are points +// and not vectors because we assume the w-component to be 1. 
#ifdef _X360
// Transforms numVectors points in place by the 3x4 matrix whose rows are
// mRow0, mRow1 and mRow2.
//
// For every point, each output component is the four-component dot product of
// the stored vector with one matrix row; only x, y and z are written back, so
// the stored w is left untouched.
//
//   pVectors    array of 16-byte-aligned points, overwritten with the results
//   numVectors  number of points in the array
//   mRow0..2    the three matrix rows (not transposed)
//
// The stream is processed in three phases: single points up to the first
// 128-byte cache-line boundary, then blocks of four cache lines (eight points
// per line) with prefetching, then any remaining points one at a time.
void TransformManyPointsBy(VectorAligned * RESTRICT pVectors, unsigned int numVectors, FLTX4 mRow0, FLTX4 mRow1, FLTX4 mRow2)
{
	/**************************************************
	 *  Here is an elaborate and carefully scheduled  *
	 *  algorithm nicked from xboxmath.inl and hacked *
	 *  up for 3x4 matrices.                          *
	 **************************************************/

	COMPILE_TIME_ASSERT(sizeof(VectorAligned) == sizeof(XMFLOAT4)); // VectorAligned's need to be 16 bytes

	XMVECTOR R0[8], R1[8], R2[8];
	XMVECTOR vIn[8];

	// C_ASSERT(UnrollCount == 8);
	// C_ASSERT(sizeof(XMFLOAT4) == 16);
	Assert(pVectors);
	// The aligned vector loads below need the full 16-byte alignment the
	// interface promises, not merely 4-byte alignment.
	Assert(((UINT_PTR)pVectors & 15) == 0); // assert alignment

	VectorAligned * RESTRICT vCurrent = pVectors;
	// sentinel pointers
	VectorAligned * vStreamEnd, *vStreamGroupBase, *vStreamGroupEnd;

	{
		// cook up the pointers from integer math. Necessary because otherwise we LHS all over
		// the place. (Odd that this doesn't happen to the xbox math.)

		UINT_PTR InputVector = (UINT_PTR)pVectors;
		UINT_PTR InputStreamEnd = InputVector + numVectors * sizeof(XMFLOAT4);
		// compute start and end points on 128-byte alignment:
		// the base is the first cache-line boundary at or after the start
		// (clamped by the end of the stream), and the group end is the base
		// plus as many whole four-cache-line blocks as fit before the end.
		UINT_PTR InputStreamCGroupBase = XMMin(InputVector + (XM_CACHE_LINE_SIZE - 1), InputStreamEnd) & ~(XM_CACHE_LINE_SIZE - 1);
		UINT_PTR InputStreamCGroupEnd = InputStreamCGroupBase + ((InputStreamEnd - InputStreamCGroupBase) & ~(4 * XM_CACHE_LINE_SIZE - 1));

		vStreamEnd = (VectorAligned *)InputStreamEnd;
		vStreamGroupBase = (VectorAligned *)InputStreamCGroupBase;
		vStreamGroupEnd = (VectorAligned *)InputStreamCGroupEnd;
	}


	// prefetch the first block of four cache lines
	__dcbt(0,                      vStreamGroupBase);
	__dcbt(XM_CACHE_LINE_SIZE,     vStreamGroupBase);
	__dcbt(XM_CACHE_LINE_SIZE * 2, vStreamGroupBase);
	__dcbt(XM_CACHE_LINE_SIZE * 3, vStreamGroupBase);

	// Phase 1: points before the first cache-line boundary, one at a time.
	while (vCurrent < vStreamGroupBase)
	{
		fltx4 vec = __lvx(vCurrent->Base(), 0);

		R0[0] = __vmsum4fp(vec, mRow0);
		R1[0] = __vmsum4fp(vec, mRow1);
		R2[0] = __vmsum4fp(vec, mRow2);

		// store one float each at byte offsets 0, 4 and 8 (x, y, z)
		__stvewx(R0[0], vCurrent->Base(), 0);
		__stvewx(R1[0], vCurrent->Base(), 4);
		__stvewx(R2[0], vCurrent->Base(), 8);

		vCurrent++;
	}

	// Phase 2: whole blocks of four cache lines.
	while (vCurrent < vStreamGroupEnd)
	{
		// prefetch the block after this one
		__dcbt(XM_CACHE_LINE_SIZE * 4, vCurrent);
		__dcbt(XM_CACHE_LINE_SIZE * 5, vCurrent);
		__dcbt(XM_CACHE_LINE_SIZE * 6, vCurrent);
		__dcbt(XM_CACHE_LINE_SIZE * 7, vCurrent);

		// one cache line (eight points) per pass
		for (UINT GroupIndex = 0; GroupIndex < 4; GroupIndex++)
		{
			// all kinds of LHS on this pointer. Why?
			VectorAligned* OutputVector = vCurrent;

			// load all eight points first...
			vIn[0] = __lvx(vCurrent->Base(), 0);
			vCurrent++;
			vIn[1] = __lvx(vCurrent->Base(), 0);
			vCurrent++;
			vIn[2] = __lvx(vCurrent->Base(), 0);
			vCurrent++;
			vIn[3] = __lvx(vCurrent->Base(), 0);
			vCurrent++;
			vIn[4] = __lvx(vCurrent->Base(), 0);
			vCurrent++;
			vIn[5] = __lvx(vCurrent->Base(), 0);
			vCurrent++;
			vIn[6] = __lvx(vCurrent->Base(), 0);
			vCurrent++;
			vIn[7] = __lvx(vCurrent->Base(), 0);
			vCurrent++;

			// ...then do every dot product...
			R0[0] = __vmsum4fp(vIn[0], mRow0);
			R1[0] = __vmsum4fp(vIn[0], mRow1);
			R2[0] = __vmsum4fp(vIn[0], mRow2);

			R0[1] = __vmsum4fp(vIn[1], mRow0);
			R1[1] = __vmsum4fp(vIn[1], mRow1);
			R2[1] = __vmsum4fp(vIn[1], mRow2);

			R0[2] = __vmsum4fp(vIn[2], mRow0);
			R1[2] = __vmsum4fp(vIn[2], mRow1);
			R2[2] = __vmsum4fp(vIn[2], mRow2);

			R0[3] = __vmsum4fp(vIn[3], mRow0);
			R1[3] = __vmsum4fp(vIn[3], mRow1);
			R2[3] = __vmsum4fp(vIn[3], mRow2);

			R0[4] = __vmsum4fp(vIn[4], mRow0);
			R1[4] = __vmsum4fp(vIn[4], mRow1);
			R2[4] = __vmsum4fp(vIn[4], mRow2);

			R0[5] = __vmsum4fp(vIn[5], mRow0);
			R1[5] = __vmsum4fp(vIn[5], mRow1);
			R2[5] = __vmsum4fp(vIn[5], mRow2);

			R0[6] = __vmsum4fp(vIn[6], mRow0);
			R1[6] = __vmsum4fp(vIn[6], mRow1);
			R2[6] = __vmsum4fp(vIn[6], mRow2);

			R0[7] = __vmsum4fp(vIn[7], mRow0);
			R1[7] = __vmsum4fp(vIn[7], mRow1);
			R2[7] = __vmsum4fp(vIn[7], mRow2);

			// ...and finally write x, y, z of each point back in place.
			__stvewx(R0[0], OutputVector, 0);
			__stvewx(R1[0], OutputVector, 4);
			__stvewx(R2[0], OutputVector, 8);
			OutputVector++;

			__stvewx(R0[1], OutputVector, 0);
			__stvewx(R1[1], OutputVector, 4);
			__stvewx(R2[1], OutputVector, 8);
			OutputVector++;

			__stvewx(R0[2], OutputVector, 0);
			__stvewx(R1[2], OutputVector, 4);
			__stvewx(R2[2], OutputVector, 8);
			OutputVector++;

			__stvewx(R0[3], OutputVector, 0);
			__stvewx(R1[3], OutputVector, 4);
			__stvewx(R2[3], OutputVector, 8);
			OutputVector++;

			__stvewx(R0[4], OutputVector, 0);
			__stvewx(R1[4], OutputVector, 4);
			__stvewx(R2[4], OutputVector, 8);
			OutputVector++;

			__stvewx(R0[5], OutputVector, 0);
			__stvewx(R1[5], OutputVector, 4);
			__stvewx(R2[5], OutputVector, 8);
			OutputVector++;

			__stvewx(R0[6], OutputVector, 0);
			__stvewx(R1[6], OutputVector, 4);
			__stvewx(R2[6], OutputVector, 8);
			OutputVector++;

			__stvewx(R0[7], OutputVector, 0);
			__stvewx(R1[7], OutputVector, 4);
			__stvewx(R2[7], OutputVector, 8);
			OutputVector++;
		}
	}

	// Phase 3: whatever is left after the last whole block, one at a time.
	while (vCurrent < vStreamEnd)
	{
		vIn[0] = __lvx(vCurrent->Base(), 0);

		R0[0] = __vmsum4fp(vIn[0], mRow0);
		R1[0] = __vmsum4fp(vIn[0], mRow1);
		R2[0] = __vmsum4fp(vIn[0], mRow2);

		__stvewx(R0[0], vCurrent->Base(), 0);
		__stvewx(R1[0], vCurrent->Base(), 4);
		__stvewx(R2[0], vCurrent->Base(), 8);

		vCurrent++;
	}


}
#endif