diff options
| author | Jørgen P. Tjernø <[email protected]> | 2013-12-02 19:31:46 -0800 |
|---|---|---|
| committer | Jørgen P. Tjernø <[email protected]> | 2013-12-02 19:46:31 -0800 |
| commit | f56bb35301836e56582a575a75864392a0177875 (patch) | |
| tree | de61ddd39de3e7df52759711950b4c288592f0dc /mp/src/public/mathlib/ssemath.h | |
| parent | Mark some more files as text. (diff) | |
| download | source-sdk-2013-f56bb35301836e56582a575a75864392a0177875.tar.xz source-sdk-2013-f56bb35301836e56582a575a75864392a0177875.zip | |
Fix line endings. WHAMMY.
Diffstat (limited to 'mp/src/public/mathlib/ssemath.h')
| -rw-r--r-- | mp/src/public/mathlib/ssemath.h | 6196 |
1 files changed, 3098 insertions, 3098 deletions
diff --git a/mp/src/public/mathlib/ssemath.h b/mp/src/public/mathlib/ssemath.h index b25fbd09..6691df12 100644 --- a/mp/src/public/mathlib/ssemath.h +++ b/mp/src/public/mathlib/ssemath.h @@ -1,3098 +1,3098 @@ -//========= Copyright Valve Corporation, All rights reserved. ============//
-//
-// Purpose: - defines SIMD "structure of arrays" classes and functions.
-//
-//===========================================================================//
-#ifndef SSEMATH_H
-#define SSEMATH_H
-
-#if defined( _X360 )
-#include <xboxmath.h>
-#else
-#include <xmmintrin.h>
-#endif
-
-#include <mathlib/vector.h>
-#include <mathlib/mathlib.h>
-
-#if defined(GNUC)
-#define USE_STDC_FOR_SIMD 0
-#else
-#define USE_STDC_FOR_SIMD 0
-#endif
-
-#if (!defined(_X360) && (USE_STDC_FOR_SIMD == 0))
-#define _SSE1 1
-#endif
-
-// I thought about defining a class/union for the SIMD packed floats instead of using fltx4,
-// but decided against it because (a) the nature of SIMD code which includes comparisons is to blur
-// the relationship between packed floats and packed integer types and (b) not sure that the
-// compiler would handle generating good code for the intrinsics.
-
-#if USE_STDC_FOR_SIMD
-
-typedef union
-{
- float m128_f32[4];
- uint32 m128_u32[4];
-} fltx4;
-
-typedef fltx4 i32x4;
-typedef fltx4 u32x4;
-
-#elif ( defined( _X360 ) )
-
-typedef union
-{
- // This union allows float/int access (which generally shouldn't be done in inner loops)
- __vector4 vmx;
- float m128_f32[4];
- uint32 m128_u32[4];
-} fltx4_union;
-
-typedef __vector4 fltx4;
-typedef __vector4 i32x4; // a VMX register; just a way of making it explicit that we're doing integer ops.
-typedef __vector4 u32x4; // a VMX register; just a way of making it explicit that we're doing unsigned integer ops.
-
-#else
-
-typedef __m128 fltx4;
-typedef __m128 i32x4;
-typedef __m128 u32x4;
-
-#endif
-
-// The FLTX4 type is a fltx4 used as a parameter to a function.
-// On the 360, the best way to do this is pass-by-copy on the registers.
-// On the PC, the best way is to pass by const reference.
-// The compiler will sometimes, but not always, replace a pass-by-const-ref
-// with a pass-in-reg on the 360; to avoid this confusion, you can
-// explicitly use a FLTX4 as the parameter type.
-#ifdef _X360
-typedef __vector4 FLTX4;
-#else
-typedef const fltx4 & FLTX4;
-#endif
-
-// A 16-byte aligned int32 datastructure
-// (for use when writing out fltx4's as SIGNED
-// ints).
-struct ALIGN16 intx4
-{
- // The four signed 32-bit lanes, stored contiguously.
- int32 m_i32[4];
-
- // Mutable access to lane 'which'. No bounds checking is performed.
- inline int & operator[](int which)
- {
- return m_i32[which];
- }
-
- // Read-only access to lane 'which'. No bounds checking is performed.
- inline const int & operator[](int which) const
- {
- return m_i32[which];
- }
-
- // Pointer to the first lane, for passing to routines that take raw int32 storage.
- inline int32 *Base() {
- return m_i32;
- }
-
- inline const int32 *Base() const
- {
- return m_i32;
- }
-
- // True when all four lanes compare equal.
- // Returns plain bool: a top-level const on a by-value return type has no
- // effect and only provokes "qualifier ignored" warnings.
- inline bool operator==(const intx4 &other) const
- {
- return m_i32[0] == other.m_i32[0] &&
- m_i32[1] == other.m_i32[1] &&
- m_i32[2] == other.m_i32[2] &&
- m_i32[3] == other.m_i32[3] ;
- }
-} ALIGN16_POST;
-
-
-#if defined( _DEBUG ) && defined( _X360 )
-FORCEINLINE void TestVPUFlags()
-{
- // Check that the VPU is in the appropriate (Java-compliant) mode (see 3.2.1 in altivec_pem.pdf on xds.xbox.com)
- __vector4 a;
- __asm
- {
- mfvscr a;
- }
- unsigned int * flags = (unsigned int *)&a;
- unsigned int controlWord = flags[3];
- Assert(controlWord == 0);
-}
-#else // _DEBUG
-FORCEINLINE void TestVPUFlags() {}
-#endif // _DEBUG
-
-
-// useful constants in SIMD packed float format:
-// (note: some of these aren't stored on the 360,
-// but are manufactured directly in one or two
-// instructions, saving a load and possible L2
-// miss.)
-#ifndef _X360
-extern const fltx4 Four_Zeros; // 0 0 0 0
-extern const fltx4 Four_Ones; // 1 1 1 1
-extern const fltx4 Four_Twos; // 2 2 2 2
-extern const fltx4 Four_Threes; // 3 3 3 3
-extern const fltx4 Four_Fours; // guess.
-extern const fltx4 Four_Point225s; // .225 .225 .225 .225
-extern const fltx4 Four_PointFives; // .5 .5 .5 .5
-extern const fltx4 Four_Epsilons; // FLT_EPSILON FLT_EPSILON FLT_EPSILON FLT_EPSILON
-extern const fltx4 Four_2ToThe21s; // (1<<21)..
-extern const fltx4 Four_2ToThe22s; // (1<<22)..
-extern const fltx4 Four_2ToThe23s; // (1<<23)..
-extern const fltx4 Four_2ToThe24s; // (1<<24)..
-extern const fltx4 Four_Origin; // 0 0 0 1 (origin point, like vr0 on the PS2)
-extern const fltx4 Four_NegativeOnes; // -1 -1 -1 -1
-#else
-#define Four_Zeros XMVectorZero() // 0 0 0 0
-#define Four_Ones XMVectorSplatOne() // 1 1 1 1
-extern const fltx4 Four_Twos; // 2 2 2 2
-extern const fltx4 Four_Threes; // 3 3 3 3
-extern const fltx4 Four_Fours; // guess.
-extern const fltx4 Four_Point225s; // .225 .225 .225 .225
-extern const fltx4 Four_PointFives; // .5 .5 .5 .5
-extern const fltx4 Four_Epsilons; // FLT_EPSILON FLT_EPSILON FLT_EPSILON FLT_EPSILON
-extern const fltx4 Four_2ToThe21s; // (1<<21)..
-extern const fltx4 Four_2ToThe22s; // (1<<22)..
-extern const fltx4 Four_2ToThe23s; // (1<<23)..
-extern const fltx4 Four_2ToThe24s; // (1<<24)..
-extern const fltx4 Four_Origin; // 0 0 0 1 (origin point, like vr0 on the PS2)
-extern const fltx4 Four_NegativeOnes; // -1 -1 -1 -1
-#endif
-extern const fltx4 Four_FLT_MAX; // FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX
-extern const fltx4 Four_Negative_FLT_MAX; // -FLT_MAX, -FLT_MAX, -FLT_MAX, -FLT_MAX
-extern const fltx4 g_SIMD_0123; // 0 1 2 3 as float
-
-// external aligned integer constants
-extern const ALIGN16 int32 g_SIMD_clear_signmask[] ALIGN16_POST; // 0x7fffffff x 4
-extern const ALIGN16 int32 g_SIMD_signmask[] ALIGN16_POST; // 0x80000000 x 4
-extern const ALIGN16 int32 g_SIMD_lsbmask[] ALIGN16_POST; // 0xfffffffe x 4
-extern const ALIGN16 int32 g_SIMD_clear_wmask[] ALIGN16_POST; // -1 -1 -1 0
-extern const ALIGN16 int32 g_SIMD_ComponentMask[4][4] ALIGN16_POST; // [0xFFFFFFFF 0 0 0], [0 0xFFFFFFFF 0 0], [0 0 0xFFFFFFFF 0], [0 0 0 0xFFFFFFFF]
-extern const ALIGN16 int32 g_SIMD_AllOnesMask[] ALIGN16_POST; // ~0,~0,~0,~0
-extern const ALIGN16 int32 g_SIMD_Low16BitsMask[] ALIGN16_POST; // 0xffff x 4
-
-// This mask is used for skipping the tail of things. If you have N elements in an array, and wish
-// to mask out the tail, g_SIMD_SkipTailMask[N & 3] is what you want to use for the last iteration.
-extern const int32 ALIGN16 g_SIMD_SkipTailMask[4][4] ALIGN16_POST;
-
-// Define prefetch macros.
-// The characteristics of cache and prefetch are completely
-// different between the different platforms, so you DO NOT
-// want to just define one macro that maps to every platform
-// intrinsic under the hood -- you need to prefetch at different
-// intervals between x86 and PPC, for example, and that is
-// a higher level code change.
-// On the other hand, I'm tired of typing #ifdef _X360
-// all over the place, so this is just a nop on Intel, PS3.
-#ifdef _X360
-#define PREFETCH360(address, offset) __dcbt(offset,address)
-#else
-#define PREFETCH360(x,y) // nothing
-#endif
-
-#if USE_STDC_FOR_SIMD
-
-//---------------------------------------------------------------------
-// Standard C (fallback/Linux) implementation (only there for compat - slow)
-//---------------------------------------------------------------------
-
-FORCEINLINE float SubFloat( const fltx4 & a, int idx )
-{
- return a.m128_f32[ idx ];
-}
-
-FORCEINLINE float & SubFloat( fltx4 & a, int idx )
-{
- return a.m128_f32[idx];
-}
-
-FORCEINLINE uint32 SubInt( const fltx4 & a, int idx )
-{
- return a.m128_u32[idx];
-}
-
-FORCEINLINE uint32 & SubInt( fltx4 & a, int idx )
-{
- return a.m128_u32[idx];
-}
-
-// Return zero in the fastest way -- on the x360, faster even than loading.
-FORCEINLINE fltx4 LoadZeroSIMD( void )
-{
- return Four_Zeros;
-}
-
-// Return one in the fastest way -- on the x360, faster even than loading.
-FORCEINLINE fltx4 LoadOneSIMD( void )
-{
- return Four_Ones;
-}
-
-FORCEINLINE fltx4 SplatXSIMD( const fltx4 & a )
-{
- fltx4 retVal;
- SubFloat( retVal, 0 ) = SubFloat( a, 0 );
- SubFloat( retVal, 1 ) = SubFloat( a, 0 );
- SubFloat( retVal, 2 ) = SubFloat( a, 0 );
- SubFloat( retVal, 3 ) = SubFloat( a, 0 );
- return retVal;
-}
-
-FORCEINLINE fltx4 SplatYSIMD( fltx4 a )
-{
- fltx4 retVal;
- SubFloat( retVal, 0 ) = SubFloat( a, 1 );
- SubFloat( retVal, 1 ) = SubFloat( a, 1 );
- SubFloat( retVal, 2 ) = SubFloat( a, 1 );
- SubFloat( retVal, 3 ) = SubFloat( a, 1 );
- return retVal;
-}
-
-FORCEINLINE fltx4 SplatZSIMD( fltx4 a )
-{
- fltx4 retVal;
- SubFloat( retVal, 0 ) = SubFloat( a, 2 );
- SubFloat( retVal, 1 ) = SubFloat( a, 2 );
- SubFloat( retVal, 2 ) = SubFloat( a, 2 );
- SubFloat( retVal, 3 ) = SubFloat( a, 2 );
- return retVal;
-}
-
-FORCEINLINE fltx4 SplatWSIMD( fltx4 a )
-{
- fltx4 retVal;
- SubFloat( retVal, 0 ) = SubFloat( a, 3 );
- SubFloat( retVal, 1 ) = SubFloat( a, 3 );
- SubFloat( retVal, 2 ) = SubFloat( a, 3 );
- SubFloat( retVal, 3 ) = SubFloat( a, 3 );
- return retVal;
-}
-
-FORCEINLINE fltx4 SetXSIMD( const fltx4& a, const fltx4& x )
-{
- fltx4 result = a;
- SubFloat( result, 0 ) = SubFloat( x, 0 );
- return result;
-}
-
-FORCEINLINE fltx4 SetYSIMD( const fltx4& a, const fltx4& y )
-{
- fltx4 result = a;
- SubFloat( result, 1 ) = SubFloat( y, 1 );
- return result;
-}
-
-FORCEINLINE fltx4 SetZSIMD( const fltx4& a, const fltx4& z )
-{
- fltx4 result = a;
- SubFloat( result, 2 ) = SubFloat( z, 2 );
- return result;
-}
-
-FORCEINLINE fltx4 SetWSIMD( const fltx4& a, const fltx4& w )
-{
- fltx4 result = a;
- SubFloat( result, 3 ) = SubFloat( w, 3 );
- return result;
-}
-
-FORCEINLINE fltx4 SetComponentSIMD( const fltx4& a, int nComponent, float flValue )
-{
- fltx4 result = a;
- SubFloat( result, nComponent ) = flValue;
- return result;
-}
-
-// a b c d -> b c d a
-FORCEINLINE fltx4 RotateLeft( const fltx4 & a )
-{
- fltx4 retVal;
- SubFloat( retVal, 0 ) = SubFloat( a, 1 );
- SubFloat( retVal, 1 ) = SubFloat( a, 2 );
- SubFloat( retVal, 2 ) = SubFloat( a, 3 );
- SubFloat( retVal, 3 ) = SubFloat( a, 0 );
- return retVal;
-}
-
-// a b c d -> c d a b
-FORCEINLINE fltx4 RotateLeft2( const fltx4 & a )
-{
- fltx4 retVal;
- SubFloat( retVal, 0 ) = SubFloat( a, 2 );
- SubFloat( retVal, 1 ) = SubFloat( a, 3 );
- SubFloat( retVal, 2 ) = SubFloat( a, 0 );
- SubFloat( retVal, 3 ) = SubFloat( a, 1 );
- return retVal;
-}
-
-#define BINOP(op) \
- fltx4 retVal; \
- SubFloat( retVal, 0 ) = ( SubFloat( a, 0 ) op SubFloat( b, 0 ) ); \
- SubFloat( retVal, 1 ) = ( SubFloat( a, 1 ) op SubFloat( b, 1 ) ); \
- SubFloat( retVal, 2 ) = ( SubFloat( a, 2 ) op SubFloat( b, 2 ) ); \
- SubFloat( retVal, 3 ) = ( SubFloat( a, 3 ) op SubFloat( b, 3 ) ); \
- return retVal;
-
-#define IBINOP(op) \
- fltx4 retVal; \
- SubInt( retVal, 0 ) = ( SubInt( a, 0 ) op SubInt ( b, 0 ) ); \
- SubInt( retVal, 1 ) = ( SubInt( a, 1 ) op SubInt ( b, 1 ) ); \
- SubInt( retVal, 2 ) = ( SubInt( a, 2 ) op SubInt ( b, 2 ) ); \
- SubInt( retVal, 3 ) = ( SubInt( a, 3 ) op SubInt ( b, 3 ) ); \
- return retVal;
-
-FORCEINLINE fltx4 AddSIMD( const fltx4 & a, const fltx4 & b )
-{
- BINOP(+);
-}
-
-// Component-wise subtraction: each lane of the result is a - b for that lane.
-FORCEINLINE fltx4 SubSIMD( const fltx4 & a, const fltx4 & b ) // a-b
-{
- BINOP(-);
-}
-
-FORCEINLINE fltx4 MulSIMD( const fltx4 & a, const fltx4 & b ) // a*b
-{
- BINOP(*);
-}
-
-FORCEINLINE fltx4 DivSIMD( const fltx4 & a, const fltx4 & b ) // a/b
-{
- BINOP(/);
-}
-
-
-FORCEINLINE fltx4 MaddSIMD( const fltx4 & a, const fltx4 & b, const fltx4 & c ) // a*b + c
-{
- return AddSIMD( MulSIMD(a,b), c );
-}
-
-// Negated multiply-subtract: each lane of the result is c - a*b for that lane.
-FORCEINLINE fltx4 MsubSIMD( const fltx4 & a, const fltx4 & b, const fltx4 & c ) // c - a*b
-{
- return SubSIMD( c, MulSIMD(a,b) );
-}
-
-
-FORCEINLINE fltx4 SinSIMD( const fltx4 &radians )
-{
- fltx4 result;
- SubFloat( result, 0 ) = sin( SubFloat( radians, 0 ) );
- SubFloat( result, 1 ) = sin( SubFloat( radians, 1 ) );
- SubFloat( result, 2 ) = sin( SubFloat( radians, 2 ) );
- SubFloat( result, 3 ) = sin( SubFloat( radians, 3 ) );
- return result;
-}
-
-FORCEINLINE void SinCos3SIMD( fltx4 &sine, fltx4 &cosine, const fltx4 &radians )
-{
- SinCos( SubFloat( radians, 0 ), &SubFloat( sine, 0 ), &SubFloat( cosine, 0 ) );
- SinCos( SubFloat( radians, 1 ), &SubFloat( sine, 1 ), &SubFloat( cosine, 1 ) );
- SinCos( SubFloat( radians, 2 ), &SubFloat( sine, 2 ), &SubFloat( cosine, 2 ) );
-}
-
-FORCEINLINE void SinCosSIMD( fltx4 &sine, fltx4 &cosine, const fltx4 &radians )
-{
- SinCos( SubFloat( radians, 0 ), &SubFloat( sine, 0 ), &SubFloat( cosine, 0 ) );
- SinCos( SubFloat( radians, 1 ), &SubFloat( sine, 1 ), &SubFloat( cosine, 1 ) );
- SinCos( SubFloat( radians, 2 ), &SubFloat( sine, 2 ), &SubFloat( cosine, 2 ) );
- SinCos( SubFloat( radians, 3 ), &SubFloat( sine, 3 ), &SubFloat( cosine, 3 ) );
-}
-
-FORCEINLINE fltx4 ArcSinSIMD( const fltx4 &sine )
-{
- fltx4 result;
- SubFloat( result, 0 ) = asin( SubFloat( sine, 0 ) );
- SubFloat( result, 1 ) = asin( SubFloat( sine, 1 ) );
- SubFloat( result, 2 ) = asin( SubFloat( sine, 2 ) );
- SubFloat( result, 3 ) = asin( SubFloat( sine, 3 ) );
- return result;
-}
-
-FORCEINLINE fltx4 ArcCosSIMD( const fltx4 &cs )
-{
- fltx4 result;
- SubFloat( result, 0 ) = acos( SubFloat( cs, 0 ) );
- SubFloat( result, 1 ) = acos( SubFloat( cs, 1 ) );
- SubFloat( result, 2 ) = acos( SubFloat( cs, 2 ) );
- SubFloat( result, 3 ) = acos( SubFloat( cs, 3 ) );
- return result;
-}
-
-// tan^-1(a/b) .. i.e., pass sin in as a and cos in as b
-FORCEINLINE fltx4 ArcTan2SIMD( const fltx4 &a, const fltx4 &b )
-{
- fltx4 result;
- SubFloat( result, 0 ) = atan2( SubFloat( a, 0 ), SubFloat( b, 0 ) );
- SubFloat( result, 1 ) = atan2( SubFloat( a, 1 ), SubFloat( b, 1 ) );
- SubFloat( result, 2 ) = atan2( SubFloat( a, 2 ), SubFloat( b, 2 ) );
- SubFloat( result, 3 ) = atan2( SubFloat( a, 3 ), SubFloat( b, 3 ) );
- return result;
-}
-
-FORCEINLINE fltx4 MaxSIMD( const fltx4 & a, const fltx4 & b ) // max(a,b)
-{
- fltx4 retVal;
- SubFloat( retVal, 0 ) = max( SubFloat( a, 0 ), SubFloat( b, 0 ) );
- SubFloat( retVal, 1 ) = max( SubFloat( a, 1 ), SubFloat( b, 1 ) );
- SubFloat( retVal, 2 ) = max( SubFloat( a, 2 ), SubFloat( b, 2 ) );
- SubFloat( retVal, 3 ) = max( SubFloat( a, 3 ), SubFloat( b, 3 ) );
- return retVal;
-}
-
-FORCEINLINE fltx4 MinSIMD( const fltx4 & a, const fltx4 & b ) // min(a,b)
-{
- fltx4 retVal;
- SubFloat( retVal, 0 ) = min( SubFloat( a, 0 ), SubFloat( b, 0 ) );
- SubFloat( retVal, 1 ) = min( SubFloat( a, 1 ), SubFloat( b, 1 ) );
- SubFloat( retVal, 2 ) = min( SubFloat( a, 2 ), SubFloat( b, 2 ) );
- SubFloat( retVal, 3 ) = min( SubFloat( a, 3 ), SubFloat( b, 3 ) );
- return retVal;
-}
-
-FORCEINLINE fltx4 AndSIMD( const fltx4 & a, const fltx4 & b ) // a & b
-{
- IBINOP(&);
-}
-
-FORCEINLINE fltx4 AndNotSIMD( const fltx4 & a, const fltx4 & b ) // ~a & b
-{
- fltx4 retVal;
- SubInt( retVal, 0 ) = ~SubInt( a, 0 ) & SubInt( b, 0 );
- SubInt( retVal, 1 ) = ~SubInt( a, 1 ) & SubInt( b, 1 );
- SubInt( retVal, 2 ) = ~SubInt( a, 2 ) & SubInt( b, 2 );
- SubInt( retVal, 3 ) = ~SubInt( a, 3 ) & SubInt( b, 3 );
- return retVal;
-}
-
-FORCEINLINE fltx4 XorSIMD( const fltx4 & a, const fltx4 & b ) // a ^ b
-{
- IBINOP(^);
-}
-
-FORCEINLINE fltx4 OrSIMD( const fltx4 & a, const fltx4 & b ) // a | b
-{
- IBINOP(|);
-}
-
-FORCEINLINE fltx4 NegSIMD(const fltx4 &a) // negate: -a
-{
- fltx4 retval;
- SubFloat( retval, 0 ) = -SubFloat( a, 0 );
- SubFloat( retval, 1 ) = -SubFloat( a, 1 );
- SubFloat( retval, 2 ) = -SubFloat( a, 2 );
- SubFloat( retval, 3 ) = -SubFloat( a, 3 );
-
- return retval;
-}
-
-FORCEINLINE bool IsAllZeros( const fltx4 & a ) // all floats of a zero?
-{
- return ( SubFloat( a, 0 ) == 0.0 ) &&
- ( SubFloat( a, 1 ) == 0.0 ) &&
- ( SubFloat( a, 2 ) == 0.0 ) &&
- ( SubFloat( a, 3 ) == 0.0 ) ;
-}
-
-
-// for branching when a.xyzw > b.xyzw
-FORCEINLINE bool IsAllGreaterThan( const fltx4 &a, const fltx4 &b )
-{
- return SubFloat(a,0) > SubFloat(b,0) &&
- SubFloat(a,1) > SubFloat(b,1) &&
- SubFloat(a,2) > SubFloat(b,2) &&
- SubFloat(a,3) > SubFloat(b,3);
-}
-
-// for branching when a.xyzw >= b.xyzw
-FORCEINLINE bool IsAllGreaterThanOrEq( const fltx4 &a, const fltx4 &b )
-{
- return SubFloat(a,0) >= SubFloat(b,0) &&
- SubFloat(a,1) >= SubFloat(b,1) &&
- SubFloat(a,2) >= SubFloat(b,2) &&
- SubFloat(a,3) >= SubFloat(b,3);
-}
-
-// For branching if all a.xyzw == b.xyzw
-FORCEINLINE bool IsAllEqual( const fltx4 & a, const fltx4 & b )
-{
- return SubFloat(a,0) == SubFloat(b,0) &&
- SubFloat(a,1) == SubFloat(b,1) &&
- SubFloat(a,2) == SubFloat(b,2) &&
- SubFloat(a,3) == SubFloat(b,3);
-}
-
-FORCEINLINE int TestSignSIMD( const fltx4 & a ) // mask of which floats have the high bit set
-{
- int nRet = 0;
-
- nRet |= ( SubInt( a, 0 ) & 0x80000000 ) >> 31; // sign(x) -> bit 0
- nRet |= ( SubInt( a, 1 ) & 0x80000000 ) >> 30; // sign(y) -> bit 1
- nRet |= ( SubInt( a, 2 ) & 0x80000000 ) >> 29; // sign(z) -> bit 2
- nRet |= ( SubInt( a, 3 ) & 0x80000000 ) >> 28; // sign(w) -> bit 3
-
- return nRet;
-}
-
-FORCEINLINE bool IsAnyNegative( const fltx4 & a ) // (a.x < 0) || (a.y < 0) || (a.z < 0) || (a.w < 0)
-{
- return (0 != TestSignSIMD( a ));
-}
-
-FORCEINLINE fltx4 CmpEqSIMD( const fltx4 & a, const fltx4 & b ) // (a==b) ? ~0:0
-{
- fltx4 retVal;
- SubInt( retVal, 0 ) = ( SubFloat( a, 0 ) == SubFloat( b, 0 )) ? ~0 : 0;
- SubInt( retVal, 1 ) = ( SubFloat( a, 1 ) == SubFloat( b, 1 )) ? ~0 : 0;
- SubInt( retVal, 2 ) = ( SubFloat( a, 2 ) == SubFloat( b, 2 )) ? ~0 : 0;
- SubInt( retVal, 3 ) = ( SubFloat( a, 3 ) == SubFloat( b, 3 )) ? ~0 : 0;
- return retVal;
-}
-
-FORCEINLINE fltx4 CmpGtSIMD( const fltx4 & a, const fltx4 & b ) // (a>b) ? ~0:0
-{
- fltx4 retVal;
- SubInt( retVal, 0 ) = ( SubFloat( a, 0 ) > SubFloat( b, 0 )) ? ~0 : 0;
- SubInt( retVal, 1 ) = ( SubFloat( a, 1 ) > SubFloat( b, 1 )) ? ~0 : 0;
- SubInt( retVal, 2 ) = ( SubFloat( a, 2 ) > SubFloat( b, 2 )) ? ~0 : 0;
- SubInt( retVal, 3 ) = ( SubFloat( a, 3 ) > SubFloat( b, 3 )) ? ~0 : 0;
- return retVal;
-}
-
-FORCEINLINE fltx4 CmpGeSIMD( const fltx4 & a, const fltx4 & b ) // (a>=b) ? ~0:0
-{
- fltx4 retVal;
- SubInt( retVal, 0 ) = ( SubFloat( a, 0 ) >= SubFloat( b, 0 )) ? ~0 : 0;
- SubInt( retVal, 1 ) = ( SubFloat( a, 1 ) >= SubFloat( b, 1 )) ? ~0 : 0;
- SubInt( retVal, 2 ) = ( SubFloat( a, 2 ) >= SubFloat( b, 2 )) ? ~0 : 0;
- SubInt( retVal, 3 ) = ( SubFloat( a, 3 ) >= SubFloat( b, 3 )) ? ~0 : 0;
- return retVal;
-}
-
-FORCEINLINE fltx4 CmpLtSIMD( const fltx4 & a, const fltx4 & b ) // (a<b) ? ~0:0
-{
- fltx4 retVal;
- SubInt( retVal, 0 ) = ( SubFloat( a, 0 ) < SubFloat( b, 0 )) ? ~0 : 0;
- SubInt( retVal, 1 ) = ( SubFloat( a, 1 ) < SubFloat( b, 1 )) ? ~0 : 0;
- SubInt( retVal, 2 ) = ( SubFloat( a, 2 ) < SubFloat( b, 2 )) ? ~0 : 0;
- SubInt( retVal, 3 ) = ( SubFloat( a, 3 ) < SubFloat( b, 3 )) ? ~0 : 0;
- return retVal;
-}
-
-FORCEINLINE fltx4 CmpLeSIMD( const fltx4 & a, const fltx4 & b ) // (a<=b) ? ~0:0
-{
- fltx4 retVal;
- SubInt( retVal, 0 ) = ( SubFloat( a, 0 ) <= SubFloat( b, 0 )) ? ~0 : 0;
- SubInt( retVal, 1 ) = ( SubFloat( a, 1 ) <= SubFloat( b, 1 )) ? ~0 : 0;
- SubInt( retVal, 2 ) = ( SubFloat( a, 2 ) <= SubFloat( b, 2 )) ? ~0 : 0;
- SubInt( retVal, 3 ) = ( SubFloat( a, 3 ) <= SubFloat( b, 3 )) ? ~0 : 0;
- return retVal;
-}
-
-FORCEINLINE fltx4 CmpInBoundsSIMD( const fltx4 & a, const fltx4 & b ) // (a <= b && a >= -b) ? ~0 : 0
-{
- fltx4 retVal;
- SubInt( retVal, 0 ) = ( SubFloat( a, 0 ) <= SubFloat( b, 0 ) && SubFloat( a, 0 ) >= -SubFloat( b, 0 ) ) ? ~0 : 0;
- SubInt( retVal, 1 ) = ( SubFloat( a, 1 ) <= SubFloat( b, 1 ) && SubFloat( a, 1 ) >= -SubFloat( b, 1 ) ) ? ~0 : 0;
- SubInt( retVal, 2 ) = ( SubFloat( a, 2 ) <= SubFloat( b, 2 ) && SubFloat( a, 2 ) >= -SubFloat( b, 2 ) ) ? ~0 : 0;
- SubInt( retVal, 3 ) = ( SubFloat( a, 3 ) <= SubFloat( b, 3 ) && SubFloat( a, 3 ) >= -SubFloat( b, 3 ) ) ? ~0 : 0;
- return retVal;
-}
-
-
-FORCEINLINE fltx4 MaskedAssign( const fltx4 & ReplacementMask, const fltx4 & NewValue, const fltx4 & OldValue )
-{
- return OrSIMD(
- AndSIMD( ReplacementMask, NewValue ),
- AndNotSIMD( ReplacementMask, OldValue ) );
-}
-
-FORCEINLINE fltx4 ReplicateX4( float flValue ) // a,a,a,a
-{
- fltx4 retVal;
- SubFloat( retVal, 0 ) = flValue;
- SubFloat( retVal, 1 ) = flValue;
- SubFloat( retVal, 2 ) = flValue;
- SubFloat( retVal, 3 ) = flValue;
- return retVal;
-}
-
-/// replicate a single 32 bit integer value to all 4 components of an m128
-FORCEINLINE fltx4 ReplicateIX4( int nValue )
-{
- fltx4 retVal;
- SubInt( retVal, 0 ) = nValue;
- SubInt( retVal, 1 ) = nValue;
- SubInt( retVal, 2 ) = nValue;
- SubInt( retVal, 3 ) = nValue;
- return retVal;
-
-}
-
-// Round towards positive infinity
-FORCEINLINE fltx4 CeilSIMD( const fltx4 &a )
-{
- fltx4 retVal;
- SubFloat( retVal, 0 ) = ceil( SubFloat( a, 0 ) );
- SubFloat( retVal, 1 ) = ceil( SubFloat( a, 1 ) );
- SubFloat( retVal, 2 ) = ceil( SubFloat( a, 2 ) );
- SubFloat( retVal, 3 ) = ceil( SubFloat( a, 3 ) );
- return retVal;
-
-}
-
-// Round towards negative infinity
-FORCEINLINE fltx4 FloorSIMD( const fltx4 &a )
-{
- fltx4 retVal;
- SubFloat( retVal, 0 ) = floor( SubFloat( a, 0 ) );
- SubFloat( retVal, 1 ) = floor( SubFloat( a, 1 ) );
- SubFloat( retVal, 2 ) = floor( SubFloat( a, 2 ) );
- SubFloat( retVal, 3 ) = floor( SubFloat( a, 3 ) );
- return retVal;
-
-}
-
-FORCEINLINE fltx4 SqrtEstSIMD( const fltx4 & a ) // sqrt(a), more or less
-{
- fltx4 retVal;
- SubFloat( retVal, 0 ) = sqrt( SubFloat( a, 0 ) );
- SubFloat( retVal, 1 ) = sqrt( SubFloat( a, 1 ) );
- SubFloat( retVal, 2 ) = sqrt( SubFloat( a, 2 ) );
- SubFloat( retVal, 3 ) = sqrt( SubFloat( a, 3 ) );
- return retVal;
-}
-
-FORCEINLINE fltx4 SqrtSIMD( const fltx4 & a ) // sqrt(a)
-{
- fltx4 retVal;
- SubFloat( retVal, 0 ) = sqrt( SubFloat( a, 0 ) );
- SubFloat( retVal, 1 ) = sqrt( SubFloat( a, 1 ) );
- SubFloat( retVal, 2 ) = sqrt( SubFloat( a, 2 ) );
- SubFloat( retVal, 3 ) = sqrt( SubFloat( a, 3 ) );
- return retVal;
-}
-
-FORCEINLINE fltx4 ReciprocalSqrtEstSIMD( const fltx4 & a ) // 1/sqrt(a), more or less
-{
- fltx4 retVal;
- SubFloat( retVal, 0 ) = 1.0 / sqrt( SubFloat( a, 0 ) );
- SubFloat( retVal, 1 ) = 1.0 / sqrt( SubFloat( a, 1 ) );
- SubFloat( retVal, 2 ) = 1.0 / sqrt( SubFloat( a, 2 ) );
- SubFloat( retVal, 3 ) = 1.0 / sqrt( SubFloat( a, 3 ) );
- return retVal;
-}
-
-FORCEINLINE fltx4 ReciprocalSqrtEstSaturateSIMD( const fltx4 & a )
-{
- fltx4 retVal;
- SubFloat( retVal, 0 ) = 1.0 / sqrt( SubFloat( a, 0 ) != 0.0f ? SubFloat( a, 0 ) : FLT_EPSILON );
- SubFloat( retVal, 1 ) = 1.0 / sqrt( SubFloat( a, 1 ) != 0.0f ? SubFloat( a, 1 ) : FLT_EPSILON );
- SubFloat( retVal, 2 ) = 1.0 / sqrt( SubFloat( a, 2 ) != 0.0f ? SubFloat( a, 2 ) : FLT_EPSILON );
- SubFloat( retVal, 3 ) = 1.0 / sqrt( SubFloat( a, 3 ) != 0.0f ? SubFloat( a, 3 ) : FLT_EPSILON );
- return retVal;
-}
-
-FORCEINLINE fltx4 ReciprocalSqrtSIMD( const fltx4 & a ) // 1/sqrt(a)
-{
- fltx4 retVal;
- SubFloat( retVal, 0 ) = 1.0 / sqrt( SubFloat( a, 0 ) );
- SubFloat( retVal, 1 ) = 1.0 / sqrt( SubFloat( a, 1 ) );
- SubFloat( retVal, 2 ) = 1.0 / sqrt( SubFloat( a, 2 ) );
- SubFloat( retVal, 3 ) = 1.0 / sqrt( SubFloat( a, 3 ) );
- return retVal;
-}
-
-FORCEINLINE fltx4 ReciprocalEstSIMD( const fltx4 & a ) // 1/a, more or less
-{
- fltx4 retVal;
- SubFloat( retVal, 0 ) = 1.0 / SubFloat( a, 0 );
- SubFloat( retVal, 1 ) = 1.0 / SubFloat( a, 1 );
- SubFloat( retVal, 2 ) = 1.0 / SubFloat( a, 2 );
- SubFloat( retVal, 3 ) = 1.0 / SubFloat( a, 3 );
- return retVal;
-}
-
-FORCEINLINE fltx4 ReciprocalSIMD( const fltx4 & a ) // 1/a
-{
- fltx4 retVal;
- SubFloat( retVal, 0 ) = 1.0 / SubFloat( a, 0 );
- SubFloat( retVal, 1 ) = 1.0 / SubFloat( a, 1 );
- SubFloat( retVal, 2 ) = 1.0 / SubFloat( a, 2 );
- SubFloat( retVal, 3 ) = 1.0 / SubFloat( a, 3 );
- return retVal;
-}
-
-/// 1/x for all 4 values.
-/// 1/0 will result in a big but NOT infinite result
-FORCEINLINE fltx4 ReciprocalEstSaturateSIMD( const fltx4 & a )
-{
- fltx4 retVal;
- SubFloat( retVal, 0 ) = 1.0 / (SubFloat( a, 0 ) == 0.0f ? FLT_EPSILON : SubFloat( a, 0 ));
- SubFloat( retVal, 1 ) = 1.0 / (SubFloat( a, 1 ) == 0.0f ? FLT_EPSILON : SubFloat( a, 1 ));
- SubFloat( retVal, 2 ) = 1.0 / (SubFloat( a, 2 ) == 0.0f ? FLT_EPSILON : SubFloat( a, 2 ));
- SubFloat( retVal, 3 ) = 1.0 / (SubFloat( a, 3 ) == 0.0f ? FLT_EPSILON : SubFloat( a, 3 ));
- return retVal;
-}
-
-FORCEINLINE fltx4 ReciprocalSaturateSIMD( const fltx4 & a )
-{
- fltx4 retVal;
- SubFloat( retVal, 0 ) = 1.0 / (SubFloat( a, 0 ) == 0.0f ? FLT_EPSILON : SubFloat( a, 0 ));
- SubFloat( retVal, 1 ) = 1.0 / (SubFloat( a, 1 ) == 0.0f ? FLT_EPSILON : SubFloat( a, 1 ));
- SubFloat( retVal, 2 ) = 1.0 / (SubFloat( a, 2 ) == 0.0f ? FLT_EPSILON : SubFloat( a, 2 ));
- SubFloat( retVal, 3 ) = 1.0 / (SubFloat( a, 3 ) == 0.0f ? FLT_EPSILON : SubFloat( a, 3 ));
- return retVal;
-}
-
-// 2^x for all values (the antilog)
-FORCEINLINE fltx4 ExpSIMD( const fltx4 &toPower )
-{
- fltx4 retVal;
- SubFloat( retVal, 0 ) = powf( 2, SubFloat(toPower, 0) );
- SubFloat( retVal, 1 ) = powf( 2, SubFloat(toPower, 1) );
- SubFloat( retVal, 2 ) = powf( 2, SubFloat(toPower, 2) );
- SubFloat( retVal, 3 ) = powf( 2, SubFloat(toPower, 3) );
-
- return retVal;
-}
-
-FORCEINLINE fltx4 Dot3SIMD( const fltx4 &a, const fltx4 &b )
-{
- float flDot = SubFloat( a, 0 ) * SubFloat( b, 0 ) +
- SubFloat( a, 1 ) * SubFloat( b, 1 ) +
- SubFloat( a, 2 ) * SubFloat( b, 2 );
- return ReplicateX4( flDot );
-}
-
-FORCEINLINE fltx4 Dot4SIMD( const fltx4 &a, const fltx4 &b )
-{
- float flDot = SubFloat( a, 0 ) * SubFloat( b, 0 ) +
- SubFloat( a, 1 ) * SubFloat( b, 1 ) +
- SubFloat( a, 2 ) * SubFloat( b, 2 ) +
- SubFloat( a, 3 ) * SubFloat( b, 3 );
- return ReplicateX4( flDot );
-}
-
-// Clamps the components of a vector to a specified minimum and maximum range.
-FORCEINLINE fltx4 ClampVectorSIMD( FLTX4 in, FLTX4 min, FLTX4 max)
-{
- return MaxSIMD( min, MinSIMD( max, in ) );
-}
-
-// Squelch the w component of a vector to +0.0.
-// Most efficient when you say a = SetWToZeroSIMD(a) (avoids a copy)
-// Returns a copy of 'a' whose x, y and z lanes are unchanged and whose w lane is zero.
-FORCEINLINE fltx4 SetWToZeroSIMD( const fltx4 & a )
-{
- fltx4 retval;
- retval = a;
- // w lives in lane 3 (x=0, y=1, z=2, w=3); writing lane 0 here would wipe x
- // and leave w untouched.
- SubFloat( retval, 3 ) = 0;
- return retval;
-}
-
-FORCEINLINE fltx4 LoadUnalignedSIMD( const void *pSIMD )
-{
- return *( reinterpret_cast< const fltx4 *> ( pSIMD ) );
-}
-
-FORCEINLINE fltx4 LoadUnaligned3SIMD( const void *pSIMD )
-{
- return *( reinterpret_cast< const fltx4 *> ( pSIMD ) );
-}
-
-FORCEINLINE fltx4 LoadAlignedSIMD( const void *pSIMD )
-{
- return *( reinterpret_cast< const fltx4 *> ( pSIMD ) );
-}
-
-// for the transitional class -- load a 3-by VectorAligned and squash its w component
-FORCEINLINE fltx4 LoadAlignedSIMD( const VectorAligned & pSIMD )
-{
- fltx4 retval = LoadAlignedSIMD(pSIMD.Base());
- // squelch w
- SubInt( retval, 3 ) = 0;
- return retval;
-}
-
-FORCEINLINE void StoreAlignedSIMD( float *pSIMD, const fltx4 & a )
-{
- *( reinterpret_cast< fltx4 *> ( pSIMD ) ) = a;
-}
-
-FORCEINLINE void StoreUnalignedSIMD( float *pSIMD, const fltx4 & a )
-{
- *( reinterpret_cast< fltx4 *> ( pSIMD ) ) = a;
-}
-
-FORCEINLINE void StoreUnaligned3SIMD( float *pSIMD, const fltx4 & a )
-{
- *pSIMD = SubFloat(a, 0);
- *(pSIMD+1) = SubFloat(a, 1);
- *(pSIMD+2) = SubFloat(a, 2);
-}
-
-// strongly typed -- syntactic castor oil used for typechecking as we transition to SIMD
-FORCEINLINE void StoreAligned3SIMD( VectorAligned * RESTRICT pSIMD, const fltx4 & a )
-{
- StoreAlignedSIMD(pSIMD->Base(),a);
-}
-
-// Transposes, in place, the 4x4 matrix whose rows are x, y, z and w:
-// afterwards lane j of row i holds what lane i of row j held before.
-FORCEINLINE void TransposeSIMD( fltx4 & x, fltx4 & y, fltx4 & z, fltx4 & w )
-{
-#define SWAP_FLOATS( _a_, _ia_, _b_, _ib_ ) { float tmp = SubFloat( _a_, _ia_ ); SubFloat( _a_, _ia_ ) = SubFloat( _b_, _ib_ ); SubFloat( _b_, _ib_ ) = tmp; }
- // Exchange every element above the diagonal with its mirror image below it.
- SWAP_FLOATS( x, 1, y, 0 );
- SWAP_FLOATS( x, 2, z, 0 );
- SWAP_FLOATS( x, 3, w, 0 );
- SWAP_FLOATS( y, 2, z, 1 );
- SWAP_FLOATS( y, 3, w, 1 );
- SWAP_FLOATS( z, 3, w, 2 );
- // The helper is private to this function; do not leak it into every file that includes this header.
-#undef SWAP_FLOATS
-}
-
-// find the lowest component of a.x, a.y, a.z,
-// and replicate it to the whole return value.
-FORCEINLINE fltx4 FindLowestSIMD3( const fltx4 & a )
-{
- float lowest = min( min( SubFloat(a, 0), SubFloat(a, 1) ), SubFloat(a, 2));
- return ReplicateX4(lowest);
-}
-
-// find the highest component of a.x, a.y, a.z,
-// and replicate it to the whole return value.
-FORCEINLINE fltx4 FindHighestSIMD3( const fltx4 & a )
-{
- float highest = max( max( SubFloat(a, 0), SubFloat(a, 1) ), SubFloat(a, 2));
- return ReplicateX4(highest);
-}
-
-// Fixed-point conversion and save as SIGNED INTS.
-// pDest->x = Int (vSrc.x)
-// note: some architectures have means of doing
-// fixed point conversion when the fix depth is
-// specified as an immediate.. but there is no way
-// to guarantee an immediate as a parameter to function
-// like this.
-FORCEINLINE void ConvertStoreAsIntsSIMD(intx4 * RESTRICT pDest, const fltx4 &vSrc)
-{
- (*pDest)[0] = SubFloat(vSrc, 0);
- (*pDest)[1] = SubFloat(vSrc, 1);
- (*pDest)[2] = SubFloat(vSrc, 2);
- (*pDest)[3] = SubFloat(vSrc, 3);
-}
-
-// ------------------------------------
-// INTEGER SIMD OPERATIONS.
-// ------------------------------------
-// splat all components of a vector to a signed immediate int number.
-FORCEINLINE fltx4 IntSetImmediateSIMD( int nValue )
-{
- fltx4 retval;
- SubInt( retval, 0 ) = SubInt( retval, 1 ) = SubInt( retval, 2 ) = SubInt( retval, 3) = nValue;
- return retval;
-}
-
-// Load 4 aligned words into a SIMD register
-FORCEINLINE i32x4 LoadAlignedIntSIMD(const void * RESTRICT pSIMD)
-{
- return *( reinterpret_cast< const i32x4 *> ( pSIMD ) );
-}
-
-// Load 4 unaligned words into a SIMD register
-FORCEINLINE i32x4 LoadUnalignedIntSIMD( const void * RESTRICT pSIMD)
-{
- return *( reinterpret_cast< const i32x4 *> ( pSIMD ) );
-}
-
-// save into four words, 16-byte aligned
-FORCEINLINE void StoreAlignedIntSIMD( int32 *pSIMD, const fltx4 & a )
-{
- *( reinterpret_cast< i32x4 *> ( pSIMD ) ) = a;
-}
-
-FORCEINLINE void StoreAlignedIntSIMD( intx4 &pSIMD, const fltx4 & a )
-{
- *( reinterpret_cast< i32x4 *> ( pSIMD.Base() ) ) = a;
-}
-
-FORCEINLINE void StoreUnalignedIntSIMD( int32 *pSIMD, const fltx4 & a )
-{
- *( reinterpret_cast< i32x4 *> ( pSIMD ) ) = a;
-}
-
-// Take a fltx4 containing fixed-point uints and
-// return them as single precision floats. No
-// fixed point conversion is done.
-// Each lane of vSrcA is read as an unsigned 32-bit integer and converted by value to float.
-FORCEINLINE fltx4 UnsignedIntConvertToFltSIMD( const u32x4 &vSrcA )
-{
- Assert(0); /* pc has no such operation */
- fltx4 retval;
- // Convert the lanes of the argument. Reading them from 'retval' would use
- // uninitialized storage and ignore the input entirely.
- SubFloat( retval, 0 ) = ( (float) SubInt( vSrcA, 0 ) );
- SubFloat( retval, 1 ) = ( (float) SubInt( vSrcA, 1 ) );
- SubFloat( retval, 2 ) = ( (float) SubInt( vSrcA, 2 ) );
- SubFloat( retval, 3 ) = ( (float) SubInt( vSrcA, 3 ) );
- return retval;
-}
-
-
-#if 0 /* pc has no such op */
-// Take a fltx4 containing fixed-point sints and
-// return them as single precision floats. No
-// fixed point conversion is done.
-// Each lane of vSrcA is reinterpreted as a signed 32-bit integer and converted by value to float.
-FORCEINLINE fltx4 SignedIntConvertToFltSIMD( const i32x4 &vSrcA )
-{
- fltx4 retval;
- // Convert the lane's integer value, not its address, and go through SubInt
- // because the fallback union only declares m128_f32 and m128_u32.
- SubFloat( retval, 0 ) = ( (float) static_cast<int32>( SubInt( vSrcA, 0 ) ) );
- SubFloat( retval, 1 ) = ( (float) static_cast<int32>( SubInt( vSrcA, 1 ) ) );
- SubFloat( retval, 2 ) = ( (float) static_cast<int32>( SubInt( vSrcA, 2 ) ) );
- SubFloat( retval, 3 ) = ( (float) static_cast<int32>( SubInt( vSrcA, 3 ) ) );
- return retval;
-}
-
-
-/*
- works on fltx4's as if they are four uints.
- the first parameter contains the words to be shifted,
- the second contains the amount to shift by AS INTS
-
- for i = 0 to 3
- shift = vSrcB_i*32:(i*32)+4
- vReturned_i*32:(i*32)+31 = vSrcA_i*32:(i*32)+31 << shift
-*/
-// Shifts each 32-bit lane of vSrcA left by the count held in the matching lane of vSrcB.
-FORCEINLINE i32x4 IntShiftLeftWordSIMD(const i32x4 &vSrcA, const i32x4 &vSrcB)
-{
- i32x4 retval;
- // Only a five-bit shift count is meaningful for a 32-bit lane. Masking also
- // keeps the C++ shift defined, since shifting a uint32 by 32 or more is
- // undefined behaviour.
- SubInt(retval, 0) = SubInt(vSrcA, 0) << ( SubInt(vSrcB, 0) & 31 );
- SubInt(retval, 1) = SubInt(vSrcA, 1) << ( SubInt(vSrcB, 1) & 31 );
- SubInt(retval, 2) = SubInt(vSrcA, 2) << ( SubInt(vSrcB, 2) & 31 );
- SubInt(retval, 3) = SubInt(vSrcA, 3) << ( SubInt(vSrcB, 3) & 31 );
-
- return retval;
-}
-#endif
-
-#elif ( defined( _X360 ) )
-
-//---------------------------------------------------------------------
-// X360 implementation
-//---------------------------------------------------------------------
-
-// Returns a writable reference to float component idx of a, accessed through
-// the fltx4_union view of the register.
-FORCEINLINE float & FloatSIMD( fltx4 & a, int idx )
-{
- fltx4_union & a_union = (fltx4_union &)a;
- return a_union.m128_f32[idx];
-}
-
-FORCEINLINE unsigned int & UIntSIMD( fltx4 & a, int idx )
-{
- fltx4_union & a_union = (fltx4_union &)a;
- return a_union.m128_u32[idx];
-}
-
-FORCEINLINE fltx4 AddSIMD( const fltx4 & a, const fltx4 & b )
-{
- return __vaddfp( a, b );
-}
-
-FORCEINLINE fltx4 SubSIMD( const fltx4 & a, const fltx4 & b ) // a-b
-{
- return __vsubfp( a, b );
-}
-
-FORCEINLINE fltx4 MulSIMD( const fltx4 & a, const fltx4 & b ) // a*b
-{
- return __vmulfp( a, b );
-}
-
-FORCEINLINE fltx4 MaddSIMD( const fltx4 & a, const fltx4 & b, const fltx4 & c ) // a*b + c
-{
- return __vmaddfp( a, b, c );
-}
-
-FORCEINLINE fltx4 MsubSIMD( const fltx4 & a, const fltx4 & b, const fltx4 & c ) // c - a*b
-{
- fltx4 difference = __vnmsubfp( a, b, c );
- return difference;
-}
-
-FORCEINLINE fltx4 Dot3SIMD( const fltx4 &a, const fltx4 &b )
-{
- return __vmsum3fp( a, b );
-}
-
-FORCEINLINE fltx4 Dot4SIMD( const fltx4 &a, const fltx4 &b )
-{
- return __vmsum4fp( a, b );
-}
-
-FORCEINLINE fltx4 SinSIMD( const fltx4 &radians )
-{
- return XMVectorSin( radians );
-}
-
-// Writes the sine and cosine of radians into the two output registers.
-// In this implementation the body is the same call SinCosSIMD makes.
-FORCEINLINE void SinCos3SIMD( fltx4 &sine, fltx4 &cosine, const fltx4 &radians )
-{
- XMVectorSinCos( &sine, &cosine, radians );
-}
-
-FORCEINLINE void SinCosSIMD( fltx4 &sine, fltx4 &cosine, const fltx4 &radians )
-{
- XMVectorSinCos( &sine, &cosine, radians );
-}
-
-// Cosine of radians. Unlike SinSIMD, the result is delivered through the
-// cosine output parameter rather than the return value.
-FORCEINLINE void CosSIMD( fltx4 &cosine, const fltx4 &radians )
-{
- cosine = XMVectorCos( radians );
-}
-
-FORCEINLINE fltx4 ArcSinSIMD( const fltx4 &sine )
-{
- return XMVectorASin( sine );
-}
-
-FORCEINLINE fltx4 ArcCosSIMD( const fltx4 &cs )
-{
- return XMVectorACos( cs );
-}
-
-// tan^-1(a/b) .. ie, pass sin in as a and cos in as b
-FORCEINLINE fltx4 ArcTan2SIMD( const fltx4 &a, const fltx4 &b )
-{
- return XMVectorATan2( a, b );
-}
-
-// DivSIMD defined further down, since it uses ReciprocalSIMD
-
-FORCEINLINE fltx4 MaxSIMD( const fltx4 & a, const fltx4 & b ) // max(a,b)
-{
- return __vmaxfp( a, b );
-}
-
-FORCEINLINE fltx4 MinSIMD( const fltx4 & a, const fltx4 & b ) // min(a,b)
-{
- return __vminfp( a, b );
-}
-
-FORCEINLINE fltx4 AndSIMD( const fltx4 & a, const fltx4 & b ) // a & b
-{
- return __vand( a, b );
-}
-
-FORCEINLINE fltx4 AndNotSIMD( const fltx4 & a, const fltx4 & b ) // ~a & b
-{
- // NOTE: a and b are swapped in the call: SSE complements the first argument, VMX the second
- return __vandc( b, a );
-}
-
-FORCEINLINE fltx4 XorSIMD( const fltx4 & a, const fltx4 & b ) // a ^ b
-{
- return __vxor( a, b );
-}
-
-FORCEINLINE fltx4 OrSIMD( const fltx4 & a, const fltx4 & b ) // a | b
-{
- return __vor( a, b );
-}
-
-FORCEINLINE fltx4 NegSIMD(const fltx4 &a) // negate: -a
-{
- return XMVectorNegate(a);
-}
-
-FORCEINLINE bool IsAllZeros( const fltx4 & a ) // all floats of a zero?
-{
- unsigned int equalFlags = 0;
- __vcmpeqfpR( a, Four_Zeros, &equalFlags );
- return XMComparisonAllTrue( equalFlags );
-}
-
-FORCEINLINE bool IsAnyZeros( const fltx4 & a ) // any floats are zero?
-{
- unsigned int conditionregister;
- XMVectorEqualR(&conditionregister, a, XMVectorZero());
- return XMComparisonAnyTrue(conditionregister);
-}
-
-// Returns true if any of a.x, a.y, a.z compares equal to zero; a.w is
-// excluded from the test by overwriting it with a.x first.
-FORCEINLINE bool IsAnyXYZZero( const fltx4 &a ) // are any of x,y,z zero?
-{
- // copy a's x component into w, in case w was zero.
- fltx4 temp = __vrlimi(a, a, 1, 1);
- unsigned int conditionregister;
- XMVectorEqualR(&conditionregister, temp, XMVectorZero());
- return XMComparisonAnyTrue(conditionregister);
-}
-
-// for branching when a.xyzw > b.xyzw
-FORCEINLINE bool IsAllGreaterThan( const fltx4 &a, const fltx4 &b )
-{
- unsigned int cr;
- XMVectorGreaterR(&cr,a,b);
- return XMComparisonAllTrue(cr);
-}
-
-// for branching when a.xyzw >= b.xyzw
-FORCEINLINE bool IsAllGreaterThanOrEq( const fltx4 &a, const fltx4 &b )
-{
- unsigned int cr;
- XMVectorGreaterOrEqualR(&cr,a,b);
- return XMComparisonAllTrue(cr);
-}
-
-// For branching if all a.xyzw == b.xyzw
-FORCEINLINE bool IsAllEqual( const fltx4 & a, const fltx4 & b )
-{
- unsigned int cr;
- XMVectorEqualR(&cr,a,b);
- return XMComparisonAllTrue(cr);
-}
-
-
-FORCEINLINE int TestSignSIMD( const fltx4 & a ) // mask of which floats have the high bit set
-{
- // NOTE: this maps to SSE way better than it does to VMX (most code uses IsAnyNegative(), though)
- const fltx4_union & words = (const fltx4_union &)a;
-
- int nSignMask = 0;
- for ( int i = 0; i < 4; ++i )
- {
-  // move the sign bit of component i down to bit i of the mask
-  nSignMask |= ( words.m128_u32[i] & 0x80000000 ) >> ( 31 - i );
- }
-
- return nSignMask;
-}
-
-// Squelch the w component of a vector to +0.0.
-// Most efficient when you say a = SetWToZeroSIMD(a) (avoids a copy)
-FORCEINLINE fltx4 SetWToZeroSIMD( const fltx4 & a )
-{
- return __vrlimi( a, __vzero(), 1, 0 );
-}
-
-// Returns true if the sign bit is set in at least one component of a.
-FORCEINLINE bool IsAnyNegative( const fltx4 & a ) // (a.x < 0) || (a.y < 0) || (a.z < 0) || (a.w < 0)
-{
- // NOTE: this tests the top bits of each vector element using integer math
- // (so it ignores NaNs - it will return true for "-NaN")
- unsigned int equalFlags = 0;
- fltx4 signMask = __vspltisw( -1 ); // 0xFFFFFFFF 0xFFFFFFFF 0xFFFFFFFF 0xFFFFFFFF (low order 5 bits of each element = 31)
- signMask = __vslw( signMask, signMask ); // 0x80000000 0x80000000 0x80000000 0x80000000
- // compare (a & signMask) against zero as integers; if not ALL lanes are
- // zero, some lane had its sign bit set
- __vcmpequwR( Four_Zeros, __vand( signMask, a ), &equalFlags );
- return !XMComparisonAllTrue( equalFlags );
-}
-
-FORCEINLINE fltx4 CmpEqSIMD( const fltx4 & a, const fltx4 & b ) // (a==b) ? ~0:0
-{
- return __vcmpeqfp( a, b );
-}
-
-
-FORCEINLINE fltx4 CmpGtSIMD( const fltx4 & a, const fltx4 & b ) // (a>b) ? ~0:0
-{
- return __vcmpgtfp( a, b );
-}
-
-FORCEINLINE fltx4 CmpGeSIMD( const fltx4 & a, const fltx4 & b ) // (a>=b) ? ~0:0
-{
- return __vcmpgefp( a, b );
-}
-
-// Implemented as "b > a" by swapping the operands of the greater-than compare.
-FORCEINLINE fltx4 CmpLtSIMD( const fltx4 & a, const fltx4 & b ) // (a<b) ? ~0:0
-{
- return __vcmpgtfp( b, a );
-}
-
-// Implemented as "b >= a" by swapping the operands of the greater-or-equal compare.
-FORCEINLINE fltx4 CmpLeSIMD( const fltx4 & a, const fltx4 & b ) // (a<=b) ? ~0:0
-{
- return __vcmpgefp( b, a );
-}
-
-FORCEINLINE fltx4 CmpInBoundsSIMD( const fltx4 & a, const fltx4 & b ) // (a <= b && a >= -b) ? ~0 : 0
-{
- return XMVectorInBounds( a, b );
-}
-
-// returned[i] = ReplacementMask[i] == 0 ? OldValue : NewValue
-FORCEINLINE fltx4 MaskedAssign( const fltx4 & ReplacementMask, const fltx4 & NewValue, const fltx4 & OldValue )
-{
- return __vsel( OldValue, NewValue, ReplacementMask );
-}
-
-// AKA "Broadcast", "Splat"
-FORCEINLINE fltx4 ReplicateX4( float flValue ) // a,a,a,a
-{
- // NOTE: if flValue comes from a register, this causes a Load-Hit-Store stall (don't mix fpu/vpu math!)
- float * pValue = &flValue;
- Assert( pValue );
- Assert( ((unsigned int)pValue & 3) == 0);
- return __vspltw( __lvlx( pValue, 0 ), 0 );
-}
-
-FORCEINLINE fltx4 ReplicateX4( const float *pValue ) // a,a,a,a
-{
- Assert( pValue );
- return __vspltw( __lvlx( pValue, 0 ), 0 );
-}
-
-/// replicate a single 32 bit integer value to all 4 components of an m128
-FORCEINLINE fltx4 ReplicateIX4( int nValue )
-{
- // NOTE: if nValue comes from a register, this causes a Load-Hit-Store stall (should not mix ints with fltx4s!)
- int * pValue = &nValue;
- Assert( pValue );
- Assert( ((unsigned int)pValue & 3) == 0);
- return __vspltw( __lvlx( pValue, 0 ), 0 );
-}
-
-// Round towards positive infinity
-FORCEINLINE fltx4 CeilSIMD( const fltx4 &a )
-{
- return __vrfip(a);
-}
-
-// Round towards nearest integer
-FORCEINLINE fltx4 RoundSIMD( const fltx4 &a )
-{
- return __vrfin(a);
-}
-
-// Round towards negative infinity
-FORCEINLINE fltx4 FloorSIMD( const fltx4 &a )
-{
- return __vrfim(a);
-}
-
-FORCEINLINE fltx4 SqrtEstSIMD( const fltx4 & a ) // sqrt(a), more or less
-{
- // This is emulated from rsqrt
- return XMVectorSqrtEst( a );
-}
-
-FORCEINLINE fltx4 SqrtSIMD( const fltx4 & a ) // sqrt(a)
-{
- // This is emulated from rsqrt
- return XMVectorSqrt( a );
-}
-
-FORCEINLINE fltx4 ReciprocalSqrtEstSIMD( const fltx4 & a ) // 1/sqrt(a), more or less
-{
- return __vrsqrtefp( a );
-}
-
-// Estimate of 1/sqrt(a); lanes equal to zero get the bits of Four_Epsilons
-// OR-ed in before the estimate is taken.
-FORCEINLINE fltx4 ReciprocalSqrtEstSaturateSIMD( const fltx4 & a )
-{
- fltx4 isZero = CmpEqSIMD( a, Four_Zeros );
- fltx4 epsWhereZero = AndSIMD( Four_Epsilons, isZero );
- return ReciprocalSqrtEstSIMD( OrSIMD( a, epsWhereZero ) );
-}
-
-FORCEINLINE fltx4 ReciprocalSqrtSIMD( const fltx4 & a ) // 1/sqrt(a)
-{
- // This uses Newton-Raphson to improve the HW result
- return XMVectorReciprocalSqrt( a );
-}
-
-FORCEINLINE fltx4 ReciprocalEstSIMD( const fltx4 & a ) // 1/a, more or less
-{
- return __vrefp( a );
-}
-
-/// 1/x for all 4 values. uses reciprocal approximation instruction plus newton iteration.
-/// No error checking!
-FORCEINLINE fltx4 ReciprocalSIMD( const fltx4 & a ) // 1/a
-{
- // This uses Newton-Raphson to improve the HW result
- return XMVectorReciprocal( a );
-}
-
-// FIXME: on 360, this is very slow, since it uses ReciprocalSIMD (do we need DivEstSIMD?)
-FORCEINLINE fltx4 DivSIMD( const fltx4 & a, const fltx4 & b ) // a/b
-{
- return MulSIMD( ReciprocalSIMD( b ), a );
-}
-
-/// Estimated 1/x for all 4 values.
-/// 1/0 will result in a big but NOT infinite result
-FORCEINLINE fltx4 ReciprocalEstSaturateSIMD( const fltx4 & a )
-{
- // lanes equal to zero get the bits of Four_Epsilons OR-ed in
- fltx4 isZero = CmpEqSIMD( a, Four_Zeros );
- fltx4 epsWhereZero = AndSIMD( Four_Epsilons, isZero );
- return ReciprocalEstSIMD( OrSIMD( a, epsWhereZero ) );
-}
-
-// 1/a via ReciprocalSIMD, with lanes equal to zero nudged to epsilon first.
-FORCEINLINE fltx4 ReciprocalSaturateSIMD( const fltx4 & a )
-{
- // Convert zeros to epsilons
- fltx4 zero_mask = CmpEqSIMD( a, Four_Zeros );
- fltx4 a_safe = OrSIMD( a, AndSIMD( Four_Epsilons, zero_mask ) );
- return ReciprocalSIMD( a_safe );
-
- // (unreachable notes kept for reference)
- // FIXME: This could be faster (BUT: it doesn't preserve the sign of -0.0, whereas the above does)
- // fltx4 zeroMask = CmpEqSIMD( Four_Zeros, a );
- // fltx4 a_safe = XMVectorSelect( a, Four_Epsilons, zeroMask );
- // return ReciprocalSIMD( a_safe );
-}
-
-// CHRISG: is it worth doing integer bitfiddling for this?
-// 2^x for all values (the antilog)
-FORCEINLINE fltx4 ExpSIMD( const fltx4 &toPower )
-{
- return XMVectorExp(toPower);
-}
-
-// Clamps the components of a vector to a specified minimum and maximum range.
-FORCEINLINE fltx4 ClampVectorSIMD( FLTX4 in, FLTX4 min, FLTX4 max)
-{
- return XMVectorClamp(in, min, max);
-}
-
-FORCEINLINE fltx4 LoadUnalignedSIMD( const void *pSIMD )
-{
- return XMLoadVector4( pSIMD );
-}
-
-// load a 3-vector (as opposed to LoadUnalignedSIMD, which loads a 4-vec).
-FORCEINLINE fltx4 LoadUnaligned3SIMD( const void *pSIMD )
-{
- return XMLoadVector3( pSIMD );
-}
-
-FORCEINLINE fltx4 LoadAlignedSIMD( const void *pSIMD )
-{
- return *( reinterpret_cast< const fltx4 *> ( pSIMD ) );
-}
-
-// for the transitional class -- load a 3-by VectorAligned and squash its w component
-FORCEINLINE fltx4 LoadAlignedSIMD( const VectorAligned & pSIMD )
-{
- fltx4 xyz = XMLoadVector3A( pSIMD.Base() );
- return SetWToZeroSIMD( xyz );
-}
-
-// for the transitional class -- load a 3-by VectorAligned (by pointer) and squash its w component
-FORCEINLINE fltx4 LoadAlignedSIMD( const VectorAligned * RESTRICT pSIMD )
-{
- fltx4 xyz = XMLoadVector3A( pSIMD );
- return SetWToZeroSIMD( xyz );
-}
-
-FORCEINLINE void StoreAlignedSIMD( float *pSIMD, const fltx4 & a )
-{
- *( reinterpret_cast< fltx4 *> ( pSIMD ) ) = a;
-}
-
-FORCEINLINE void StoreUnalignedSIMD( float *pSIMD, const fltx4 & a )
-{
- XMStoreVector4( pSIMD, a );
-}
-
-FORCEINLINE void StoreUnaligned3SIMD( float *pSIMD, const fltx4 & a )
-{
- XMStoreVector3( pSIMD, a );
-}
-
-
-// strongly typed -- for typechecking as we transition to SIMD
-FORCEINLINE void StoreAligned3SIMD( VectorAligned * RESTRICT pSIMD, const fltx4 & a )
-{
- XMStoreVector3A(pSIMD->Base(),a);
-}
-
-
-// Fixed-point conversion and save as SIGNED INTS.
-// pDest->x = Int (vSrc.x)
-// note: some architectures have means of doing
-// fixed point conversion when the fix depth is
-// specified as an immediate.. but there is no way
-// to guarantee an immediate as a parameter to function
-// like this.
-FORCEINLINE void ConvertStoreAsIntsSIMD(intx4 * RESTRICT pDest, const fltx4 &vSrc)
-{
- fltx4 asInt = __vctsxs( vSrc, 0 );
- XMStoreVector4A(pDest->Base(), asInt);
-}
-
-// Transposes, in place, the 4x4 matrix whose rows are x, y, z and w.
-FORCEINLINE void TransposeSIMD( fltx4 & x, fltx4 & y, fltx4 & z, fltx4 & w )
-{
- XMMATRIX xyzwMatrix = _XMMATRIX( x, y, z, w );
- xyzwMatrix = XMMatrixTranspose( xyzwMatrix );
- // write the transposed rows back to the caller's registers
- x = xyzwMatrix.r[0];
- y = xyzwMatrix.r[1];
- z = xyzwMatrix.r[2];
- w = xyzwMatrix.r[3];
-}
-
-// Return zero in the fastest way -- faster even than loading.
-FORCEINLINE fltx4 LoadZeroSIMD( void )
-{
- return XMVectorZero();
-}
-
-// Return one in the fastest way -- faster even than loading.
-FORCEINLINE fltx4 LoadOneSIMD( void )
-{
- return XMVectorSplatOne();
-}
-
-FORCEINLINE fltx4 SplatXSIMD( fltx4 a )
-{
- return XMVectorSplatX( a );
-}
-
-FORCEINLINE fltx4 SplatYSIMD( fltx4 a )
-{
- return XMVectorSplatY( a );
-}
-
-FORCEINLINE fltx4 SplatZSIMD( fltx4 a )
-{
- return XMVectorSplatZ( a );
-}
-
-FORCEINLINE fltx4 SplatWSIMD( fltx4 a )
-{
- return XMVectorSplatW( a );
-}
-
-// Copy of a whose x component is taken from x (write mask 8).
-FORCEINLINE fltx4 SetXSIMD( const fltx4& a, const fltx4& x )
-{
- return __vrlimi( a, x, 8, 0 );
-}
-
-// Copy of a whose y component is taken from y (write mask 4).
-FORCEINLINE fltx4 SetYSIMD( const fltx4& a, const fltx4& y )
-{
- return __vrlimi( a, y, 4, 0 );
-}
-
-// Copy of a whose z component is taken from z (write mask 2).
-FORCEINLINE fltx4 SetZSIMD( const fltx4& a, const fltx4& z )
-{
- return __vrlimi( a, z, 2, 0 );
-}
-
-// Copy of a whose w component is taken from w (write mask 1).
-FORCEINLINE fltx4 SetWSIMD( const fltx4& a, const fltx4& w )
-{
- return __vrlimi( a, w, 1, 0 );
-}
-
-// Returns a copy of a with component nComponent (0..3) replaced by flValue.
-FORCEINLINE fltx4 SetComponentSIMD( const fltx4& a, int nComponent, float flValue )
-{
- // __vrlimi write mask per component index: x=8, y=4, z=2, w=1
- static int s_nVrlimiMask[4] = { 8, 4, 2, 1 };
- fltx4 val = ReplicateX4( flValue );
- // NOTE(review): the mask here is a runtime table lookup, whereas every other
- // __vrlimi call in this file passes a literal -- confirm this is accepted.
- fltx4 result = __vrlimi(a, val, s_nVrlimiMask[nComponent], 0);
- return result;
-}
-
-// Rotate the components of a by one position (all four lanes written, rotate count 1).
-FORCEINLINE fltx4 RotateLeft( const fltx4 & a )
-{
- fltx4 rotated = a;
- return __vrlimi( rotated, a, 8 | 4 | 2 | 1, 1 );
-}
-
-// Rotate the components of a by two positions (all four lanes written, rotate count 2).
-FORCEINLINE fltx4 RotateLeft2( const fltx4 & a )
-{
- fltx4 rotated = a;
- return __vrlimi( rotated, a, 8 | 4 | 2 | 1, 2 );
-}
-
-
-
-// find the lowest component of a.x, a.y, a.z,
-// and replicate it to the whole return value.
-// ignores a.w.
-// Though this is only five instructions long,
-// they are all dependent, making this stall city.
-// Forcing this inline should hopefully help with scheduling.
-FORCEINLINE fltx4 FindLowestSIMD3( const fltx4 & a )
-{
- // a is [x,y,z,G] (where G is garbage)
- // rotate left by one
- fltx4 compareOne = a ;
- compareOne = __vrlimi( compareOne, a, 8 | 4 , 1 );
- // compareOne is [y,z,G,G]
- fltx4 retval = MinSIMD( a, compareOne );
- // retVal is [min(x,y), min(y,z), G, G]
- // rotate by two, writing only the x lane
- compareOne = __vrlimi( compareOne, a, 8 , 2);
- // compareOne is [z, G, G, G]
- retval = MinSIMD( retval, compareOne );
- // retVal = [ min(min(x,y),z), G, G, G ]
-
- // splat the x component out to the whole vector and return
- return SplatXSIMD( retval );
-}
-
-// find the highest component of a.x, a.y, a.z,
-// and replicate it to the whole return value.
-// ignores a.w.
-// Though this is only five instructions long,
-// they are all dependent, making this stall city.
-// Forcing this inline should hopefully help with scheduling.
-FORCEINLINE fltx4 FindHighestSIMD3( const fltx4 & a )
-{
- // a is [x,y,z,G] (where G is garbage)
- // rotate left by one
- fltx4 compareOne = a ;
- compareOne = __vrlimi( compareOne, a, 8 | 4 , 1 );
- // compareOne is [y,z,G,G]
- fltx4 retval = MaxSIMD( a, compareOne );
- // retVal is [max(x,y), max(y,z), G, G]
- // rotate by two, writing only the x lane
- compareOne = __vrlimi( compareOne, a, 8 , 2);
- // compareOne is [z, G, G, G]
- retval = MaxSIMD( retval, compareOne );
- // retVal = [ max(max(x,y),z), G, G, G ]
-
- // splat the x component out to the whole vector and return
- return SplatXSIMD( retval );
-}
-
-
-// Transform many (horizontal) points in-place by a 3x4 matrix,
-// here already loaded onto three fltx4 registers.
-// The points must be stored as 16-byte aligned. They are points
-// and not vectors because we assume the w-component to be 1.
-// To spare yourself the annoyance of loading the matrix yourself,
-// use one of the overloads below.
-void TransformManyPointsBy(VectorAligned * RESTRICT pVectors, unsigned int numVectors, FLTX4 mRow1, FLTX4 mRow2, FLTX4 mRow3);
-
-// Transform many (horizontal) points in-place by a 3x4 matrix.
-// The points must be stored as 16-byte aligned. They are points
-// and not vectors because we assume the w-component to be 1.
-// The matrix rows are fetched with unaligned loads, so the matrix
-// itself need not be aligned.
-FORCEINLINE void TransformManyPointsBy(VectorAligned * RESTRICT pVectors, unsigned int numVectors, const matrix3x4_t &pMatrix)
-{
- const fltx4 mRow1 = LoadUnalignedSIMD( pMatrix[0] );
- const fltx4 mRow2 = LoadUnalignedSIMD( pMatrix[1] );
- const fltx4 mRow3 = LoadUnalignedSIMD( pMatrix[2] );
- TransformManyPointsBy( pVectors, numVectors, mRow1, mRow2, mRow3 );
-}
-
-// Transform many (horizontal) points in-place by a 3x4 matrix.
-// The points must be stored as 16-byte aligned. They are points
-// and not vectors because we assume the w-component to be 1.
-// The matrix rows are fetched with aligned loads, so the matrix
-// must itself sit on a 16-byte boundary.
-FORCEINLINE void TransformManyPointsByA(VectorAligned * RESTRICT pVectors, unsigned int numVectors, const matrix3x4_t &pMatrix)
-{
- const fltx4 mRow1 = LoadAlignedSIMD( pMatrix[0] );
- const fltx4 mRow2 = LoadAlignedSIMD( pMatrix[1] );
- const fltx4 mRow3 = LoadAlignedSIMD( pMatrix[2] );
- TransformManyPointsBy( pVectors, numVectors, mRow1, mRow2, mRow3 );
-}
-
-// ------------------------------------
-// INTEGER SIMD OPERATIONS.
-// ------------------------------------
-
-// Load 4 aligned words into a SIMD register
-FORCEINLINE i32x4 LoadAlignedIntSIMD( const void * RESTRICT pSIMD)
-{
- return XMLoadVector4A(pSIMD);
-}
-
-// Load 4 unaligned words into a SIMD register
-FORCEINLINE i32x4 LoadUnalignedIntSIMD(const void * RESTRICT pSIMD)
-{
- return XMLoadVector4( pSIMD );
-}
-
-// save into four words, 16-byte aligned
-FORCEINLINE void StoreAlignedIntSIMD( int32 *pSIMD, const fltx4 & a )
-{
- *( reinterpret_cast< i32x4 *> ( pSIMD ) ) = a;
-}
-
-FORCEINLINE void StoreAlignedIntSIMD( intx4 &pSIMD, const fltx4 & a )
-{
- *( reinterpret_cast< i32x4 *> ( pSIMD.Base() ) ) = a;
-}
-
-FORCEINLINE void StoreUnalignedIntSIMD( int32 *pSIMD, const fltx4 & a )
-{
- XMStoreVector4(pSIMD, a);
-}
-
-
-// Take a fltx4 containing fixed-point uints and
-// return them as single precision floats. No
-// fixed point conversion is done.
-FORCEINLINE fltx4 UnsignedIntConvertToFltSIMD( const i32x4 &vSrcA )
-{
- return __vcfux( vSrcA, 0 );
-}
-
-
-// Take a fltx4 containing fixed-point sints and
-// return them as single precision floats. No
-// fixed point conversion is done.
-FORCEINLINE fltx4 SignedIntConvertToFltSIMD( const i32x4 &vSrcA )
-{
- return __vcfsx( vSrcA, 0 );
-}
-
-// Take a fltx4 containing fixed-point uints and
-// return them as single precision floats. Each uint
-// will be divided by 2^immed after conversion
-// (eg, this is fixed point math).
-/* as if:
- FORCEINLINE fltx4 UnsignedIntConvertToFltSIMD( const i32x4 &vSrcA, unsigned int uImmed )
- {
- return __vcfux( vSrcA, uImmed );
- }
-*/
-#define UnsignedFixedIntConvertToFltSIMD(vSrcA, uImmed) (__vcfux( (vSrcA), (uImmed) ))
-
-// Take a fltx4 containing fixed-point sints and
-// return them as single precision floats. Each int
-// will be divided by 2^immed (eg, this is fixed point
-// math).
-/* as if:
- FORCEINLINE fltx4 SignedIntConvertToFltSIMD( const i32x4 &vSrcA, unsigned int uImmed )
- {
- return __vcfsx( vSrcA, uImmed );
- }
-*/
-#define SignedFixedIntConvertToFltSIMD(vSrcA, uImmed) (__vcfsx( (vSrcA), (uImmed) ))
-
-// set all components of a vector to a signed immediate int number.
-/* as if:
- FORCEINLINE fltx4 IntSetImmediateSIMD(int toImmediate)
- {
- return __vspltisw( toImmediate );
- }
-*/
-#define IntSetImmediateSIMD(x) (__vspltisw(x))
-
-/*
- works on fltx4's as if they are four uints.
- the first parameter contains the words to be shifted,
- the second contains the amount to shift by AS INTS
-
- for i = 0 to 3
- shift = vSrcB_i*32:(i*32)+4
- vReturned_i*32:(i*32)+31 = vSrcA_i*32:(i*32)+31 << shift
-*/
-FORCEINLINE fltx4 IntShiftLeftWordSIMD(fltx4 vSrcA, fltx4 vSrcB)
-{
- return __vslw(vSrcA, vSrcB);
-}
-
-FORCEINLINE float SubFloat( const fltx4 & a, int idx )
-{
- // NOTE: if the output goes into a register, this causes a Load-Hit-Store stall (don't mix fpu/vpu math!)
- const fltx4_union & a_union = (const fltx4_union &)a;
- return a_union.m128_f32[ idx ];
-}
-
-FORCEINLINE float & SubFloat( fltx4 & a, int idx )
-{
- fltx4_union & a_union = (fltx4_union &)a;
- return a_union.m128_f32[idx];
-}
-
-// Converts all four floats of a with __vctuxs, then returns word idx of
-// the converted register.
-FORCEINLINE uint32 SubFloatConvertToInt( const fltx4 & a, int idx )
-{
- fltx4 asUints = __vctuxs( a, 0 );
- const fltx4_union & words = (const fltx4_union &)asUints;
- return words.m128_u32[idx];
-}
-
-
-FORCEINLINE uint32 SubInt( const fltx4 & a, int idx )
-{
- const fltx4_union & a_union = (const fltx4_union &)a;
- return a_union.m128_u32[idx];
-}
-
-FORCEINLINE uint32 & SubInt( fltx4 & a, int idx )
-{
- fltx4_union & a_union = (fltx4_union &)a;
- return a_union.m128_u32[idx];
-}
-
-#else
-
-//---------------------------------------------------------------------
-// Intel/SSE implementation
-//---------------------------------------------------------------------
-
-FORCEINLINE void StoreAlignedSIMD( float * RESTRICT pSIMD, const fltx4 & a )
-{
- _mm_store_ps( pSIMD, a );
-}
-
-FORCEINLINE void StoreUnalignedSIMD( float * RESTRICT pSIMD, const fltx4 & a )
-{
- _mm_storeu_ps( pSIMD, a );
-}
-
-
-// Forward declarations: StoreUnaligned3SIMD below uses these before their
-// definitions appear later in this section.
-FORCEINLINE fltx4 RotateLeft( const fltx4 & a );
-FORCEINLINE fltx4 RotateLeft2( const fltx4 & a );
-
-// Store only the first three components of a to pSIMD[0..2], one scalar
-// store each; pSIMD[3] is not written.
-FORCEINLINE void StoreUnaligned3SIMD( float *pSIMD, const fltx4 & a )
-{
- // lane 0 is stored directly; y and z are rotated into lane 0 first
- _mm_store_ss( pSIMD + 0, a );
- _mm_store_ss( pSIMD + 1, RotateLeft( a ) );
- _mm_store_ss( pSIMD + 2, RotateLeft2( a ) );
-}
-
-// strongly typed -- syntactic castor oil used for typechecking as we transition to SIMD
-// NOTE(review): this forwards to StoreAlignedSIMD, which writes all four
-// lanes with _mm_store_ps -- presumably VectorAligned provides 16 bytes of
-// storage; confirm.
-FORCEINLINE void StoreAligned3SIMD( VectorAligned * RESTRICT pSIMD, const fltx4 & a )
-{
- StoreAlignedSIMD( pSIMD->Base(),a );
-}
-
-FORCEINLINE fltx4 LoadAlignedSIMD( const void *pSIMD )
-{
- return _mm_load_ps( reinterpret_cast< const float *> ( pSIMD ) );
-}
-
-FORCEINLINE fltx4 AndSIMD( const fltx4 & a, const fltx4 & b ) // a & b
-{
- return _mm_and_ps( a, b );
-}
-
-FORCEINLINE fltx4 AndNotSIMD( const fltx4 & a, const fltx4 & b ) // ~a & b
-{
- return _mm_andnot_ps( a, b );
-}
-
-FORCEINLINE fltx4 XorSIMD( const fltx4 & a, const fltx4 & b ) // a ^ b
-{
- return _mm_xor_ps( a, b );
-}
-
-FORCEINLINE fltx4 OrSIMD( const fltx4 & a, const fltx4 & b ) // a | b
-{
- return _mm_or_ps( a, b );
-}
-
-// Squelch the w component of a vector to +0.0.
-// Most efficient when you say a = SetWToZeroSIMD(a) (avoids a copy)
-FORCEINLINE fltx4 SetWToZeroSIMD( const fltx4 & a )
-{
- return AndSIMD( a, LoadAlignedSIMD( g_SIMD_clear_wmask ) );
-}
-
-// for the transitional class -- load a 3-by VectorAligned and squash its w component
-FORCEINLINE fltx4 LoadAlignedSIMD( const VectorAligned & pSIMD )
-{
- return SetWToZeroSIMD( LoadAlignedSIMD(pSIMD.Base()) );
-}
-
-FORCEINLINE fltx4 LoadUnalignedSIMD( const void *pSIMD )
-{
- return _mm_loadu_ps( reinterpret_cast<const float *>( pSIMD ) );
-}
-
-// Load a 3-vector from an unaligned address.
-// NOTE(review): the body is identical to LoadUnalignedSIMD -- _mm_loadu_ps
-// reads FOUR floats, so the fourth lane holds whatever follows the vector in
-// memory and 16 bytes must be readable at pSIMD. Confirm callers guarantee it.
-FORCEINLINE fltx4 LoadUnaligned3SIMD( const void *pSIMD )
-{
- return _mm_loadu_ps( reinterpret_cast<const float *>( pSIMD ) );
-}
-
-/// replicate a single 32 bit integer value to all 4 components of an m128
-FORCEINLINE fltx4 ReplicateIX4( int i )
-{
- // view the integer's storage as a float (no numeric conversion), put it
- // in lane 0, then broadcast lane 0 to every lane
- const float *pBits = ( float * ) &i;
- fltx4 lowLane = _mm_set_ss( *pBits );
- return _mm_shuffle_ps( lowLane, lowLane, 0 );
-}
-
-
-FORCEINLINE fltx4 ReplicateX4( float flValue )
-{
- __m128 value = _mm_set_ss( flValue );
- return _mm_shuffle_ps( value, value, 0 );
-}
-
-
-FORCEINLINE float SubFloat( const fltx4 & a, int idx )
-{
- // NOTE: if the output goes into a register, this causes a Load-Hit-Store stall (don't mix fpu/vpu math!)
-#ifndef POSIX
- return a.m128_f32[ idx ];
-#else
- return (reinterpret_cast<float const *>(&a))[idx];
-#endif
-}
-
-FORCEINLINE float & SubFloat( fltx4 & a, int idx )
-{
-#ifndef POSIX
- return a.m128_f32[ idx ];
-#else
- return (reinterpret_cast<float *>(&a))[idx];
-#endif
-}
-
-// Returns component idx of a converted numerically to uint32 with a C cast
-// (a value conversion, not a reinterpretation of the float's bits).
-FORCEINLINE uint32 SubFloatConvertToInt( const fltx4 & a, int idx )
-{
- return (uint32)SubFloat(a,idx);
-}
-
-FORCEINLINE uint32 SubInt( const fltx4 & a, int idx )
-{
-#ifndef POSIX
- return a.m128_u32[idx];
-#else
- return (reinterpret_cast<uint32 const *>(&a))[idx];
-#endif
-}
-
-FORCEINLINE uint32 & SubInt( fltx4 & a, int idx )
-{
-#ifndef POSIX
- return a.m128_u32[idx];
-#else
- return (reinterpret_cast<uint32 *>(&a))[idx];
-#endif
-}
-
-// Return zero in the fastest way -- on the x360, faster even than loading.
-FORCEINLINE fltx4 LoadZeroSIMD( void )
-{
- return Four_Zeros;
-}
-
-// Return one in the fastest way -- on the x360, faster even than loading.
-FORCEINLINE fltx4 LoadOneSIMD( void )
-{
- return Four_Ones;
-}
-
-// Bitwise select: where ReplacementMask bits are set the result takes
-// NewValue, elsewhere it keeps OldValue.
-FORCEINLINE fltx4 MaskedAssign( const fltx4 & ReplacementMask, const fltx4 & NewValue, const fltx4 & OldValue )
-{
- fltx4 taken = AndSIMD( ReplacementMask, NewValue );
- fltx4 kept = AndNotSIMD( ReplacementMask, OldValue );
- return OrSIMD( taken, kept );
-}
-
-// remember, the SSE numbers its words 3 2 1 0
-// The way we want to specify shuffles is backwards from the default
-// MM_SHUFFLE_REV is in array index order (default is reversed)
-#define MM_SHUFFLE_REV(a,b,c,d) _MM_SHUFFLE(d,c,b,a)
-
-FORCEINLINE fltx4 SplatXSIMD( fltx4 const & a )
-{
- return _mm_shuffle_ps( a, a, MM_SHUFFLE_REV( 0, 0, 0, 0 ) );
-}
-
-FORCEINLINE fltx4 SplatYSIMD( fltx4 const &a )
-{
- return _mm_shuffle_ps( a, a, MM_SHUFFLE_REV( 1, 1, 1, 1 ) );
-}
-
-FORCEINLINE fltx4 SplatZSIMD( fltx4 const &a )
-{
- return _mm_shuffle_ps( a, a, MM_SHUFFLE_REV( 2, 2, 2, 2 ) );
-}
-
-// Broadcast the w component of a to all four lanes.
-FORCEINLINE fltx4 SplatWSIMD( fltx4 const &a )
-{
- return _mm_shuffle_ps( a, a, MM_SHUFFLE_REV( 3, 3, 3, 3 ) );
-}
-
-// Copy of a whose x component is taken from x.
-FORCEINLINE fltx4 SetXSIMD( const fltx4& a, const fltx4& x )
-{
- const fltx4 laneMask = LoadAlignedSIMD( g_SIMD_ComponentMask[0] );
- return MaskedAssign( laneMask, x, a );
-}
-
-// Copy of a whose y component is taken from y.
-FORCEINLINE fltx4 SetYSIMD( const fltx4& a, const fltx4& y )
-{
- const fltx4 laneMask = LoadAlignedSIMD( g_SIMD_ComponentMask[1] );
- return MaskedAssign( laneMask, y, a );
-}
-
-// Copy of a whose z component is taken from z.
-FORCEINLINE fltx4 SetZSIMD( const fltx4& a, const fltx4& z )
-{
- const fltx4 laneMask = LoadAlignedSIMD( g_SIMD_ComponentMask[2] );
- return MaskedAssign( laneMask, z, a );
-}
-
-// Copy of a whose w component is taken from w.
-FORCEINLINE fltx4 SetWSIMD( const fltx4& a, const fltx4& w )
-{
- const fltx4 laneMask = LoadAlignedSIMD( g_SIMD_ComponentMask[3] );
- return MaskedAssign( laneMask, w, a );
-}
-
-// Copy of a with component nComponent replaced by flValue.
-FORCEINLINE fltx4 SetComponentSIMD( const fltx4& a, int nComponent, float flValue )
-{
- // broadcast the scalar, then let the per-component mask pick one lane of it
- const fltx4 splatted = ReplicateX4( flValue );
- const fltx4 laneMask = LoadAlignedSIMD( g_SIMD_ComponentMask[nComponent] );
- return MaskedAssign( laneMask, splatted, a );
-}
-
-// a b c d -> b c d a
-FORCEINLINE fltx4 RotateLeft( const fltx4 & a )
-{
- return _mm_shuffle_ps( a, a, MM_SHUFFLE_REV( 1, 2, 3, 0 ) );
-}
-
-// a b c d -> c d a b
-FORCEINLINE fltx4 RotateLeft2( const fltx4 & a )
-{
- return _mm_shuffle_ps( a, a, MM_SHUFFLE_REV( 2, 3, 0, 1 ) );
-}
-
-// a b c d -> d a b c
-FORCEINLINE fltx4 RotateRight( const fltx4 & a )
-{
- // Output lanes 0..3 take source lanes 3,0,1,2.
- // Note: _MM_SHUFFLE( 0, 3, 2, 1 ) equals MM_SHUFFLE_REV( 1, 2, 3, 0 ), which
- // is the RotateLeft selector (b c d a), so it must not be used here.
- return _mm_shuffle_ps( a, a, MM_SHUFFLE_REV( 3, 0, 1, 2 ) );
-}
-
-// a b c d -> c d a b
-FORCEINLINE fltx4 RotateRight2( const fltx4 & a )
-{
- // selector written in array-index order: output lanes take source lanes 2,3,0,1
- return _mm_shuffle_ps( a, a, MM_SHUFFLE_REV( 2, 3, 0, 1 ) );
-}
-
-
-FORCEINLINE fltx4 AddSIMD( const fltx4 & a, const fltx4 & b ) // a+b
-{
- fltx4 sum = _mm_add_ps( a, b );
- return sum;
-}
-
-FORCEINLINE fltx4 SubSIMD( const fltx4 & a, const fltx4 & b ) // a-b
-{
- fltx4 difference = _mm_sub_ps( a, b );
- return difference;
-}
-
-FORCEINLINE fltx4 MulSIMD( const fltx4 & a, const fltx4 & b ) // a*b
-{
- fltx4 product = _mm_mul_ps( a, b );
- return product;
-}
-
-FORCEINLINE fltx4 DivSIMD( const fltx4 & a, const fltx4 & b ) // a/b
-{
- fltx4 quotient = _mm_div_ps( a, b );
- return quotient;
-}
-
-FORCEINLINE fltx4 MaddSIMD( const fltx4 & a, const fltx4 & b, const fltx4 & c ) // a*b + c
-{
- return AddSIMD( MulSIMD(a,b), c );
-}
-
-FORCEINLINE fltx4 MsubSIMD( const fltx4 & a, const fltx4 & b, const fltx4 & c ) // c - a*b
-{
- fltx4 product = MulSIMD( a, b );
- return SubSIMD( c, product );
-}
-
-// Dot product over the first three components, replicated to all four lanes.
-FORCEINLINE fltx4 Dot3SIMD( const fltx4 &a, const fltx4 &b )
-{
- fltx4 products = MulSIMD( a, b );
- // horizontal sum done in scalar code, then broadcast
- float flSum = SubFloat( products, 0 ) + SubFloat( products, 1 ) + SubFloat( products, 2 );
- return ReplicateX4( flSum );
-}
-
-// Dot product over all four components, replicated to all four lanes.
-FORCEINLINE fltx4 Dot4SIMD( const fltx4 &a, const fltx4 &b )
-{
- fltx4 products = MulSIMD( a, b );
- // horizontal sum done in scalar code, then broadcast
- float flSum = SubFloat( products, 0 ) + SubFloat( products, 1 ) + SubFloat( products, 2 ) + SubFloat( products, 3 );
- return ReplicateX4( flSum );
-}
-
-//TODO: implement as four-way Taylor series (see xbox implementation)
-// Per-component sine, computed one lane at a time with the scalar sin().
-FORCEINLINE fltx4 SinSIMD( const fltx4 &radians )
-{
- fltx4 result;
- for ( int i = 0; i < 4; ++i )
- {
-  SubFloat( result, i ) = sin( SubFloat( radians, i ) );
- }
- return result;
-}
-
-// Sine and cosine of the first three components only; the fourth lane of
-// sine and cosine is left untouched.
-FORCEINLINE void SinCos3SIMD( fltx4 &sine, fltx4 &cosine, const fltx4 &radians )
-{
- // FIXME: Make a fast SSE version
- for ( int i = 0; i < 3; ++i )
- {
-  SinCos( SubFloat( radians, i ), &SubFloat( sine, i ), &SubFloat( cosine, i ) );
- }
-}
-
-// Sine and cosine of all four components, one scalar SinCos call per lane.
-FORCEINLINE void SinCosSIMD( fltx4 &sine, fltx4 &cosine, const fltx4 &radians )
-{
- // FIXME: Make a fast SSE version
- for ( int i = 0; i < 4; ++i )
- {
-  SinCos( SubFloat( radians, i ), &SubFloat( sine, i ), &SubFloat( cosine, i ) );
- }
-}
-
-//TODO: implement as four-way Taylor series (see xbox implementation)
-// Per-component arcsine, computed one lane at a time with the scalar asin().
-FORCEINLINE fltx4 ArcSinSIMD( const fltx4 &sine )
-{
- // FIXME: Make a fast SSE version
- fltx4 result;
- for ( int i = 0; i < 4; ++i )
- {
-  SubFloat( result, i ) = asin( SubFloat( sine, i ) );
- }
- return result;
-}
-
-// Per-component arccosine, computed one lane at a time with the scalar acos().
-FORCEINLINE fltx4 ArcCosSIMD( const fltx4 &cs )
-{
- fltx4 result;
- for ( int i = 0; i < 4; ++i )
- {
-  SubFloat( result, i ) = acos( SubFloat( cs, i ) );
- }
- return result;
-}
-
-// tan^-1(a/b) .. ie, pass sin in as a and cos in as b
-FORCEINLINE fltx4 ArcTan2SIMD( const fltx4 &a, const fltx4 &b )
-{
- fltx4 result;
- // one scalar atan2() per lane
- for ( int i = 0; i < 4; ++i )
- {
-  SubFloat( result, i ) = atan2( SubFloat( a, i ), SubFloat( b, i ) );
- }
- return result;
-}
-
-// Computed as (0 - a) per lane, using the zero vector from LoadZeroSIMD().
-FORCEINLINE fltx4 NegSIMD(const fltx4 &a) // negate: -a
-{
- return SubSIMD(LoadZeroSIMD(),a);
-}
-
-FORCEINLINE int TestSignSIMD( const fltx4 & a ) // mask of which floats have the high bit set
-{
- return _mm_movemask_ps( a );
-}
-
-// True when at least one component of a has its sign bit set.
-FORCEINLINE bool IsAnyNegative( const fltx4 & a ) // (a.x < 0) || (a.y < 0) || (a.z < 0) || (a.w < 0)
-{
- const int nSignBits = TestSignSIMD( a );
- return nSignBits != 0;
-}
-
-FORCEINLINE fltx4 CmpEqSIMD( const fltx4 & a, const fltx4 & b ) // (a==b) ? ~0:0
-{
- return _mm_cmpeq_ps( a, b );
-}
-
-FORCEINLINE fltx4 CmpGtSIMD( const fltx4 & a, const fltx4 & b ) // (a>b) ? ~0:0
-{
- return _mm_cmpgt_ps( a, b );
-}
-
-// Per-lane mask: (a>=b) ? ~0:0
-FORCEINLINE fltx4 CmpGeSIMD( const fltx4 & a, const fltx4 & b )
-{
-    const fltx4 mask = _mm_cmpge_ps( a, b );
-    return mask;
-}
-
-// Per-lane mask: (a<b) ? ~0:0
-FORCEINLINE fltx4 CmpLtSIMD( const fltx4 & a, const fltx4 & b )
-{
-    const fltx4 mask = _mm_cmplt_ps( a, b );
-    return mask;
-}
-
-// Per-lane mask: (a<=b) ? ~0:0
-FORCEINLINE fltx4 CmpLeSIMD( const fltx4 & a, const fltx4 & b )
-{
-    const fltx4 mask = _mm_cmple_ps( a, b );
-    return mask;
-}
-
-// for branching when a.xyzw > b.xyzw
-// Implemented as "no lane reports a <= b".
-FORCEINLINE bool IsAllGreaterThan( const fltx4 &a, const fltx4 &b )
-{
-    const int lanesLessOrEqual = TestSignSIMD( CmpLeSIMD( a, b ) );
-    return lanesLessOrEqual == 0;
-}
-
-// for branching when a.xyzw >= b.xyzw
-// Implemented as "no lane reports a < b".
-FORCEINLINE bool IsAllGreaterThanOrEq( const fltx4 &a, const fltx4 &b )
-{
-    const int lanesLess = TestSignSIMD( CmpLtSIMD( a, b ) );
-    return lanesLess == 0;
-}
-
-// For branching if all a.xyzw == b.xyzw
-// All four mask bits must be set for the lanes to count as equal.
-FORCEINLINE bool IsAllEqual( const fltx4 & a, const fltx4 & b )
-{
-    const int lanesEqual = TestSignSIMD( CmpEqSIMD( a, b ) );
-    return lanesEqual == 0xf;
-}
-
-// Per-lane mask: (a <= b && a >= -b) ? ~0 : 0
-FORCEINLINE fltx4 CmpInBoundsSIMD( const fltx4 & a, const fltx4 & b )
-{
-    const fltx4 notAboveUpper = CmpLeSIMD( a, b );
-    const fltx4 notBelowLower = CmpGeSIMD( a, NegSIMD( b ) );
-    return AndSIMD( notAboveUpper, notBelowLower );
-}
-
-// Per-lane min(a,b)
-FORCEINLINE fltx4 MinSIMD( const fltx4 & a, const fltx4 & b )
-{
-    const fltx4 smaller = _mm_min_ps( a, b );
-    return smaller;
-}
-
-// Per-lane max(a,b)
-FORCEINLINE fltx4 MaxSIMD( const fltx4 & a, const fltx4 & b )
-{
-    const fltx4 larger = _mm_max_ps( a, b );
-    return larger;
-}
-
-
-
-// SSE lacks rounding operations.
-// Really.
-// You can emulate them by setting the rounding mode for the
-// whole processor and then converting to int, and then back again.
-// But every time you set the rounding mode, you clear out the
-// entire pipeline. So, I can't do them per operation. You
-// have to do it once, before the loop that would call these.
-// Round towards positive infinity
-// Each lane is rounded separately with the scalar ceil().
-FORCEINLINE fltx4 CeilSIMD( const fltx4 &a )
-{
-    fltx4 retVal;
-    for ( int lane = 0; lane < 4; ++lane )
-    {
-        SubFloat( retVal, lane ) = ceil( SubFloat( a, lane ) );
-    }
-    return retVal;
-}
-
-fltx4 fabs( const fltx4 & x );
-// Round towards negative infinity
-// This is the implementation that was here before; it assumes
-// you are in round-to-floor mode, which I guess is usually the
-// case for us vis-a-vis SSE. It's totally unnecessary on
-// VMX, which has a native floor op.
-// NOTE(review): the steps below work on |val| and then copy val's sign bits
-// back, so for negative non-integer inputs the result looks like truncation
-// toward zero rather than a true floor -- confirm against callers.
-// NOTE(review): presumably only meaningful while |val| is below the magnitude
-// of Four_2ToThe23s, since the add/subtract is what discards the fraction -- TODO confirm.
-FORCEINLINE fltx4 FloorSIMD( const fltx4 &val )
-{
- fltx4 fl4Abs = fabs( val ); // SIMD fabs overload declared just above this function
- // Add and then subtract Four_2ToThe23s to drop the fractional part of |val|.
- fltx4 ival = SubSIMD( AddSIMD( fl4Abs, Four_2ToThe23s ), Four_2ToThe23s );
- // In lanes where that came out larger than |val|, step back down by one.
- ival = MaskedAssign( CmpGtSIMD( ival, fl4Abs ), SubSIMD( ival, Four_Ones ), ival );
- return XorSIMD( ival, XorSIMD( val, fl4Abs ) ); // restore sign bits
-}
-
-
-
-// True when every lane of var compares equal to zero.
-inline bool IsAllZeros( const fltx4 & var )
-{
-    const int zeroLanes = TestSignSIMD( CmpEqSIMD( var, Four_Zeros ) );
-    return zeroLanes == 0xF;
-}
-
-// sqrt(a), more or less. This build uses the same instruction as SqrtSIMD.
-FORCEINLINE fltx4 SqrtEstSIMD( const fltx4 & a )
-{
-    const fltx4 root = _mm_sqrt_ps( a );
-    return root;
-}
-
-// Per-lane sqrt(a)
-FORCEINLINE fltx4 SqrtSIMD( const fltx4 & a )
-{
-    const fltx4 root = _mm_sqrt_ps( a );
-    return root;
-}
-
-// 1/sqrt(a), more or less (hardware estimate, no refinement)
-FORCEINLINE fltx4 ReciprocalSqrtEstSIMD( const fltx4 & a )
-{
-    const fltx4 estimate = _mm_rsqrt_ps( a );
-    return estimate;
-}
-
-// 1/sqrt(a) estimate where lanes that compare equal to zero have
-// Four_Epsilons OR'd into them before the estimate is taken.
-FORCEINLINE fltx4 ReciprocalSqrtEstSaturateSIMD( const fltx4 & a )
-{
-    const fltx4 isZero = CmpEqSIMD( a, Four_Zeros );
-    const fltx4 epsilonWhereZero = AndSIMD( Four_Epsilons, isZero );
-    const fltx4 patchedInput = OrSIMD( a, epsilonWhereZero );
-    return ReciprocalSqrtEstSIMD( patchedInput );
-}
-
-/// 1/sqrt(a). Refines ReciprocalSqrtEstSIMD with one newton iteration for
-/// higher precision results.
-FORCEINLINE fltx4 ReciprocalSqrtSIMD( const fltx4 & a )
-{
-    const fltx4 estimate = ReciprocalSqrtEstSIMD( a );
-    // newton iteration for 1/sqrt(a) : y(n+1) = 1/2 (y(n)*(3-a*y(n)^2));
-    const fltx4 estimateSquared = MulSIMD( estimate, estimate );
-    const fltx4 threeMinusAY2 = SubSIMD( Four_Threes, MulSIMD( a, estimateSquared ) );
-    const fltx4 unscaled = MulSIMD( estimate, threeMinusAY2 );
-    return MulSIMD( Four_PointFives, unscaled );
-}
-
-// 1/a, more or less (hardware estimate, no refinement)
-FORCEINLINE fltx4 ReciprocalEstSIMD( const fltx4 & a )
-{
-    const fltx4 estimate = _mm_rcp_ps( a );
-    return estimate;
-}
-
-/// 1/x for all 4 values, more or less
-/// 1/0 will result in a big but NOT infinite result
-FORCEINLINE fltx4 ReciprocalEstSaturateSIMD( const fltx4 & a )
-{
-    // Lanes that compare equal to zero get Four_Epsilons OR'd in first.
-    const fltx4 isZero = CmpEqSIMD( a, Four_Zeros );
-    const fltx4 epsilonWhereZero = AndSIMD( Four_Epsilons, isZero );
-    const fltx4 patchedInput = OrSIMD( a, epsilonWhereZero );
-    return ReciprocalEstSIMD( patchedInput );
-}
-
-/// 1/x for all 4 values. uses reciprocal approximation instruction plus newton iteration.
-/// No error checking!
-FORCEINLINE fltx4 ReciprocalSIMD( const fltx4 & a )
-{
-    const fltx4 estimate = ReciprocalEstSIMD( a );
-    // newton iteration is: Y(n+1) = 2*Y(n)-a*Y(n)^2
-    const fltx4 twiceEstimate = AddSIMD( estimate, estimate );
-    const fltx4 aTimesSquare = MulSIMD( a, MulSIMD( estimate, estimate ) );
-    return SubSIMD( twiceEstimate, aTimesSquare );
-}
-
-/// 1/x for all 4 values.
-/// 1/0 will result in a big but NOT infinite result
-FORCEINLINE fltx4 ReciprocalSaturateSIMD( const fltx4 & a )
-{
-    // Lanes that compare equal to zero get Four_Epsilons OR'd in first.
-    const fltx4 isZero = CmpEqSIMD( a, Four_Zeros );
-    const fltx4 epsilonWhereZero = AndSIMD( Four_Epsilons, isZero );
-    const fltx4 patchedInput = OrSIMD( a, epsilonWhereZero );
-    return ReciprocalSIMD( patchedInput );
-}
-
-// CHRISG: is it worth doing integer bitfiddling for this?
-// 2^x for all values (the antilog), one scalar powf() call per lane
-FORCEINLINE fltx4 ExpSIMD( const fltx4 &toPower )
-{
-    fltx4 retval;
-    for ( int lane = 0; lane < 4; ++lane )
-    {
-        SubFloat( retval, lane ) = powf( 2, SubFloat(toPower, lane) );
-    }
-    return retval;
-}
-
-// Clamps the components of a vector to a specified minimum and maximum range.
-// The upper bound is applied first, then the lower bound.
-FORCEINLINE fltx4 ClampVectorSIMD( FLTX4 in, FLTX4 min, FLTX4 max)
-{
-    const fltx4 cappedAbove = MinSIMD( max, in );
-    return MaxSIMD( min, cappedAbove );
-}
-
-// Transposes, in place, the 4x4 matrix whose rows are x, y, z and w.
-// All four arguments are overwritten by the _MM_TRANSPOSE4_PS macro.
-FORCEINLINE void TransposeSIMD( fltx4 & x, fltx4 & y, fltx4 & z, fltx4 & w)
-{
- _MM_TRANSPOSE4_PS( x, y, z, w );
-}
-
-// Finds the smallest of the x, y and z lanes of a and returns it splatted
-// across all four lanes. The w lane of a is treated as garbage.
-FORCEINLINE fltx4 FindLowestSIMD3( const fltx4 &a )
-{
-    // a is [x,y,z,G] (where G is garbage)
-    const fltx4 rotatedOnce = RotateLeft( a );    // [y,z,G,x]
-    const fltx4 rotatedTwice = RotateLeft2( a );  // [z,G,x,y]
-
-    // lane 0 becomes min(x,y), then min(min(x,y),z)
-    fltx4 lowest = MinSIMD( a, rotatedOnce );
-    lowest = MinSIMD( lowest, rotatedTwice );
-
-    // splat the x component out to the whole vector and return
-    return SplatXSIMD( lowest );
-}
-
-// Finds the largest of the x, y and z lanes of a and returns it splatted
-// across all four lanes. The w lane of a is treated as garbage.
-FORCEINLINE fltx4 FindHighestSIMD3( const fltx4 &a )
-{
-    // a is [x,y,z,G] (where G is garbage)
-    const fltx4 rotatedOnce = RotateLeft( a );    // [y,z,G,x]
-    const fltx4 rotatedTwice = RotateLeft2( a );  // [z,G,x,y]
-
-    // lane 0 becomes max(x,y), then max(max(x,y),z)
-    fltx4 highest = MaxSIMD( a, rotatedOnce );
-    highest = MaxSIMD( highest, rotatedTwice );
-
-    // splat the x component out to the whole vector and return
-    return SplatXSIMD( highest );
-}
-
-// ------------------------------------
-// INTEGER SIMD OPERATIONS.
-// ------------------------------------
-
-
-#if 0 /* pc does not have these ops */
-// splat all components of a vector to a signed immediate int number.
-// (Compiled out on PC by the surrounding #if 0.)
-FORCEINLINE fltx4 IntSetImmediateSIMD(int to)
-{
-    //CHRISG: SSE2 has this, but not SSE1. What to do?
-    fltx4 retval;
-    for ( int lane = 0; lane < 4; ++lane )
-    {
-        SubInt( retval, lane ) = to;
-    }
-    return retval;
-}
-#endif
-
-// Load 4 aligned words into a SIMD register
-// The bits are moved untouched through the float load instruction.
-FORCEINLINE i32x4 LoadAlignedIntSIMD( const void * RESTRICT pSIMD)
-{
-    const float *pAsFloats = reinterpret_cast<const float *>(pSIMD);
-    return _mm_load_ps( pAsFloats );
-}
-
-// Load 4 unaligned words into a SIMD register
-// The bits are moved untouched through the unaligned float load instruction.
-FORCEINLINE i32x4 LoadUnalignedIntSIMD( const void * RESTRICT pSIMD)
-{
-    const float *pAsFloats = reinterpret_cast<const float *>(pSIMD);
-    return _mm_loadu_ps( pAsFloats );
-}
-
-// save into four words, 16-byte aligned
-FORCEINLINE void StoreAlignedIntSIMD( int32 * RESTRICT pSIMD, const fltx4 & a )
-{
-    float *pAsFloats = reinterpret_cast<float *>(pSIMD);
-    _mm_store_ps( pAsFloats, a );
-}
-
-// Aligned store of the four words of a into an intx4, through its Base() pointer.
-FORCEINLINE void StoreAlignedIntSIMD( intx4 &pSIMD, const fltx4 & a )
-{
-    float *pAsFloats = reinterpret_cast<float *>(pSIMD.Base());
-    _mm_store_ps( pAsFloats, a );
-}
-
-// save into four words, no alignment requirement on the destination
-FORCEINLINE void StoreUnalignedIntSIMD( int32 * RESTRICT pSIMD, const fltx4 & a )
-{
-    float *pAsFloats = reinterpret_cast<float *>(pSIMD);
-    _mm_storeu_ps( pAsFloats, a );
-}
-
-
-// CHRISG: the conversion functions all seem to operate on m64's only...
-// how do we make them work here?
-
-// Take a fltx4 containing fixed-point uints and
-// return them as single precision floats. No
-// fixed point conversion is done.
-// Each 32-bit lane of vSrcA is read as an unsigned integer and converted by value.
-FORCEINLINE fltx4 UnsignedIntConvertToFltSIMD( const u32x4 &vSrcA )
-{
-    // Read the lanes from the source argument. The previous code read them back
-    // out of the still-uninitialized return value and never looked at vSrcA.
-    const uint32 *pSrcLanes = reinterpret_cast<const uint32 *>(&vSrcA);
-
-    fltx4 retval;
-    for ( int lane = 0; lane < 4; ++lane )
-    {
-        SubFloat( retval, lane ) = ( (float) pSrcLanes[lane] );
-    }
-    return retval;
-}
-
-
-// Take a fltx4 containing fixed-point sints and
-// return them as single precision floats. No
-// fixed point conversion is done.
-FORCEINLINE fltx4 SignedIntConvertToFltSIMD( const i32x4 &vSrcA )
-{
-    // View the register's four lanes as signed 32-bit integers.
-    const int32 *pSrcLanes = reinterpret_cast<const int32 *>(&vSrcA);
-
-    fltx4 retval;
-    for ( int lane = 0; lane < 4; ++lane )
-    {
-        SubFloat( retval, lane ) = ( (float) (pSrcLanes[lane]) );
-    }
-    return retval;
-}
-
-/*
-    works on fltx4's as if they are four uints.
-    the first parameter contains the words to be shifted,
-    the second contains the amount to shift by AS INTS
-
-    for i = 0 to 3
-        shift = vSrcB_i*32:(i*32)+4
-        vReturned_i*32:(i*32)+31 = vSrcA_i*32:(i*32)+31 << shift
-*/
-FORCEINLINE i32x4 IntShiftLeftWordSIMD(const i32x4 &vSrcA, const i32x4 &vSrcB)
-{
-    i32x4 retval;
-    // No SIMD shift is used here; each lane is shifted with a scalar operation.
-    for ( int lane = 0; lane < 4; ++lane )
-    {
-        SubInt(retval, lane) = SubInt(vSrcA, lane) << SubInt(vSrcB, lane);
-    }
-    return retval;
-}
-
-
-// Fixed-point conversion and save as SIGNED INTS.
-// pDest->x = Int (vSrc.x)
-// note: some architectures have means of doing
-// fixed point conversion when the fix depth is
-// specified as an immediate.. but there is no way
-// to guarantee an immediate as a parameter to function
-// like this.
-// This version converts two lanes at a time through 64-bit MMX values
-// and writes them to elements 0..1 and 2..3 of *pDest.
-FORCEINLINE void ConvertStoreAsIntsSIMD(intx4 * RESTRICT pDest, const fltx4 &vSrc)
-{
- // Truncating float->int conversion of the two low lanes (x, y).
- __m64 bottom = _mm_cvttps_pi32( vSrc );
- // Move the two high lanes (z, w) down, then convert them the same way.
- __m64 top = _mm_cvttps_pi32( _mm_movehl_ps(vSrc,vSrc) );
-
- // Each store writes a pair of ints: [0],[1] and then [2],[3].
- *reinterpret_cast<__m64 *>(&(*pDest)[0]) = bottom;
- *reinterpret_cast<__m64 *>(&(*pDest)[2]) = top;
-
- // Leave MMX state so that later x87 floating point code works.
- _mm_empty();
-}
-
-
-
-#endif
-
-
-
-/// class FourVectors stores 4 independent vectors for use in SIMD processing. These vectors are
-/// stored in the format x x x x y y y y z z z z so that they can be efficiently SIMD-accelerated.
-class ALIGN16 FourVectors
-{
-public:
-    fltx4 x, y, z;
-
-    FORCEINLINE void DuplicateVector(Vector const &v) //< set all 4 vectors to the same vector value
-    {
-        x=ReplicateX4(v.x);
-        y=ReplicateX4(v.y);
-        z=ReplicateX4(v.z);
-    }
-
-    /// Index the packed components: 0 = x, 1 = y, 2 = z. No range checking.
-    FORCEINLINE fltx4 const & operator[](int idx) const
-    {
-        return *((&x)+idx);
-    }
-
-    /// Index the packed components: 0 = x, 1 = y, 2 = z. No range checking.
-    FORCEINLINE fltx4 & operator[](int idx)
-    {
-        return *((&x)+idx);
-    }
-
-    FORCEINLINE void operator+=(FourVectors const &b) //< add 4 vectors to another 4 vectors
-    {
-        x=AddSIMD(x,b.x);
-        y=AddSIMD(y,b.y);
-        z=AddSIMD(z,b.z);
-    }
-
-    FORCEINLINE void operator-=(FourVectors const &b) //< subtract 4 vectors from another 4
-    {
-        x=SubSIMD(x,b.x);
-        y=SubSIMD(y,b.y);
-        z=SubSIMD(z,b.z);
-    }
-
-    FORCEINLINE void operator*=(FourVectors const &b) //< scale all four vectors per component scale
-    {
-        x=MulSIMD(x,b.x);
-        y=MulSIMD(y,b.y);
-        z=MulSIMD(z,b.z);
-    }
-
-    FORCEINLINE void operator*=(const fltx4 & scale) //< scale each of the 4 vectors by its own lane of scale
-    {
-        x=MulSIMD(x,scale);
-        y=MulSIMD(y,scale);
-        z=MulSIMD(z,scale);
-    }
-
-    FORCEINLINE void operator*=(float scale) //< uniformly scale all 4 vectors
-    {
-        fltx4 scalepacked = ReplicateX4(scale);
-        *this *= scalepacked;
-    }
-
-    FORCEINLINE fltx4 operator*(FourVectors const &b) const //< 4 dot products
-    {
-        fltx4 dot=MulSIMD(x,b.x);
-        dot=MaddSIMD(y,b.y,dot);
-        dot=MaddSIMD(z,b.z,dot);
-        return dot;
-    }
-
-    FORCEINLINE fltx4 operator*(Vector const &b) const //< dot product all 4 vectors with 1 vector
-    {
-        fltx4 dot=MulSIMD(x,ReplicateX4(b.x));
-        dot=MaddSIMD(y,ReplicateX4(b.y), dot);
-        dot=MaddSIMD(z,ReplicateX4(b.z), dot);
-        return dot;
-    }
-
-    FORCEINLINE void VProduct(FourVectors const &b) //< component by component mul
-    {
-        x=MulSIMD(x,b.x);
-        y=MulSIMD(y,b.y);
-        z=MulSIMD(z,b.z);
-    }
-
-    FORCEINLINE void MakeReciprocal(void) //< (x,y,z)=(1/x,1/y,1/z)
-    {
-        x=ReciprocalSIMD(x);
-        y=ReciprocalSIMD(y);
-        z=ReciprocalSIMD(z);
-    }
-
-    FORCEINLINE void MakeReciprocalSaturate(void) //< (x,y,z)=(1/x,1/y,1/z), 1/0=1.0e23
-    {
-        x=ReciprocalSaturateSIMD(x);
-        y=ReciprocalSaturateSIMD(y);
-        z=ReciprocalSaturateSIMD(z);
-    }
-
-    // Assume the given matrix is a rotation, and rotate these vectors by it.
-    // If you have a long list of FourVectors structures that you all want
-    // to rotate by the same matrix, use FourVectors::RotateManyBy() instead.
-    inline void RotateBy(const matrix3x4_t& matrix);
-
-    /// You can use this to rotate a long array of FourVectors all by the same
-    /// matrix. The first parameter is the head of the array. The second is the
-    /// number of vectors to rotate. The third is the matrix.
-    static void RotateManyBy(FourVectors * RESTRICT pVectors, unsigned int numVectors, const matrix3x4_t& rotationMatrix );
-
-    /// Assume the vectors are points, and transform them in place by the matrix.
-    inline void TransformBy(const matrix3x4_t& matrix);
-
-    /// You can use this to Transform a long array of FourVectors all by the same
-    /// matrix. The first parameter is the head of the array. The second is the
-    /// number of vectors to transform. The third is the matrix. The fourth is the
-    /// output buffer, which must not overlap the pVectors buffer. This is not
-    /// an in-place transformation.
-    static void TransformManyBy(FourVectors * RESTRICT pVectors, unsigned int numVectors, const matrix3x4_t& rotationMatrix, FourVectors * RESTRICT pOut );
-
-    /// You can use this to Transform a long array of FourVectors all by the same
-    /// matrix. The first parameter is the head of the array. The second is the
-    /// number of vectors to transform. The third is the matrix.
-    /// This is an in-place transformation: there is no separate output buffer.
-    static void TransformManyBy(FourVectors * RESTRICT pVectors, unsigned int numVectors, const matrix3x4_t& rotationMatrix );
-
-    // X(),Y(),Z() - get at the desired component of the i'th (0..3) vector.
-    FORCEINLINE const float & X(int idx) const
-    {
-        // NOTE: if the output goes into a register, this causes a Load-Hit-Store stall (don't mix fpu/vpu math!)
-        return SubFloat( (fltx4 &)x, idx );
-    }
-
-    FORCEINLINE const float & Y(int idx) const
-    {
-        return SubFloat( (fltx4 &)y, idx );
-    }
-
-    FORCEINLINE const float & Z(int idx) const
-    {
-        return SubFloat( (fltx4 &)z, idx );
-    }
-
-    FORCEINLINE float & X(int idx)
-    {
-        return SubFloat( x, idx );
-    }
-
-    FORCEINLINE float & Y(int idx)
-    {
-        return SubFloat( y, idx );
-    }
-
-    FORCEINLINE float & Z(int idx)
-    {
-        return SubFloat( z, idx );
-    }
-
-    FORCEINLINE Vector Vec(int idx) const //< unpack one of the vectors
-    {
-        return Vector( X(idx), Y(idx), Z(idx) );
-    }
-
-    /// Default construction leaves x, y and z unassigned.
-    FourVectors(void)
-    {
-    }
-
-    FourVectors( FourVectors const &src )
-    {
-        x=src.x;
-        y=src.y;
-        z=src.z;
-    }
-
-    FORCEINLINE void operator=( FourVectors const &src )
-    {
-        x=src.x;
-        y=src.y;
-        z=src.z;
-    }
-
-    /// LoadAndSwizzle - load 4 Vectors into a FourVectors, performing transpose op
-    FORCEINLINE void LoadAndSwizzle(Vector const &a, Vector const &b, Vector const &c, Vector const &d)
-    {
-        // TransposeSIMD has large sub-expressions that the compiler can't eliminate on x360
-        // use an unfolded implementation here
-#if _X360
-        fltx4 tx = LoadUnalignedSIMD( &a.x );
-        fltx4 ty = LoadUnalignedSIMD( &b.x );
-        fltx4 tz = LoadUnalignedSIMD( &c.x );
-        fltx4 tw = LoadUnalignedSIMD( &d.x );
-        fltx4 r0 = __vmrghw(tx, tz);
-        fltx4 r1 = __vmrghw(ty, tw);
-        fltx4 r2 = __vmrglw(tx, tz);
-        fltx4 r3 = __vmrglw(ty, tw);
-
-        x = __vmrghw(r0, r1);
-        y = __vmrglw(r0, r1);
-        z = __vmrghw(r2, r3);
-#else
-        x = LoadUnalignedSIMD( &( a.x ));
-        y = LoadUnalignedSIMD( &( b.x ));
-        z = LoadUnalignedSIMD( &( c.x ));
-        fltx4 w = LoadUnalignedSIMD( &( d.x ));
-        // now, matrix is:
-        // x y z ?
-        // x y z ?
-        // x y z ?
-        // x y z ?
-        TransposeSIMD(x, y, z, w);
-#endif
-    }
-
-    /// LoadAndSwizzleAligned - load 4 Vectors into a FourVectors, performing transpose op.
-    /// all 4 vectors must be 128 bit boundary
-    FORCEINLINE void LoadAndSwizzleAligned(const float *RESTRICT a, const float *RESTRICT b, const float *RESTRICT c, const float *RESTRICT d)
-    {
-#if _X360
-        fltx4 tx = LoadAlignedSIMD(a);
-        fltx4 ty = LoadAlignedSIMD(b);
-        fltx4 tz = LoadAlignedSIMD(c);
-        fltx4 tw = LoadAlignedSIMD(d);
-        fltx4 r0 = __vmrghw(tx, tz);
-        fltx4 r1 = __vmrghw(ty, tw);
-        fltx4 r2 = __vmrglw(tx, tz);
-        fltx4 r3 = __vmrglw(ty, tw);
-
-        x = __vmrghw(r0, r1);
-        y = __vmrglw(r0, r1);
-        z = __vmrghw(r2, r3);
-#else
-        x = LoadAlignedSIMD( a );
-        y = LoadAlignedSIMD( b );
-        z = LoadAlignedSIMD( c );
-        fltx4 w = LoadAlignedSIMD( d );
-        // now, matrix is:
-        // x y z ?
-        // x y z ?
-        // x y z ?
-        // x y z ?
-        TransposeSIMD( x, y, z, w );
-#endif
-    }
-
-    /// Same as the pointer form, taking the address of each Vector's x member.
-    FORCEINLINE void LoadAndSwizzleAligned(Vector const &a, Vector const &b, Vector const &c, Vector const &d)
-    {
-        LoadAndSwizzleAligned( &a.x, &b.x, &c.x, &d.x );
-    }
-
-    /// return the squared length of all 4 vectors
-    FORCEINLINE fltx4 length2(void) const
-    {
-        return (*this)*(*this);
-    }
-
-    /// return the approximate length of all 4 vectors. uses the sqrt approximation instruction
-    FORCEINLINE fltx4 length(void) const
-    {
-        return SqrtEstSIMD(length2());
-    }
-
-    /// normalize all 4 vectors in place. not mega-accurate (uses reciprocal approximation instruction)
-    FORCEINLINE void VectorNormalizeFast(void)
-    {
-        fltx4 mag_sq=(*this)*(*this); // length^2
-        (*this) *= ReciprocalSqrtEstSIMD(mag_sq); // *(1.0/sqrt(length^2))
-    }
-
-    /// normalize all 4 vectors in place.
-    FORCEINLINE void VectorNormalize(void)
-    {
-        fltx4 mag_sq=(*this)*(*this); // length^2
-        (*this) *= ReciprocalSqrtSIMD(mag_sq); // *(1.0/sqrt(length^2))
-    }
-
-    /// construct a FourVectors from 4 separate Vectors
-    FORCEINLINE FourVectors(Vector const &a, Vector const &b, Vector const &c, Vector const &d)
-    {
-        LoadAndSwizzle(a,b,c,d);
-    }
-
-    /// construct a FourVectors from 4 separate aligned Vectors
-    FORCEINLINE FourVectors(VectorAligned const &a, VectorAligned const &b, VectorAligned const &c, VectorAligned const &d)
-    {
-        LoadAndSwizzleAligned(a,b,c,d);
-    }
-
-    /// squared distance from each of our 4 points to the matching point in pnt
-    FORCEINLINE fltx4 DistToSqr( FourVectors const &pnt )
-    {
-        fltx4 fl4dX = SubSIMD( pnt.x, x );
-        fltx4 fl4dY = SubSIMD( pnt.y, y );
-        fltx4 fl4dZ = SubSIMD( pnt.z, z );
-        return AddSIMD( MulSIMD( fl4dX, fl4dX), AddSIMD( MulSIMD( fl4dY, fl4dY ), MulSIMD( fl4dZ, fl4dZ ) ) );
-    }
-
-    /// For each of our 4 points, the parameter t of its projection onto the line
-    /// p0 + t*(p1-p0): t = ((this-p0) . (p1-p0)) / ((p1-p0) . (p1-p0)).
-    /// t is not clamped, and a zero-length line is not checked for.
-    FORCEINLINE fltx4 TValueOfClosestPointOnLine( FourVectors const &p0, FourVectors const &p1 ) const
-    {
-        FourVectors lineDelta = p1;
-        lineDelta -= p0;
-        // Divide by the squared length of the line direction. This used to be
-        // p1 * p1, which is only the same thing when p0 is the origin.
-        fltx4 OOlineDirDotlineDir = ReciprocalSIMD( lineDelta * lineDelta );
-        FourVectors v4OurPnt = *this;
-        v4OurPnt -= p0;
-        return MulSIMD( OOlineDirDotlineDir, v4OurPnt * lineDelta );
-    }
-
-    /// Squared distance from each of our 4 points to the segment p0..p1.
-    FORCEINLINE fltx4 DistSqrToLineSegment( FourVectors const &p0, FourVectors const &p1 ) const
-    {
-        // Work relative to p0 so the segment runs from the origin to lineDelta.
-        FourVectors lineDelta = p1;
-        FourVectors v4OurPnt = *this;
-        v4OurPnt -= p0;
-        lineDelta -= p0;
-
-        fltx4 OOlineDirDotlineDir = ReciprocalSIMD( lineDelta * lineDelta );
-
-        fltx4 fl4T = MulSIMD( OOlineDirDotlineDir, v4OurPnt * lineDelta );
-
-        // Clamp t to [0,1] so the closest point stays on the segment.
-        fl4T = MinSIMD( fl4T, Four_Ones );
-        fl4T = MaxSIMD( fl4T, Four_Zeros );
-        lineDelta *= fl4T;
-        return v4OurPnt.DistToSqr( lineDelta );
-    }
-
-};
-
-/// form 4 cross products (a x b for each of the four packed vector pairs)
-inline FourVectors operator ^(const FourVectors &a, const FourVectors &b)
-{
-    FourVectors cross;
-    cross.x = SubSIMD( MulSIMD( a.y, b.z ), MulSIMD( a.z, b.y ) );
-    cross.y = SubSIMD( MulSIMD( a.z, b.x ), MulSIMD( a.x, b.z ) );
-    cross.z = SubSIMD( MulSIMD( a.x, b.y ), MulSIMD( a.y, b.x ) );
-    return cross;
-}
-
-/// component-by-componentwise MAX operator
-inline FourVectors maximum(const FourVectors &a, const FourVectors &b)
-{
-    FourVectors larger;
-    larger.x = MaxSIMD( a.x, b.x );
-    larger.y = MaxSIMD( a.y, b.y );
-    larger.z = MaxSIMD( a.z, b.z );
-    return larger;
-}
-
-/// component-by-componentwise MIN operator
-inline FourVectors minimum(const FourVectors &a, const FourVectors &b)
-{
-    FourVectors smaller;
-    smaller.x = MinSIMD( a.x, b.x );
-    smaller.y = MinSIMD( a.y, b.y );
-    smaller.z = MinSIMD( a.z, b.z );
-    return smaller;
-}
-
-/// calculate reflection vector. incident and normal dir assumed normalized
-FORCEINLINE FourVectors VectorReflect( const FourVectors &incident, const FourVectors &normal )
-{
-    // 2*(i.n), formed by adding the dot product to itself
-    fltx4 twiceIDotN = incident * normal;
-    twiceIDotN = AddSIMD( twiceIDotN, twiceIDotN );
-
-    FourVectors scaledNormal = normal;
-    scaledNormal *= twiceIDotN;
-
-    FourVectors reflected = incident;
-    reflected -= scaledNormal; // i-2(n*i)n
-    return reflected;
-}
-
-/// calculate slide vector. subtracts from the incident vector its projection
-/// onto the normal, (n*i)n, leaving the part that lies in the plane.
-FORCEINLINE FourVectors VectorSlide( const FourVectors &incident, const FourVectors &normal )
-{
-    const fltx4 iDotN = incident * normal;
-
-    FourVectors alongNormal = normal;
-    alongNormal *= iDotN;
-
-    FourVectors slid = incident;
-    slid -= alongNormal; // i-(n*i)n
-    return slid;
-}
-
-
-// Assume the given matrix is a rotation, and rotate these vectors by it.
-// If you have a long list of FourVectors structures that you all want
-// to rotate by the same matrix, use FourVectors::RotateManyBy() instead.
-void FourVectors::RotateBy(const matrix3x4_t& matrix)
-{
-    // Load the matrix into local vectors. Sadly, matrix3x4_ts are
-    // often unaligned. The fourth lane of each load is not used here.
-    const fltx4 matRow0 = LoadUnalignedSIMD( matrix[0] );
-    const fltx4 matRow1 = LoadUnalignedSIMD( matrix[1] );
-    const fltx4 matRow2 = LoadUnalignedSIMD( matrix[2] );
-
-    // Splat out each of the 3x3 entries to its own fltx4, in the order
-    // that they are needed. These are plain locals rather than an array,
-    // so that they can remain in registers.
-    const fltx4 m00 = SplatXSIMD( matRow0 );
-    const fltx4 m01 = SplatYSIMD( matRow0 );
-    const fltx4 m02 = SplatZSIMD( matRow0 );
-
-    const fltx4 m10 = SplatXSIMD( matRow1 );
-    const fltx4 m11 = SplatYSIMD( matRow1 );
-    const fltx4 m12 = SplatZSIMD( matRow1 );
-
-    const fltx4 m20 = SplatXSIMD( matRow2 );
-    const fltx4 m21 = SplatYSIMD( matRow2 );
-    const fltx4 m22 = SplatZSIMD( matRow2 );
-
-    // All three outputs are computed before any member is overwritten,
-    // because every one of them reads the old x, y and z.
-    const fltx4 rotatedX = AddSIMD( AddSIMD( MulSIMD( x, m00 ), MulSIMD( y, m01 ) ), MulSIMD( z, m02 ) );
-    const fltx4 rotatedY = AddSIMD( AddSIMD( MulSIMD( x, m10 ), MulSIMD( y, m11 ) ), MulSIMD( z, m12 ) );
-    const fltx4 rotatedZ = AddSIMD( AddSIMD( MulSIMD( x, m20 ), MulSIMD( y, m21 ) ), MulSIMD( z, m22 ) );
-
-    x = rotatedX;
-    y = rotatedY;
-    z = rotatedZ;
-}
-
-// Assume the vectors are points, and transform them in place by the matrix:
-// the 3x3 part is applied first and then the fourth entry of each matrix
-// row is added to the matching component.
-void FourVectors::TransformBy(const matrix3x4_t& matrix)
-{
-    // Load the matrix into local vectors. Sadly, matrix3x4_ts are
-    // often unaligned. The fourth lane of each load is not used from
-    // these registers; it is read again below as matrix[i][3].
-    const fltx4 matRow0 = LoadUnalignedSIMD( matrix[0] );
-    const fltx4 matRow1 = LoadUnalignedSIMD( matrix[1] );
-    const fltx4 matRow2 = LoadUnalignedSIMD( matrix[2] );
-
-    // Splat out each of the 3x3 entries to its own fltx4, in the order
-    // that they are needed. These are plain locals rather than an array,
-    // so that they can remain in registers.
-    const fltx4 m00 = SplatXSIMD( matRow0 );
-    const fltx4 m01 = SplatYSIMD( matRow0 );
-    const fltx4 m02 = SplatZSIMD( matRow0 );
-
-    const fltx4 m10 = SplatXSIMD( matRow1 );
-    const fltx4 m11 = SplatYSIMD( matRow1 );
-    const fltx4 m12 = SplatZSIMD( matRow1 );
-
-    const fltx4 m20 = SplatXSIMD( matRow2 );
-    const fltx4 m21 = SplatYSIMD( matRow2 );
-    const fltx4 m22 = SplatZSIMD( matRow2 );
-
-    // All three outputs are computed before any member is overwritten,
-    // because every one of them reads the old x, y and z.
-    const fltx4 rotatedX = MaddSIMD( z, m02, AddSIMD( MulSIMD( x, m00 ), MulSIMD( y, m01 ) ) );
-    const fltx4 rotatedY = MaddSIMD( z, m12, AddSIMD( MulSIMD( x, m10 ), MulSIMD( y, m11 ) ) );
-    const fltx4 rotatedZ = MaddSIMD( z, m22, AddSIMD( MulSIMD( x, m20 ), MulSIMD( y, m21 ) ) );
-
-    // Add the fourth entry of each row to every one of the four points.
-    x = AddSIMD( rotatedX, ReplicateX4( matrix[0][3] ));
-    y = AddSIMD( rotatedY, ReplicateX4( matrix[1][3] ));
-    z = AddSIMD( rotatedZ, ReplicateX4( matrix[2][3] ));
-}
-
-
-
-/// quick, low quality perlin-style noise() function suitable for real time use.
-/// return value is -1..1. Only reliable around +/- 1 million or so.
-fltx4 NoiseSIMD( const fltx4 & x, const fltx4 & y, const fltx4 & z );
-fltx4 NoiseSIMD( FourVectors const &v );
-
-// vector valued noise direction
-FourVectors DNoiseSIMD( FourVectors const &v );
-
-// vector value "curl" noise function. see http://hyperphysics.phy-astr.gsu.edu/hbase/curl.html
-FourVectors CurlNoiseSIMD( FourVectors const &v );
-
-
-/// calculate the absolute value of a packed single
-/// by AND-ing every lane with g_SIMD_clear_signmask
-inline fltx4 fabs( const fltx4 & x )
-{
-    const fltx4 clearSignMask = LoadAlignedSIMD( g_SIMD_clear_signmask );
-    return AndSIMD( x, clearSignMask );
-}
-
-/// negate all four components of a SIMD packed single
-/// by XOR-ing every lane with g_SIMD_signmask
-inline fltx4 fnegate( const fltx4 & x )
-{
-    const fltx4 signMask = LoadAlignedSIMD( g_SIMD_signmask );
-    return XorSIMD( x, signMask );
-}
-
-
-fltx4 Pow_FixedPoint_Exponent_SIMD( const fltx4 & x, int exponent);
-
-// PowSIMD - raise a SIMD register to a power. This is analogous to the C pow() function, with some
-// restrictions: fractional exponents are only handled with 2 bits of precision. Basically,
-// fractions of 0,.25,.5, and .75 are handled. PowSIMD(x,.30) will be the same as PowSIMD(x,.25).
-// negative and fractional powers are handled by the SIMD reciprocal and square root approximation
-// instructions and so are not especially accurate ----Note that this routine does not raise
-// numeric exceptions because it uses SIMD--- This routine is O(log2(exponent)).
-inline fltx4 PowSIMD( const fltx4 & x, float exponent )
-{
-    // The exponent is handed on as an integer count of quarters.
-    const int exponentInQuarters = (int) (4.0*exponent);
-    return Pow_FixedPoint_Exponent_SIMD( x, exponentInQuarters );
-}
-
-
-
-// random number generation - generate 4 random numbers quickly.
-
-void SeedRandSIMD(uint32 seed); // seed the random # generator
-fltx4 RandSIMD( int nContext = 0 ); // return 4 numbers in the 0..1 range
-
-// for multithreaded, you need to use these and use the argument form of RandSIMD:
-int GetSIMDRandContext( void );
-void ReleaseSIMDRandContext( int nContext );
-
-// 4 random numbers in -1..1, made by remapping RandSIMD()'s 0..1 output as 2*r - 1
-FORCEINLINE fltx4 RandSignedSIMD( void )
-{
-    const fltx4 doubled = MulSIMD( Four_Twos, RandSIMD() );
-    return SubSIMD( doubled, Four_Ones );
-}
-
-
-// SIMD versions of mathlib simplespline functions
-// hermite basis function for smooth interpolation
-// Similar to Gain() above, but very cheap to call
-// value should be between 0 & 1 inclusive
-// Evaluates 3*value^2 - 2*value^3 in every lane.
-inline fltx4 SimpleSpline( const fltx4 & value )
-{
-    // The two multiplies below do not depend on each other.
-    const fltx4 twiceValue = MulSIMD( value, Four_Twos );
-    const fltx4 valueSq = MulSIMD( value, value );
-
-    // Nice little ease-in, ease-out spline-like curve
-    const fltx4 threeValueSq = MulSIMD( Four_Threes, valueSq );
-    const fltx4 twoValueCubed = MulSIMD( twiceValue, valueSq );
-    return SubSIMD( threeValueSq, twoValueCubed );
-}
-
-// remaps a value in [startInterval, startInterval+rangeInterval] from linear to
-// spline using SimpleSpline
-// Computes C + DMinusC * SimpleSpline( (val - A) * OneOverBMinusA ).
-// BMinusA is not read by this function. A == B is not handled specially.
-inline fltx4 SimpleSplineRemapValWithDeltas( const fltx4 & val,
-    const fltx4 & A, const fltx4 & BMinusA,
-    const fltx4 & OneOverBMinusA, const fltx4 & C,
-    const fltx4 & DMinusC )
-{
-    const fltx4 normalized = MulSIMD( SubSIMD( val, A), OneOverBMinusA );
-    const fltx4 splined = SimpleSpline( normalized );
-    return AddSIMD( C, MulSIMD( DMinusC, splined ) );
-}
-
-// Same as SimpleSplineRemapValWithDeltas, except that the normalized value is
-// clamped to 0..1 before it goes through SimpleSpline.
-// BMinusA is not read by this function. A == B is not handled specially.
-inline fltx4 SimpleSplineRemapValWithDeltasClamped( const fltx4 & val,
-    const fltx4 & A, const fltx4 & BMinusA,
-    const fltx4 & OneOverBMinusA, const fltx4 & C,
-    const fltx4 & DMinusC )
-{
-    const fltx4 normalized = MulSIMD( SubSIMD( val, A), OneOverBMinusA );
-    const fltx4 clamped = MinSIMD( Four_Ones, MaxSIMD( Four_Zeros, normalized ) );
-    const fltx4 splined = SimpleSpline( clamped );
-    return AddSIMD( C, MulSIMD( DMinusC, splined ) );
-}
-
-// Fractional part of each lane: |val| minus its whole part, with val's sign bits put back.
-FORCEINLINE fltx4 FracSIMD( const fltx4 &val )
-{
-    const fltx4 magnitude = fabs( val );
-    // Add and then subtract Four_2ToThe23s to drop the fraction of |val|.
-    fltx4 wholePart = SubSIMD( AddSIMD( magnitude, Four_2ToThe23s ), Four_2ToThe23s );
-    // In lanes where that came out larger than |val|, step back down by one.
-    const fltx4 overshot = CmpGtSIMD( wholePart, magnitude );
-    wholePart = MaskedAssign( overshot, SubSIMD( wholePart, Four_Ones ), wholePart );
-    const fltx4 signBits = XorSIMD( val, magnitude );
-    return XorSIMD( SubSIMD( magnitude, wholePart ), signBits ); // restore sign bits
-}
-
-// Same scheme as FracSIMD, except that the biased value is AND-ed with
-// g_SIMD_lsbmask and the correction step is two instead of one.
-// NOTE(review): presumably this yields val modulo 2 carrying val's sign -- confirm.
-FORCEINLINE fltx4 Mod2SIMD( const fltx4 &val )
-{
-    const fltx4 magnitude = fabs( val );
-    const fltx4 biased = AddSIMD( magnitude, Four_2ToThe23s );
-    fltx4 evenPart = SubSIMD( AndSIMD( LoadAlignedSIMD( (float *) g_SIMD_lsbmask ), biased ), Four_2ToThe23s );
-    const fltx4 overshot = CmpGtSIMD( evenPart, magnitude );
-    evenPart = MaskedAssign( overshot, SubSIMD( evenPart, Four_Twos ), evenPart );
-    const fltx4 signBits = XorSIMD( val, magnitude );
-    return XorSIMD( SubSIMD( magnitude, evenPart ), signBits ); // restore sign bits
-}
-
-// Variant of Mod2SIMD without the absolute value and sign restore steps;
-// as the name says, the caller is expected to pass non-negative input.
-FORCEINLINE fltx4 Mod2SIMDPositiveInput( const fltx4 &val )
-{
-    const fltx4 biased = AddSIMD( val, Four_2ToThe23s );
-    fltx4 evenPart = SubSIMD( AndSIMD( LoadAlignedSIMD( g_SIMD_lsbmask ), biased ), Four_2ToThe23s );
-    const fltx4 overshot = CmpGtSIMD( evenPart, val );
-    evenPart = MaskedAssign( overshot, SubSIMD( evenPart, Four_Twos ), evenPart );
-    return SubSIMD( val, evenPart );
-}
-
-
-// approximate sin of an angle, with -1..1 representing the whole sin wave period instead of -pi..pi.
-// no range reduction is done - for values outside of 0..1 you won't like the results
-FORCEINLINE fltx4 _SinEst01SIMD( const fltx4 &val )
-{
-    // really rough approximation - x*(4-x*4) - a parabola. s(0) = 0, s(.5) = 1, s(1)=0, smooth in-between.
-    // sufficient for simple oscillation.
-    const fltx4 fourTimesVal = MulSIMD( val, Four_Fours );
-    const fltx4 fourMinusFourVal = SubSIMD( Four_Fours, fourTimesVal );
-    return MulSIMD( val, fourMinusFourVal );
-}
-
-// Better sin approximation on 0..1; like _SinEst01SIMD, no range reduction is done.
-FORCEINLINE fltx4 _Sin01SIMD( const fltx4 &val )
-{
-    // not a bad approximation : parabola always over-estimates. Squared parabola always
-    // underestimates. So lets blend between them: goodsin = badsin + .225*( badsin^2-badsin)
-    const fltx4 parabola = MulSIMD( val, SubSIMD( Four_Fours, MulSIMD( val, Four_Fours ) ) );
-    const fltx4 parabolaSq = MulSIMD( parabola, parabola );
-    const fltx4 blend = MulSIMD( Four_Point225s, SubSIMD( parabolaSq, parabola ) );
-    return AddSIMD( blend, parabola );
-}
-
-// full range useable implementations
-// Rough sin estimate: reduces |val| with Mod2SIMDPositiveInput, evaluates
-// _SinEst01SIMD on the reduced value and then fixes up the sign.
-FORCEINLINE fltx4 SinEst01SIMD( const fltx4 &val )
-{
-    const fltx4 magnitude = fabs( val );
-    const fltx4 reduced = Mod2SIMDPositiveInput( magnitude );
-    // Lanes whose reduced value is >= 1 have one subtracted before the estimate.
-    const fltx4 upperHalfMask = CmpGeSIMD( reduced, Four_Ones );
-    const fltx4 halfPhase = SubSIMD( reduced, AndSIMD( Four_Ones, upperHalfMask ) );
-    const fltx4 estimate = _SinEst01SIMD( halfPhase );
-    // Flip the sign bit when exactly one of (val's sign bit, upper-half mask) is set.
-    const fltx4 signFlip = AndSIMD( LoadAlignedSIMD( g_SIMD_signmask ), XorSIMD( val, upperHalfMask ) );
-    return XorSIMD( estimate, signFlip );
-}
-
-// Full range sin approximation: reduces |val| with Mod2SIMDPositiveInput,
-// evaluates _Sin01SIMD on the reduced value and then fixes up the sign.
-FORCEINLINE fltx4 Sin01SIMD( const fltx4 &val )
-{
-    const fltx4 magnitude = fabs( val );
-    const fltx4 reduced = Mod2SIMDPositiveInput( magnitude );
-    // Lanes whose reduced value is >= 1 have one subtracted before the approximation.
-    const fltx4 upperHalfMask = CmpGeSIMD( reduced, Four_Ones );
-    const fltx4 halfPhase = SubSIMD( reduced, AndSIMD( Four_Ones, upperHalfMask ) );
-    const fltx4 approx = _Sin01SIMD( halfPhase );
-    // Flip the sign bit when exactly one of (val's sign bit, upper-half mask) is set.
-    const fltx4 signFlip = AndSIMD( LoadAlignedSIMD( g_SIMD_signmask ), XorSIMD( val, upperHalfMask ) );
-    return XorSIMD( approx, signFlip );
-}
-
-// Schlick style Bias approximation see graphics gems 4 : bias(t,a)= t/( (1/a-2)*(1-t)+1)
-
-// Returns the (1/a - 2) term for BiasSIMD.
-FORCEINLINE fltx4 PreCalcBiasParameter( const fltx4 &bias_parameter )
-{
-    // convert perlin-style-bias parameter to the value right for the approximation
-    const fltx4 oneOverBias = ReciprocalSIMD( bias_parameter );
-    return SubSIMD( oneOverBias, Four_Twos );
-}
-
-// similar to bias function except pass precalced bias value from calling PreCalcBiasParameter.
-// Computes val / ( precalc_param * (1 - val) + 1 ).
-FORCEINLINE fltx4 BiasSIMD( const fltx4 &val, const fltx4 &precalc_param )
-{
-    //!!speed!! use reciprocal est?
-    //!!speed!! could save one op by precalcing _2_ values
-    const fltx4 oneMinusVal = SubSIMD( Four_Ones, val );
-    const fltx4 denominator = AddSIMD( MulSIMD( precalc_param, oneMinusVal ), Four_Ones );
-    return DivSIMD( val, denominator );
-}
-
-//-----------------------------------------------------------------------------
-// Box/plane test
-// NOTE: The w component of emins + emaxs must be 1 for this to work
-//-----------------------------------------------------------------------------
-// Classifies the box [emins, emaxs] against plane p.
-// The return value is a sum of two flags:
-//   1 - the corner furthest along the plane normal has 4-D dot >= tolerance
-//   2 - the opposite corner has 4-D dot < -tolerance
-// so 0, 1, 2 or 3 can come back.
-FORCEINLINE int BoxOnPlaneSideSIMD( const fltx4& emins, const fltx4& emaxs, const cplane_t *p, float tolerance = 0.f )
-{
- // Plane packed as ( normal.x, normal.y, normal.z, -dist ) so a 4-D dot with
- // a point whose w is 1 yields the signed plane distance.
- const fltx4 fl4Plane = SetWSIMD( LoadUnalignedSIMD( p->normal.Base() ), ReplicateX4( -p->dist ) );
-
- // Per lane, choose the box extent that maximises (far) / minimises (near)
- // the dot product with the plane.
- const fltx4 fl4NonNegative = CmpGeSIMD( fl4Plane, Four_Zeros );
- const fltx4 fl4FarCorner = MaskedAssign( fl4NonNegative, emaxs, emins );
- const fltx4 fl4NearCorner = MaskedAssign( fl4NonNegative, emins, emaxs );
-
- const fltx4 fl4FarDot = Dot4SIMD( fl4Plane, fl4FarCorner );
- const fltx4 fl4NearDot = Dot4SIMD( fl4Plane, fl4NearCorner );
-
- // Turn the two comparisons into the float flags 1.0 and 2.0 and add them.
- const fltx4 fl4FrontFlag = MaskedAssign( CmpGeSIMD( fl4FarDot, ReplicateX4( tolerance ) ), Four_Ones, Four_Zeros );
- const fltx4 fl4BackFlag = MaskedAssign( CmpGtSIMD( ReplicateX4( -tolerance ), fl4NearDot ), Four_Twos, Four_Zeros );
-
- // Every lane holds the same sum; convert to int and return lane 0.
- intx4 sides;
- ConvertStoreAsIntsSIMD( &sides, AddSIMD( fl4FrontFlag, fl4BackFlag ) );
- return sides[0];
-}
-
-#endif // SSEMATH_H
+//========= Copyright Valve Corporation, All rights reserved. ============// +// +// Purpose: - defines SIMD "structure of arrays" classes and functions. +// +//===========================================================================// +#ifndef SSEMATH_H +#define SSEMATH_H + +#if defined( _X360 ) +#include <xboxmath.h> +#else +#include <xmmintrin.h> +#endif + +#include <mathlib/vector.h> +#include <mathlib/mathlib.h> + +#if defined(GNUC) +#define USE_STDC_FOR_SIMD 0 +#else +#define USE_STDC_FOR_SIMD 0 +#endif + +#if (!defined(_X360) && (USE_STDC_FOR_SIMD == 0)) +#define _SSE1 1 +#endif + +// I thought about defining a class/union for the SIMD packed floats instead of using fltx4, +// but decided against it because (a) the nature of SIMD code which includes comparisons is to blur +// the relationship between packed floats and packed integer types and (b) not sure that the +// compiler would handle generating good code for the intrinsics. + +#if USE_STDC_FOR_SIMD + +typedef union +{ + float m128_f32[4]; + uint32 m128_u32[4]; +} fltx4; + +typedef fltx4 i32x4; +typedef fltx4 u32x4; + +#elif ( defined( _X360 ) ) + +typedef union +{ + // This union allows float/int access (which generally shouldn't be done in inner loops) + __vector4 vmx; + float m128_f32[4]; + uint32 m128_u32[4]; +} fltx4_union; + +typedef __vector4 fltx4; +typedef __vector4 i32x4; // a VMX register; just a way of making it explicit that we're doing integer ops. +typedef __vector4 u32x4; // a VMX register; just a way of making it explicit that we're doing unsigned integer ops. + +#else + +typedef __m128 fltx4; +typedef __m128 i32x4; +typedef __m128 u32x4; + +#endif + +// The FLTX4 type is a fltx4 used as a parameter to a function. +// On the 360, the best way to do this is pass-by-copy on the registers. +// On the PC, the best way is to pass by const reference. 
+// The compiler will sometimes, but not always, replace a pass-by-const-ref +// with a pass-in-reg on the 360; to avoid this confusion, you can +// explicitly use a FLTX4 as the parameter type. +#ifdef _X360 +typedef __vector4 FLTX4; +#else +typedef const fltx4 & FLTX4; +#endif + +// A 16-byte aligned int32 datastructure +// (for use when writing out fltx4's as SIGNED +// ints). +struct ALIGN16 intx4 +{ + int32 m_i32[4]; + + inline int & operator[](int which) + { + return m_i32[which]; + } + + inline const int & operator[](int which) const + { + return m_i32[which]; + } + + inline int32 *Base() { + return m_i32; + } + + inline const int32 *Base() const + { + return m_i32; + } + + inline const bool operator==(const intx4 &other) const + { + return m_i32[0] == other.m_i32[0] && + m_i32[1] == other.m_i32[1] && + m_i32[2] == other.m_i32[2] && + m_i32[3] == other.m_i32[3] ; + } +} ALIGN16_POST; + + +#if defined( _DEBUG ) && defined( _X360 ) +FORCEINLINE void TestVPUFlags() +{ + // Check that the VPU is in the appropriate (Java-compliant) mode (see 3.2.1 in altivec_pem.pdf on xds.xbox.com) + __vector4 a; + __asm + { + mfvscr a; + } + unsigned int * flags = (unsigned int *)&a; + unsigned int controlWord = flags[3]; + Assert(controlWord == 0); +} +#else // _DEBUG +FORCEINLINE void TestVPUFlags() {} +#endif // _DEBUG + + +// useful constants in SIMD packed float format: +// (note: some of these aren't stored on the 360, +// but are manufactured directly in one or two +// instructions, saving a load and possible L2 +// miss.) +#ifndef _X360 +extern const fltx4 Four_Zeros; // 0 0 0 0 +extern const fltx4 Four_Ones; // 1 1 1 1 +extern const fltx4 Four_Twos; // 2 2 2 2 +extern const fltx4 Four_Threes; // 3 3 3 3 +extern const fltx4 Four_Fours; // guess. 
+extern const fltx4 Four_Point225s; // .225 .225 .225 .225 +extern const fltx4 Four_PointFives; // .5 .5 .5 .5 +extern const fltx4 Four_Epsilons; // FLT_EPSILON FLT_EPSILON FLT_EPSILON FLT_EPSILON +extern const fltx4 Four_2ToThe21s; // (1<<21).. +extern const fltx4 Four_2ToThe22s; // (1<<22).. +extern const fltx4 Four_2ToThe23s; // (1<<23).. +extern const fltx4 Four_2ToThe24s; // (1<<24).. +extern const fltx4 Four_Origin; // 0 0 0 1 (origin point, like vr0 on the PS2) +extern const fltx4 Four_NegativeOnes; // -1 -1 -1 -1 +#else +#define Four_Zeros XMVectorZero() // 0 0 0 0 +#define Four_Ones XMVectorSplatOne() // 1 1 1 1 +extern const fltx4 Four_Twos; // 2 2 2 2 +extern const fltx4 Four_Threes; // 3 3 3 3 +extern const fltx4 Four_Fours; // guess. +extern const fltx4 Four_Point225s; // .225 .225 .225 .225 +extern const fltx4 Four_PointFives; // .5 .5 .5 .5 +extern const fltx4 Four_Epsilons; // FLT_EPSILON FLT_EPSILON FLT_EPSILON FLT_EPSILON +extern const fltx4 Four_2ToThe21s; // (1<<21).. +extern const fltx4 Four_2ToThe22s; // (1<<22).. +extern const fltx4 Four_2ToThe23s; // (1<<23).. +extern const fltx4 Four_2ToThe24s; // (1<<24).. 
+extern const fltx4 Four_Origin; // 0 0 0 1 (origin point, like vr0 on the PS2) +extern const fltx4 Four_NegativeOnes; // -1 -1 -1 -1 +#endif +extern const fltx4 Four_FLT_MAX; // FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX +extern const fltx4 Four_Negative_FLT_MAX; // -FLT_MAX, -FLT_MAX, -FLT_MAX, -FLT_MAX +extern const fltx4 g_SIMD_0123; // 0 1 2 3 as float + +// external aligned integer constants +extern const ALIGN16 int32 g_SIMD_clear_signmask[] ALIGN16_POST; // 0x7fffffff x 4 +extern const ALIGN16 int32 g_SIMD_signmask[] ALIGN16_POST; // 0x80000000 x 4 +extern const ALIGN16 int32 g_SIMD_lsbmask[] ALIGN16_POST; // 0xfffffffe x 4 +extern const ALIGN16 int32 g_SIMD_clear_wmask[] ALIGN16_POST; // -1 -1 -1 0 +extern const ALIGN16 int32 g_SIMD_ComponentMask[4][4] ALIGN16_POST; // [0xFFFFFFFF 0 0 0], [0 0xFFFFFFFF 0 0], [0 0 0xFFFFFFFF 0], [0 0 0 0xFFFFFFFF] +extern const ALIGN16 int32 g_SIMD_AllOnesMask[] ALIGN16_POST; // ~0,~0,~0,~0 +extern const ALIGN16 int32 g_SIMD_Low16BitsMask[] ALIGN16_POST; // 0xffff x 4 + +// this mask is used for skipping the tail of things. If you have N elements in an array, and wish +// to mask out the tail, g_SIMD_SkipTailMask[N & 3] what you want to use for the last iteration. +extern const int32 ALIGN16 g_SIMD_SkipTailMask[4][4] ALIGN16_POST; + +// Define prefetch macros. +// The characteristics of cache and prefetch are completely +// different between the different platforms, so you DO NOT +// want to just define one macro that maps to every platform +// intrinsic under the hood -- you need to prefetch at different +// intervals between x86 and PPC, for example, and that is +// a higher level code change. +// On the other hand, I'm tired of typing #ifdef _X360 +// all over the place, so this is just a nop on Intel, PS3. 
+#ifdef _X360 +#define PREFETCH360(address, offset) __dcbt(offset,address) +#else +#define PREFETCH360(x,y) // nothing +#endif + +#if USE_STDC_FOR_SIMD + +//--------------------------------------------------------------------- +// Standard C (fallback/Linux) implementation (only there for compat - slow) +//--------------------------------------------------------------------- + +FORCEINLINE float SubFloat( const fltx4 & a, int idx ) +{ + return a.m128_f32[ idx ]; +} + +FORCEINLINE float & SubFloat( fltx4 & a, int idx ) +{ + return a.m128_f32[idx]; +} + +FORCEINLINE uint32 SubInt( const fltx4 & a, int idx ) +{ + return a.m128_u32[idx]; +} + +FORCEINLINE uint32 & SubInt( fltx4 & a, int idx ) +{ + return a.m128_u32[idx]; +} + +// Return one in the fastest way -- on the x360, faster even than loading. +FORCEINLINE fltx4 LoadZeroSIMD( void ) +{ + return Four_Zeros; +} + +// Return one in the fastest way -- on the x360, faster even than loading. +FORCEINLINE fltx4 LoadOneSIMD( void ) +{ + return Four_Ones; +} + +FORCEINLINE fltx4 SplatXSIMD( const fltx4 & a ) +{ + fltx4 retVal; + SubFloat( retVal, 0 ) = SubFloat( a, 0 ); + SubFloat( retVal, 1 ) = SubFloat( a, 0 ); + SubFloat( retVal, 2 ) = SubFloat( a, 0 ); + SubFloat( retVal, 3 ) = SubFloat( a, 0 ); + return retVal; +} + +FORCEINLINE fltx4 SplatYSIMD( fltx4 a ) +{ + fltx4 retVal; + SubFloat( retVal, 0 ) = SubFloat( a, 1 ); + SubFloat( retVal, 1 ) = SubFloat( a, 1 ); + SubFloat( retVal, 2 ) = SubFloat( a, 1 ); + SubFloat( retVal, 3 ) = SubFloat( a, 1 ); + return retVal; +} + +FORCEINLINE fltx4 SplatZSIMD( fltx4 a ) +{ + fltx4 retVal; + SubFloat( retVal, 0 ) = SubFloat( a, 2 ); + SubFloat( retVal, 1 ) = SubFloat( a, 2 ); + SubFloat( retVal, 2 ) = SubFloat( a, 2 ); + SubFloat( retVal, 3 ) = SubFloat( a, 2 ); + return retVal; +} + +FORCEINLINE fltx4 SplatWSIMD( fltx4 a ) +{ + fltx4 retVal; + SubFloat( retVal, 0 ) = SubFloat( a, 3 ); + SubFloat( retVal, 1 ) = SubFloat( a, 3 ); + SubFloat( retVal, 2 ) = SubFloat( a, 3 ); + 
SubFloat( retVal, 3 ) = SubFloat( a, 3 ); + return retVal; +} + +FORCEINLINE fltx4 SetXSIMD( const fltx4& a, const fltx4& x ) +{ + fltx4 result = a; + SubFloat( result, 0 ) = SubFloat( x, 0 ); + return result; +} + +FORCEINLINE fltx4 SetYSIMD( const fltx4& a, const fltx4& y ) +{ + fltx4 result = a; + SubFloat( result, 1 ) = SubFloat( y, 1 ); + return result; +} + +FORCEINLINE fltx4 SetZSIMD( const fltx4& a, const fltx4& z ) +{ + fltx4 result = a; + SubFloat( result, 2 ) = SubFloat( z, 2 ); + return result; +} + +FORCEINLINE fltx4 SetWSIMD( const fltx4& a, const fltx4& w ) +{ + fltx4 result = a; + SubFloat( result, 3 ) = SubFloat( w, 3 ); + return result; +} + +FORCEINLINE fltx4 SetComponentSIMD( const fltx4& a, int nComponent, float flValue ) +{ + fltx4 result = a; + SubFloat( result, nComponent ) = flValue; + return result; +} + +// a b c d -> b c d a +FORCEINLINE fltx4 RotateLeft( const fltx4 & a ) +{ + fltx4 retVal; + SubFloat( retVal, 0 ) = SubFloat( a, 1 ); + SubFloat( retVal, 1 ) = SubFloat( a, 2 ); + SubFloat( retVal, 2 ) = SubFloat( a, 3 ); + SubFloat( retVal, 3 ) = SubFloat( a, 0 ); + return retVal; +} + +// a b c d -> c d a b +FORCEINLINE fltx4 RotateLeft2( const fltx4 & a ) +{ + fltx4 retVal; + SubFloat( retVal, 0 ) = SubFloat( a, 2 ); + SubFloat( retVal, 1 ) = SubFloat( a, 3 ); + SubFloat( retVal, 2 ) = SubFloat( a, 0 ); + SubFloat( retVal, 3 ) = SubFloat( a, 1 ); + return retVal; +} + +#define BINOP(op) \ + fltx4 retVal; \ + SubFloat( retVal, 0 ) = ( SubFloat( a, 0 ) op SubFloat( b, 0 ) ); \ + SubFloat( retVal, 1 ) = ( SubFloat( a, 1 ) op SubFloat( b, 1 ) ); \ + SubFloat( retVal, 2 ) = ( SubFloat( a, 2 ) op SubFloat( b, 2 ) ); \ + SubFloat( retVal, 3 ) = ( SubFloat( a, 3 ) op SubFloat( b, 3 ) ); \ + return retVal; + +#define IBINOP(op) \ + fltx4 retVal; \ + SubInt( retVal, 0 ) = ( SubInt( a, 0 ) op SubInt ( b, 0 ) ); \ + SubInt( retVal, 1 ) = ( SubInt( a, 1 ) op SubInt ( b, 1 ) ); \ + SubInt( retVal, 2 ) = ( SubInt( a, 2 ) op SubInt ( b, 2 ) ); \ + 
SubInt( retVal, 3 ) = ( SubInt( a, 3 ) op SubInt ( b, 3 ) ); \ + return retVal; + +FORCEINLINE fltx4 AddSIMD( const fltx4 & a, const fltx4 & b ) +{ + BINOP(+); +} + +FORCEINLINE fltx4 SubSIMD( const fltx4 & a, const fltx4 & b ) // a-b +{ + BINOP(-); +}; + +FORCEINLINE fltx4 MulSIMD( const fltx4 & a, const fltx4 & b ) // a*b +{ + BINOP(*); +} + +FORCEINLINE fltx4 DivSIMD( const fltx4 & a, const fltx4 & b ) // a/b +{ + BINOP(/); +} + + +FORCEINLINE fltx4 MaddSIMD( const fltx4 & a, const fltx4 & b, const fltx4 & c ) // a*b + c +{ + return AddSIMD( MulSIMD(a,b), c ); +} + +FORCEINLINE fltx4 MsubSIMD( const fltx4 & a, const fltx4 & b, const fltx4 & c ) // c - a*b +{ + return SubSIMD( c, MulSIMD(a,b) ); +}; + + +FORCEINLINE fltx4 SinSIMD( const fltx4 &radians ) +{ + fltx4 result; + SubFloat( result, 0 ) = sin( SubFloat( radians, 0 ) ); + SubFloat( result, 1 ) = sin( SubFloat( radians, 1 ) ); + SubFloat( result, 2 ) = sin( SubFloat( radians, 2 ) ); + SubFloat( result, 3 ) = sin( SubFloat( radians, 3 ) ); + return result; +} + +FORCEINLINE void SinCos3SIMD( fltx4 &sine, fltx4 &cosine, const fltx4 &radians ) +{ + SinCos( SubFloat( radians, 0 ), &SubFloat( sine, 0 ), &SubFloat( cosine, 0 ) ); + SinCos( SubFloat( radians, 1 ), &SubFloat( sine, 1 ), &SubFloat( cosine, 1 ) ); + SinCos( SubFloat( radians, 2 ), &SubFloat( sine, 2 ), &SubFloat( cosine, 2 ) ); +} + +FORCEINLINE void SinCosSIMD( fltx4 &sine, fltx4 &cosine, const fltx4 &radians ) +{ + SinCos( SubFloat( radians, 0 ), &SubFloat( sine, 0 ), &SubFloat( cosine, 0 ) ); + SinCos( SubFloat( radians, 1 ), &SubFloat( sine, 1 ), &SubFloat( cosine, 1 ) ); + SinCos( SubFloat( radians, 2 ), &SubFloat( sine, 2 ), &SubFloat( cosine, 2 ) ); + SinCos( SubFloat( radians, 3 ), &SubFloat( sine, 3 ), &SubFloat( cosine, 3 ) ); +} + +FORCEINLINE fltx4 ArcSinSIMD( const fltx4 &sine ) +{ + fltx4 result; + SubFloat( result, 0 ) = asin( SubFloat( sine, 0 ) ); + SubFloat( result, 1 ) = asin( SubFloat( sine, 1 ) ); + SubFloat( result, 2 ) = asin( 
SubFloat( sine, 2 ) ); + SubFloat( result, 3 ) = asin( SubFloat( sine, 3 ) ); + return result; +} + +FORCEINLINE fltx4 ArcCosSIMD( const fltx4 &cs ) +{ + fltx4 result; + SubFloat( result, 0 ) = acos( SubFloat( cs, 0 ) ); + SubFloat( result, 1 ) = acos( SubFloat( cs, 1 ) ); + SubFloat( result, 2 ) = acos( SubFloat( cs, 2 ) ); + SubFloat( result, 3 ) = acos( SubFloat( cs, 3 ) ); + return result; +} + +// tan^1(a/b) .. ie, pass sin in as a and cos in as b +FORCEINLINE fltx4 ArcTan2SIMD( const fltx4 &a, const fltx4 &b ) +{ + fltx4 result; + SubFloat( result, 0 ) = atan2( SubFloat( a, 0 ), SubFloat( b, 0 ) ); + SubFloat( result, 1 ) = atan2( SubFloat( a, 1 ), SubFloat( b, 1 ) ); + SubFloat( result, 2 ) = atan2( SubFloat( a, 2 ), SubFloat( b, 2 ) ); + SubFloat( result, 3 ) = atan2( SubFloat( a, 3 ), SubFloat( b, 3 ) ); + return result; +} + +FORCEINLINE fltx4 MaxSIMD( const fltx4 & a, const fltx4 & b ) // max(a,b) +{ + fltx4 retVal; + SubFloat( retVal, 0 ) = max( SubFloat( a, 0 ), SubFloat( b, 0 ) ); + SubFloat( retVal, 1 ) = max( SubFloat( a, 1 ), SubFloat( b, 1 ) ); + SubFloat( retVal, 2 ) = max( SubFloat( a, 2 ), SubFloat( b, 2 ) ); + SubFloat( retVal, 3 ) = max( SubFloat( a, 3 ), SubFloat( b, 3 ) ); + return retVal; +} + +FORCEINLINE fltx4 MinSIMD( const fltx4 & a, const fltx4 & b ) // min(a,b) +{ + fltx4 retVal; + SubFloat( retVal, 0 ) = min( SubFloat( a, 0 ), SubFloat( b, 0 ) ); + SubFloat( retVal, 1 ) = min( SubFloat( a, 1 ), SubFloat( b, 1 ) ); + SubFloat( retVal, 2 ) = min( SubFloat( a, 2 ), SubFloat( b, 2 ) ); + SubFloat( retVal, 3 ) = min( SubFloat( a, 3 ), SubFloat( b, 3 ) ); + return retVal; +} + +FORCEINLINE fltx4 AndSIMD( const fltx4 & a, const fltx4 & b ) // a & b +{ + IBINOP(&); +} + +FORCEINLINE fltx4 AndNotSIMD( const fltx4 & a, const fltx4 & b ) // ~a & b +{ + fltx4 retVal; + SubInt( retVal, 0 ) = ~SubInt( a, 0 ) & SubInt( b, 0 ); + SubInt( retVal, 1 ) = ~SubInt( a, 1 ) & SubInt( b, 1 ); + SubInt( retVal, 2 ) = ~SubInt( a, 2 ) & SubInt( b, 2 ); + 
SubInt( retVal, 3 ) = ~SubInt( a, 3 ) & SubInt( b, 3 ); + return retVal; +} + +FORCEINLINE fltx4 XorSIMD( const fltx4 & a, const fltx4 & b ) // a ^ b +{ + IBINOP(^); +} + +FORCEINLINE fltx4 OrSIMD( const fltx4 & a, const fltx4 & b ) // a | b +{ + IBINOP(|); +} + +FORCEINLINE fltx4 NegSIMD(const fltx4 &a) // negate: -a +{ + fltx4 retval; + SubFloat( retval, 0 ) = -SubFloat( a, 0 ); + SubFloat( retval, 1 ) = -SubFloat( a, 1 ); + SubFloat( retval, 2 ) = -SubFloat( a, 2 ); + SubFloat( retval, 3 ) = -SubFloat( a, 3 ); + + return retval; +} + +FORCEINLINE bool IsAllZeros( const fltx4 & a ) // all floats of a zero? +{ + return ( SubFloat( a, 0 ) == 0.0 ) && + ( SubFloat( a, 1 ) == 0.0 ) && + ( SubFloat( a, 2 ) == 0.0 ) && + ( SubFloat( a, 3 ) == 0.0 ) ; +} + + +// for branching when a.xyzw > b.xyzw +FORCEINLINE bool IsAllGreaterThan( const fltx4 &a, const fltx4 &b ) +{ + return SubFloat(a,0) > SubFloat(b,0) && + SubFloat(a,1) > SubFloat(b,1) && + SubFloat(a,2) > SubFloat(b,2) && + SubFloat(a,3) > SubFloat(b,3); +} + +// for branching when a.xyzw >= b.xyzw +FORCEINLINE bool IsAllGreaterThanOrEq( const fltx4 &a, const fltx4 &b ) +{ + return SubFloat(a,0) >= SubFloat(b,0) && + SubFloat(a,1) >= SubFloat(b,1) && + SubFloat(a,2) >= SubFloat(b,2) && + SubFloat(a,3) >= SubFloat(b,3); +} + +// For branching if all a.xyzw == b.xyzw +FORCEINLINE bool IsAllEqual( const fltx4 & a, const fltx4 & b ) +{ + return SubFloat(a,0) == SubFloat(b,0) && + SubFloat(a,1) == SubFloat(b,1) && + SubFloat(a,2) == SubFloat(b,2) && + SubFloat(a,3) == SubFloat(b,3); +} + +FORCEINLINE int TestSignSIMD( const fltx4 & a ) // mask of which floats have the high bit set +{ + int nRet = 0; + + nRet |= ( SubInt( a, 0 ) & 0x80000000 ) >> 31; // sign(x) -> bit 0 + nRet |= ( SubInt( a, 1 ) & 0x80000000 ) >> 30; // sign(y) -> bit 1 + nRet |= ( SubInt( a, 2 ) & 0x80000000 ) >> 29; // sign(z) -> bit 2 + nRet |= ( SubInt( a, 3 ) & 0x80000000 ) >> 28; // sign(w) -> bit 3 + + return nRet; +} + +FORCEINLINE bool 
IsAnyNegative( const fltx4 & a ) // (a.x < 0) || (a.y < 0) || (a.z < 0) || (a.w < 0) +{ + return (0 != TestSignSIMD( a )); +} + +FORCEINLINE fltx4 CmpEqSIMD( const fltx4 & a, const fltx4 & b ) // (a==b) ? ~0:0 +{ + fltx4 retVal; + SubInt( retVal, 0 ) = ( SubFloat( a, 0 ) == SubFloat( b, 0 )) ? ~0 : 0; + SubInt( retVal, 1 ) = ( SubFloat( a, 1 ) == SubFloat( b, 1 )) ? ~0 : 0; + SubInt( retVal, 2 ) = ( SubFloat( a, 2 ) == SubFloat( b, 2 )) ? ~0 : 0; + SubInt( retVal, 3 ) = ( SubFloat( a, 3 ) == SubFloat( b, 3 )) ? ~0 : 0; + return retVal; +} + +FORCEINLINE fltx4 CmpGtSIMD( const fltx4 & a, const fltx4 & b ) // (a>b) ? ~0:0 +{ + fltx4 retVal; + SubInt( retVal, 0 ) = ( SubFloat( a, 0 ) > SubFloat( b, 0 )) ? ~0 : 0; + SubInt( retVal, 1 ) = ( SubFloat( a, 1 ) > SubFloat( b, 1 )) ? ~0 : 0; + SubInt( retVal, 2 ) = ( SubFloat( a, 2 ) > SubFloat( b, 2 )) ? ~0 : 0; + SubInt( retVal, 3 ) = ( SubFloat( a, 3 ) > SubFloat( b, 3 )) ? ~0 : 0; + return retVal; +} + +FORCEINLINE fltx4 CmpGeSIMD( const fltx4 & a, const fltx4 & b ) // (a>=b) ? ~0:0 +{ + fltx4 retVal; + SubInt( retVal, 0 ) = ( SubFloat( a, 0 ) >= SubFloat( b, 0 )) ? ~0 : 0; + SubInt( retVal, 1 ) = ( SubFloat( a, 1 ) >= SubFloat( b, 1 )) ? ~0 : 0; + SubInt( retVal, 2 ) = ( SubFloat( a, 2 ) >= SubFloat( b, 2 )) ? ~0 : 0; + SubInt( retVal, 3 ) = ( SubFloat( a, 3 ) >= SubFloat( b, 3 )) ? ~0 : 0; + return retVal; +} + +FORCEINLINE fltx4 CmpLtSIMD( const fltx4 & a, const fltx4 & b ) // (a<b) ? ~0:0 +{ + fltx4 retVal; + SubInt( retVal, 0 ) = ( SubFloat( a, 0 ) < SubFloat( b, 0 )) ? ~0 : 0; + SubInt( retVal, 1 ) = ( SubFloat( a, 1 ) < SubFloat( b, 1 )) ? ~0 : 0; + SubInt( retVal, 2 ) = ( SubFloat( a, 2 ) < SubFloat( b, 2 )) ? ~0 : 0; + SubInt( retVal, 3 ) = ( SubFloat( a, 3 ) < SubFloat( b, 3 )) ? ~0 : 0; + return retVal; +} + +FORCEINLINE fltx4 CmpLeSIMD( const fltx4 & a, const fltx4 & b ) // (a<=b) ? ~0:0 +{ + fltx4 retVal; + SubInt( retVal, 0 ) = ( SubFloat( a, 0 ) <= SubFloat( b, 0 )) ? 
~0 : 0; + SubInt( retVal, 1 ) = ( SubFloat( a, 1 ) <= SubFloat( b, 1 )) ? ~0 : 0; + SubInt( retVal, 2 ) = ( SubFloat( a, 2 ) <= SubFloat( b, 2 )) ? ~0 : 0; + SubInt( retVal, 3 ) = ( SubFloat( a, 3 ) <= SubFloat( b, 3 )) ? ~0 : 0; + return retVal; +} + +FORCEINLINE fltx4 CmpInBoundsSIMD( const fltx4 & a, const fltx4 & b ) // (a <= b && a >= -b) ? ~0 : 0 +{ + fltx4 retVal; + SubInt( retVal, 0 ) = ( SubFloat( a, 0 ) <= SubFloat( b, 0 ) && SubFloat( a, 0 ) >= -SubFloat( b, 0 ) ) ? ~0 : 0; + SubInt( retVal, 1 ) = ( SubFloat( a, 1 ) <= SubFloat( b, 1 ) && SubFloat( a, 1 ) >= -SubFloat( b, 1 ) ) ? ~0 : 0; + SubInt( retVal, 2 ) = ( SubFloat( a, 2 ) <= SubFloat( b, 2 ) && SubFloat( a, 2 ) >= -SubFloat( b, 2 ) ) ? ~0 : 0; + SubInt( retVal, 3 ) = ( SubFloat( a, 3 ) <= SubFloat( b, 3 ) && SubFloat( a, 3 ) >= -SubFloat( b, 3 ) ) ? ~0 : 0; + return retVal; +} + + +FORCEINLINE fltx4 MaskedAssign( const fltx4 & ReplacementMask, const fltx4 & NewValue, const fltx4 & OldValue ) +{ + return OrSIMD( + AndSIMD( ReplacementMask, NewValue ), + AndNotSIMD( ReplacementMask, OldValue ) ); +} + +FORCEINLINE fltx4 ReplicateX4( float flValue ) // a,a,a,a +{ + fltx4 retVal; + SubFloat( retVal, 0 ) = flValue; + SubFloat( retVal, 1 ) = flValue; + SubFloat( retVal, 2 ) = flValue; + SubFloat( retVal, 3 ) = flValue; + return retVal; +} + +/// replicate a single 32 bit integer value to all 4 components of an m128 +FORCEINLINE fltx4 ReplicateIX4( int nValue ) +{ + fltx4 retVal; + SubInt( retVal, 0 ) = nValue; + SubInt( retVal, 1 ) = nValue; + SubInt( retVal, 2 ) = nValue; + SubInt( retVal, 3 ) = nValue; + return retVal; + +} + +// Round towards positive infinity +FORCEINLINE fltx4 CeilSIMD( const fltx4 &a ) +{ + fltx4 retVal; + SubFloat( retVal, 0 ) = ceil( SubFloat( a, 0 ) ); + SubFloat( retVal, 1 ) = ceil( SubFloat( a, 1 ) ); + SubFloat( retVal, 2 ) = ceil( SubFloat( a, 2 ) ); + SubFloat( retVal, 3 ) = ceil( SubFloat( a, 3 ) ); + return retVal; + +} + +// Round towards negative infinity +FORCEINLINE 
fltx4 FloorSIMD( const fltx4 &a ) +{ + fltx4 retVal; + SubFloat( retVal, 0 ) = floor( SubFloat( a, 0 ) ); + SubFloat( retVal, 1 ) = floor( SubFloat( a, 1 ) ); + SubFloat( retVal, 2 ) = floor( SubFloat( a, 2 ) ); + SubFloat( retVal, 3 ) = floor( SubFloat( a, 3 ) ); + return retVal; + +} + +FORCEINLINE fltx4 SqrtEstSIMD( const fltx4 & a ) // sqrt(a), more or less +{ + fltx4 retVal; + SubFloat( retVal, 0 ) = sqrt( SubFloat( a, 0 ) ); + SubFloat( retVal, 1 ) = sqrt( SubFloat( a, 1 ) ); + SubFloat( retVal, 2 ) = sqrt( SubFloat( a, 2 ) ); + SubFloat( retVal, 3 ) = sqrt( SubFloat( a, 3 ) ); + return retVal; +} + +FORCEINLINE fltx4 SqrtSIMD( const fltx4 & a ) // sqrt(a) +{ + fltx4 retVal; + SubFloat( retVal, 0 ) = sqrt( SubFloat( a, 0 ) ); + SubFloat( retVal, 1 ) = sqrt( SubFloat( a, 1 ) ); + SubFloat( retVal, 2 ) = sqrt( SubFloat( a, 2 ) ); + SubFloat( retVal, 3 ) = sqrt( SubFloat( a, 3 ) ); + return retVal; +} + +FORCEINLINE fltx4 ReciprocalSqrtEstSIMD( const fltx4 & a ) // 1/sqrt(a), more or less +{ + fltx4 retVal; + SubFloat( retVal, 0 ) = 1.0 / sqrt( SubFloat( a, 0 ) ); + SubFloat( retVal, 1 ) = 1.0 / sqrt( SubFloat( a, 1 ) ); + SubFloat( retVal, 2 ) = 1.0 / sqrt( SubFloat( a, 2 ) ); + SubFloat( retVal, 3 ) = 1.0 / sqrt( SubFloat( a, 3 ) ); + return retVal; +} + +FORCEINLINE fltx4 ReciprocalSqrtEstSaturateSIMD( const fltx4 & a ) +{ + fltx4 retVal; + SubFloat( retVal, 0 ) = 1.0 / sqrt( SubFloat( a, 0 ) != 0.0f ? SubFloat( a, 0 ) : FLT_EPSILON ); + SubFloat( retVal, 1 ) = 1.0 / sqrt( SubFloat( a, 1 ) != 0.0f ? SubFloat( a, 1 ) : FLT_EPSILON ); + SubFloat( retVal, 2 ) = 1.0 / sqrt( SubFloat( a, 2 ) != 0.0f ? SubFloat( a, 2 ) : FLT_EPSILON ); + SubFloat( retVal, 3 ) = 1.0 / sqrt( SubFloat( a, 3 ) != 0.0f ? 
SubFloat( a, 3 ) : FLT_EPSILON ); + return retVal; +} + +FORCEINLINE fltx4 ReciprocalSqrtSIMD( const fltx4 & a ) // 1/sqrt(a) +{ + fltx4 retVal; + SubFloat( retVal, 0 ) = 1.0 / sqrt( SubFloat( a, 0 ) ); + SubFloat( retVal, 1 ) = 1.0 / sqrt( SubFloat( a, 1 ) ); + SubFloat( retVal, 2 ) = 1.0 / sqrt( SubFloat( a, 2 ) ); + SubFloat( retVal, 3 ) = 1.0 / sqrt( SubFloat( a, 3 ) ); + return retVal; +} + +FORCEINLINE fltx4 ReciprocalEstSIMD( const fltx4 & a ) // 1/a, more or less +{ + fltx4 retVal; + SubFloat( retVal, 0 ) = 1.0 / SubFloat( a, 0 ); + SubFloat( retVal, 1 ) = 1.0 / SubFloat( a, 1 ); + SubFloat( retVal, 2 ) = 1.0 / SubFloat( a, 2 ); + SubFloat( retVal, 3 ) = 1.0 / SubFloat( a, 3 ); + return retVal; +} + +FORCEINLINE fltx4 ReciprocalSIMD( const fltx4 & a ) // 1/a +{ + fltx4 retVal; + SubFloat( retVal, 0 ) = 1.0 / SubFloat( a, 0 ); + SubFloat( retVal, 1 ) = 1.0 / SubFloat( a, 1 ); + SubFloat( retVal, 2 ) = 1.0 / SubFloat( a, 2 ); + SubFloat( retVal, 3 ) = 1.0 / SubFloat( a, 3 ); + return retVal; +} + +/// 1/x for all 4 values. +/// 1/0 will result in a big but NOT infinite result +FORCEINLINE fltx4 ReciprocalEstSaturateSIMD( const fltx4 & a ) +{ + fltx4 retVal; + SubFloat( retVal, 0 ) = 1.0 / (SubFloat( a, 0 ) == 0.0f ? FLT_EPSILON : SubFloat( a, 0 )); + SubFloat( retVal, 1 ) = 1.0 / (SubFloat( a, 1 ) == 0.0f ? FLT_EPSILON : SubFloat( a, 1 )); + SubFloat( retVal, 2 ) = 1.0 / (SubFloat( a, 2 ) == 0.0f ? FLT_EPSILON : SubFloat( a, 2 )); + SubFloat( retVal, 3 ) = 1.0 / (SubFloat( a, 3 ) == 0.0f ? FLT_EPSILON : SubFloat( a, 3 )); + return retVal; +} + +FORCEINLINE fltx4 ReciprocalSaturateSIMD( const fltx4 & a ) +{ + fltx4 retVal; + SubFloat( retVal, 0 ) = 1.0 / (SubFloat( a, 0 ) == 0.0f ? FLT_EPSILON : SubFloat( a, 0 )); + SubFloat( retVal, 1 ) = 1.0 / (SubFloat( a, 1 ) == 0.0f ? FLT_EPSILON : SubFloat( a, 1 )); + SubFloat( retVal, 2 ) = 1.0 / (SubFloat( a, 2 ) == 0.0f ? FLT_EPSILON : SubFloat( a, 2 )); + SubFloat( retVal, 3 ) = 1.0 / (SubFloat( a, 3 ) == 0.0f ? 
FLT_EPSILON : SubFloat( a, 3 )); + return retVal; +} + +// 2^x for all values (the antilog) +FORCEINLINE fltx4 ExpSIMD( const fltx4 &toPower ) +{ + fltx4 retVal; + SubFloat( retVal, 0 ) = powf( 2, SubFloat(toPower, 0) ); + SubFloat( retVal, 1 ) = powf( 2, SubFloat(toPower, 1) ); + SubFloat( retVal, 2 ) = powf( 2, SubFloat(toPower, 2) ); + SubFloat( retVal, 3 ) = powf( 2, SubFloat(toPower, 3) ); + + return retVal; +} + +FORCEINLINE fltx4 Dot3SIMD( const fltx4 &a, const fltx4 &b ) +{ + float flDot = SubFloat( a, 0 ) * SubFloat( b, 0 ) + + SubFloat( a, 1 ) * SubFloat( b, 1 ) + + SubFloat( a, 2 ) * SubFloat( b, 2 ); + return ReplicateX4( flDot ); +} + +FORCEINLINE fltx4 Dot4SIMD( const fltx4 &a, const fltx4 &b ) +{ + float flDot = SubFloat( a, 0 ) * SubFloat( b, 0 ) + + SubFloat( a, 1 ) * SubFloat( b, 1 ) + + SubFloat( a, 2 ) * SubFloat( b, 2 ) + + SubFloat( a, 3 ) * SubFloat( b, 3 ); + return ReplicateX4( flDot ); +} + +// Clamps the components of a vector to a specified minimum and maximum range. +FORCEINLINE fltx4 ClampVectorSIMD( FLTX4 in, FLTX4 min, FLTX4 max) +{ + return MaxSIMD( min, MinSIMD( max, in ) ); +} + +// Squelch the w component of a vector to +0.0. 
+// Most efficient when you say a = SetWToZeroSIMD(a) (avoids a copy) +FORCEINLINE fltx4 SetWToZeroSIMD( const fltx4 & a ) +{ + fltx4 retval; + retval = a; + SubFloat( retval, 0 ) = 0; + return retval; +} + +FORCEINLINE fltx4 LoadUnalignedSIMD( const void *pSIMD ) +{ + return *( reinterpret_cast< const fltx4 *> ( pSIMD ) ); +} + +FORCEINLINE fltx4 LoadUnaligned3SIMD( const void *pSIMD ) +{ + return *( reinterpret_cast< const fltx4 *> ( pSIMD ) ); +} + +FORCEINLINE fltx4 LoadAlignedSIMD( const void *pSIMD ) +{ + return *( reinterpret_cast< const fltx4 *> ( pSIMD ) ); +} + +// for the transitional class -- load a 3-by VectorAligned and squash its w component +FORCEINLINE fltx4 LoadAlignedSIMD( const VectorAligned & pSIMD ) +{ + fltx4 retval = LoadAlignedSIMD(pSIMD.Base()); + // squelch w + SubInt( retval, 3 ) = 0; + return retval; +} + +FORCEINLINE void StoreAlignedSIMD( float *pSIMD, const fltx4 & a ) +{ + *( reinterpret_cast< fltx4 *> ( pSIMD ) ) = a; +} + +FORCEINLINE void StoreUnalignedSIMD( float *pSIMD, const fltx4 & a ) +{ + *( reinterpret_cast< fltx4 *> ( pSIMD ) ) = a; +} + +FORCEINLINE void StoreUnaligned3SIMD( float *pSIMD, const fltx4 & a ) +{ + *pSIMD = SubFloat(a, 0); + *(pSIMD+1) = SubFloat(a, 1); + *(pSIMD+2) = SubFloat(a, 2); +} + +// strongly typed -- syntactic castor oil used for typechecking as we transition to SIMD +FORCEINLINE void StoreAligned3SIMD( VectorAligned * RESTRICT pSIMD, const fltx4 & a ) +{ + StoreAlignedSIMD(pSIMD->Base(),a); +} + +FORCEINLINE void TransposeSIMD( fltx4 & x, fltx4 & y, fltx4 & z, fltx4 & w ) +{ +#define SWAP_FLOATS( _a_, _ia_, _b_, _ib_ ) { float tmp = SubFloat( _a_, _ia_ ); SubFloat( _a_, _ia_ ) = SubFloat( _b_, _ib_ ); SubFloat( _b_, _ib_ ) = tmp; } + SWAP_FLOATS( x, 1, y, 0 ); + SWAP_FLOATS( x, 2, z, 0 ); + SWAP_FLOATS( x, 3, w, 0 ); + SWAP_FLOATS( y, 2, z, 1 ); + SWAP_FLOATS( y, 3, w, 1 ); + SWAP_FLOATS( z, 3, w, 2 ); +} + +// find the lowest component of a.x, a.y, a.z, +// and replicate it to the whole return 
+// value.
+FORCEINLINE fltx4 FindLowestSIMD3( const fltx4 & a )
+{
+ // Only lanes 0-2 take part; lane 3 (w) is ignored.
+ const float flLowestXY = min( SubFloat(a, 0), SubFloat(a, 1) );
+ return ReplicateX4( min( flLowestXY, SubFloat(a, 2) ) );
+}
+
+// find the highest component of a.x, a.y, a.z,
+// and replicate it to the whole return value.
+FORCEINLINE fltx4 FindHighestSIMD3( const fltx4 & a )
+{
+ // Only lanes 0-2 take part; lane 3 (w) is ignored.
+ const float flHighestXY = max( SubFloat(a, 0), SubFloat(a, 1) );
+ return ReplicateX4( max( flHighestXY, SubFloat(a, 2) ) );
+}
+
+// Fixed-point conversion and save as SIGNED INTS.
+// pDest->x = Int (vSrc.x)
+// note: some architectures have means of doing
+// fixed point conversion when the fix depth is
+// specified as an immediate.. but there is no way
+// to guarantee an immediate as a parameter to function
+// like this.
+FORCEINLINE void ConvertStoreAsIntsSIMD(intx4 * RESTRICT pDest, const fltx4 &vSrc)
+{
+ // Plain float -> int assignment for each of the four lanes.
+ for ( int nLane = 0; nLane < 4; ++nLane )
+ {
+  (*pDest)[nLane] = SubFloat(vSrc, nLane);
+ }
+}
+
+// ------------------------------------
+// INTEGER SIMD OPERATIONS.
+// ------------------------------------
+// splat all components of a vector to a signed immediate int number.
+FORCEINLINE fltx4 IntSetImmediateSIMD( int nValue ) +{ + fltx4 retval; + SubInt( retval, 0 ) = SubInt( retval, 1 ) = SubInt( retval, 2 ) = SubInt( retval, 3) = nValue; + return retval; +} + +// Load 4 aligned words into a SIMD register +FORCEINLINE i32x4 LoadAlignedIntSIMD(const void * RESTRICT pSIMD) +{ + return *( reinterpret_cast< const i32x4 *> ( pSIMD ) ); +} + +// Load 4 unaligned words into a SIMD register +FORCEINLINE i32x4 LoadUnalignedIntSIMD( const void * RESTRICT pSIMD) +{ + return *( reinterpret_cast< const i32x4 *> ( pSIMD ) ); +} + +// save into four words, 16-byte aligned +FORCEINLINE void StoreAlignedIntSIMD( int32 *pSIMD, const fltx4 & a ) +{ + *( reinterpret_cast< i32x4 *> ( pSIMD ) ) = a; +} + +FORCEINLINE void StoreAlignedIntSIMD( intx4 &pSIMD, const fltx4 & a ) +{ + *( reinterpret_cast< i32x4 *> ( pSIMD.Base() ) ) = a; +} + +FORCEINLINE void StoreUnalignedIntSIMD( int32 *pSIMD, const fltx4 & a ) +{ + *( reinterpret_cast< i32x4 *> ( pSIMD ) ) = a; +} + +// Take a fltx4 containing fixed-point uints and +// return them as single precision floats. No +// fixed point conversion is done. +FORCEINLINE fltx4 UnsignedIntConvertToFltSIMD( const u32x4 &vSrcA ) +{ + Assert(0); /* pc has no such operation */ + fltx4 retval; + SubFloat( retval, 0 ) = ( (float) SubInt( retval, 0 ) ); + SubFloat( retval, 1 ) = ( (float) SubInt( retval, 1 ) ); + SubFloat( retval, 2 ) = ( (float) SubInt( retval, 2 ) ); + SubFloat( retval, 3 ) = ( (float) SubInt( retval, 3 ) ); + return retval; +} + + +#if 0 /* pc has no such op */ +// Take a fltx4 containing fixed-point sints and +// return them as single precision floats. No +// fixed point conversion is done. 
// NOTE(review): this function sits inside a disabled "#if 0" region. As written,
// each line casts the *address* of a lane (reinterpret_cast<int32 *>) to float
// rather than the lane's value -- fix before ever re-enabling.
FORCEINLINE fltx4 SignedIntConvertToFltSIMD( const i32x4 &vSrcA )
{
	fltx4 retval;
	SubFloat( retval, 0 ) = ( (float) (reinterpret_cast<int32 *>(&vSrcA.m128_s32[0])) );
	SubFloat( retval, 1 ) = ( (float) (reinterpret_cast<int32 *>(&vSrcA.m128_s32[1])) );
	SubFloat( retval, 2 ) = ( (float) (reinterpret_cast<int32 *>(&vSrcA.m128_s32[2])) );
	SubFloat( retval, 3 ) = ( (float) (reinterpret_cast<int32 *>(&vSrcA.m128_s32[3])) );
	return retval;
}


/*
  works on fltx4's as if they are four uints.
  the first parameter contains the words to be shifted,
  the second contains the amount to shift by AS INTS

  for i = 0 to 3
    shift = vSrcB_i*32:(i*32)+4
    vReturned_i*32:(i*32)+31 = vSrcA_i*32:(i*32)+31 << shift
*/
FORCEINLINE i32x4 IntShiftLeftWordSIMD(const i32x4 &vSrcA, const i32x4 &vSrcB)
{
	i32x4 retval;
	SubInt(retval, 0) = SubInt(vSrcA, 0) << SubInt(vSrcB, 0);
	SubInt(retval, 1) = SubInt(vSrcA, 1) << SubInt(vSrcB, 1);
	SubInt(retval, 2) = SubInt(vSrcA, 2) << SubInt(vSrcB, 2);
	SubInt(retval, 3) = SubInt(vSrcA, 3) << SubInt(vSrcB, 3);


	return retval;
}
#endif

#elif ( defined( _X360 ) )

//---------------------------------------------------------------------
// X360 implementation
//---------------------------------------------------------------------

// Mutable reference to float lane idx of a, accessed through fltx4_union.
FORCEINLINE float & FloatSIMD( fltx4 & a, int idx )
{
	fltx4_union & a_union = (fltx4_union &)a;
	return a_union.m128_f32[idx];
}

// Mutable reference to lane idx of a viewed as an unsigned 32-bit integer.
FORCEINLINE unsigned int & UIntSIMD( fltx4 & a, int idx )
{
	fltx4_union & a_union = (fltx4_union &)a;
	return a_union.m128_u32[idx];
}

FORCEINLINE fltx4 AddSIMD( const fltx4 & a, const fltx4 & b )				// a+b
{
	return __vaddfp( a, b );
}

FORCEINLINE fltx4 SubSIMD( const fltx4 & a, const fltx4 & b )				// a-b
{
	return __vsubfp( a, b );
}

FORCEINLINE fltx4 MulSIMD( const fltx4 & a, const fltx4 & b )				// a*b
{
	return __vmulfp( a, b );
}

FORCEINLINE fltx4 MaddSIMD( const fltx4 & a, const fltx4 & b, const fltx4 & c )				// a*b + c
{
	return __vmaddfp( a, b, c );
}

FORCEINLINE fltx4 MsubSIMD( const fltx4 & a, const fltx4 & b, const fltx4 & c )				// c - a*b
{
	return __vnmsubfp( a, b, c );
};

// 3-component dot product, computed with __vmsum3fp.
FORCEINLINE fltx4 Dot3SIMD( const fltx4 &a, const fltx4 &b )
{
	return __vmsum3fp( a, b );
}

// 4-component dot product, computed with __vmsum4fp.
FORCEINLINE fltx4 Dot4SIMD( const fltx4 &a, const fltx4 &b )
{
	return __vmsum4fp( a, b );
}

// Per-lane sine (XMVectorSin).
FORCEINLINE fltx4 SinSIMD( const fltx4 &radians )
{
	return XMVectorSin( radians );
}

// Sine and cosine written to the two output references.
// On this platform the body is identical to SinCosSIMD below.
FORCEINLINE void SinCos3SIMD( fltx4 &sine, fltx4 &cosine, const fltx4 &radians )
{
	XMVectorSinCos( &sine, &cosine, radians );
}

// Sine and cosine written to the two output references.
FORCEINLINE void SinCosSIMD( fltx4 &sine, fltx4 &cosine, const fltx4 &radians )
{
	XMVectorSinCos( &sine, &cosine, radians );
}

// Cosine; note the result is returned through the 'cosine' out-parameter,
// unlike SinSIMD which returns by value.
FORCEINLINE void CosSIMD( fltx4 &cosine, const fltx4 &radians )
{
	cosine = XMVectorCos( radians );
}

// Arc sine (XMVectorASin).
FORCEINLINE fltx4 ArcSinSIMD( const fltx4 &sine )
{
	return XMVectorASin( sine );
}

// Arc cosine (XMVectorACos).
FORCEINLINE fltx4 ArcCosSIMD( const fltx4 &cs )
{
	return XMVectorACos( cs );
}

// tan^1(a/b) .. ie, pass sin in as a and cos in as b
FORCEINLINE fltx4 ArcTan2SIMD( const fltx4 &a, const fltx4 &b )
{
	return XMVectorATan2( a, b );
}

// DivSIMD defined further down, since it uses ReciprocalSIMD

FORCEINLINE fltx4 MaxSIMD( const fltx4 & a, const fltx4 & b )				// max(a,b)
{
	return __vmaxfp( a, b );
}

FORCEINLINE fltx4 MinSIMD( const fltx4 & a, const fltx4 & b )				// min(a,b)
{
	return __vminfp( a, b );
}

FORCEINLINE fltx4 AndSIMD( const fltx4 & a, const fltx4 & b )				// a & b
{
	return __vand( a, b );
}

FORCEINLINE fltx4 AndNotSIMD( const fltx4 & a, const fltx4 & b )			// ~a & b
{
	// NOTE: a and b are swapped in the call: SSE complements the first argument, VMX the second
	return __vandc( b, a );
}

FORCEINLINE fltx4 XorSIMD( const fltx4 & a, const fltx4 & b )				// a ^ b
{
	return __vxor( a, b );
}

FORCEINLINE fltx4 OrSIMD( const fltx4 & a, const fltx4 & b )				// a | b
{
	return __vor( a, b );
}

FORCEINLINE fltx4 NegSIMD(const fltx4 &a) // negate: -a
{
	return XMVectorNegate(a);
}

FORCEINLINE bool IsAllZeros( const fltx4 & a )								// all floats of a zero?
{
	unsigned int equalFlags = 0;
	// Float compare of a against Four_Zeros; the flags are written to equalFlags.
	__vcmpeqfpR( a, Four_Zeros, &equalFlags );
	return XMComparisonAllTrue( equalFlags );
}

FORCEINLINE bool IsAnyZeros( const fltx4 & a )								// any floats are zero?
{
	unsigned int conditionregister;
	XMVectorEqualR(&conditionregister, a, XMVectorZero());
	return XMComparisonAnyTrue(conditionregister);
}

FORCEINLINE bool IsAnyXYZZero( const fltx4 &a )								// are any of x,y,z zero?
{
	// copy a's x component into w, in case w was zero.
	fltx4 temp = __vrlimi(a, a, 1, 1);
	unsigned int conditionregister;
	XMVectorEqualR(&conditionregister, temp, XMVectorZero());
	return XMComparisonAnyTrue(conditionregister);
}

// for branching when a.xyzw > b.xyzw
FORCEINLINE bool IsAllGreaterThan( const fltx4 &a, const fltx4 &b )
{
	unsigned int cr;
	XMVectorGreaterR(&cr,a,b);
	return XMComparisonAllTrue(cr);
}

// for branching when a.xyzw >= b.xyzw
FORCEINLINE bool IsAllGreaterThanOrEq( const fltx4 &a, const fltx4 &b )
{
	unsigned int cr;
	XMVectorGreaterOrEqualR(&cr,a,b);
	return XMComparisonAllTrue(cr);
}

// For branching if all a.xyzw == b.xyzw
FORCEINLINE bool IsAllEqual( const fltx4 & a, const fltx4 & b )
{
	unsigned int cr;
	XMVectorEqualR(&cr,a,b);
	return XMComparisonAllTrue(cr);
}


FORCEINLINE int TestSignSIMD( const fltx4 & a )								// mask of which floats have the high bit set
{
	// NOTE: this maps to SSE way better than it does to VMX (most code uses IsAnyNegative(), though)
	int nRet = 0;

	// Each lane's top bit (0x80000000) is shifted down to result bit 0..3.
	const fltx4_union & a_union = (const fltx4_union &)a;
	nRet |= ( a_union.m128_u32[0] & 0x80000000 ) >> 31; // sign(x) -> bit 0
	nRet |= ( a_union.m128_u32[1] & 0x80000000 ) >> 30; // sign(y) -> bit 1
	nRet |= ( a_union.m128_u32[2] & 0x80000000 ) >> 29; // sign(z) -> bit 2
	nRet |= ( a_union.m128_u32[3] & 0x80000000 ) >> 28; // sign(w) -> bit 3

	return nRet;
}

// Squelch the w component of a vector to +0.0.
// Most efficient when you say a = SetWToZeroSIMD(a) (avoids a copy)
FORCEINLINE fltx4 SetWToZeroSIMD( const fltx4 & a )
{
	// mask 1 selects the w lane; it is replaced from __vzero()
	return __vrlimi( a, __vzero(), 1, 0 );
}

FORCEINLINE bool IsAnyNegative( const fltx4 & a )							// (a.x < 0) || (a.y < 0) || (a.z < 0) || (a.w < 0)
{
	// NOTE: this tests the top bits of each vector element using integer math
	//       (so it ignores NaNs - it will return true for "-NaN")
	unsigned int equalFlags = 0;
	fltx4 signMask  = __vspltisw( -1 );						// 0xFFFFFFFF 0xFFFFFFFF 0xFFFFFFFF 0xFFFFFFFF (low order 5 bits of each element = 31)
	signMask = __vslw( signMask, signMask );				// 0x80000000 0x80000000 0x80000000 0x80000000
	// Integer-compare the isolated sign bits against zero; if not every lane
	// compares equal, at least one sign bit was set.
	__vcmpequwR( Four_Zeros, __vand( signMask, a ), &equalFlags );
	return !XMComparisonAllTrue( equalFlags );
}

FORCEINLINE fltx4 CmpEqSIMD( const fltx4 & a, const fltx4 & b )				// (a==b) ? ~0:0
{
	return __vcmpeqfp( a, b );
}


FORCEINLINE fltx4 CmpGtSIMD( const fltx4 & a, const fltx4 & b )				// (a>b) ? ~0:0
{
	return __vcmpgtfp( a, b );
}

FORCEINLINE fltx4 CmpGeSIMD( const fltx4 & a, const fltx4 & b )				// (a>=b) ? ~0:0
{
	return __vcmpgefp( a, b );
}

FORCEINLINE fltx4 CmpLtSIMD( const fltx4 & a, const fltx4 & b )				// (a<b) ? ~0:0
{
	// implemented as (b > a) by swapping the operands
	return __vcmpgtfp( b, a );
}

FORCEINLINE fltx4 CmpLeSIMD( const fltx4 & a, const fltx4 & b )				// (a<=b) ? ~0:0
{
	// implemented as (b >= a) by swapping the operands
	return __vcmpgefp( b, a );
}

FORCEINLINE fltx4 CmpInBoundsSIMD( const fltx4 & a, const fltx4 & b )		// (a <= b && a >= -b) ? ~0 : 0
{
	return XMVectorInBounds( a, b );
}

// returned[i] = ReplacementMask[i] == 0 ? OldValue : NewValue
FORCEINLINE fltx4 MaskedAssign( const fltx4 & ReplacementMask, const fltx4 & NewValue, const fltx4 & OldValue )
{
	return __vsel( OldValue, NewValue, ReplacementMask );
}

// AKA "Broadcast", "Splat"
FORCEINLINE fltx4 ReplicateX4( float flValue )					//  a,a,a,a
{
	// NOTE: if flValue comes from a register, this causes a Load-Hit-Store stall (don't mix fpu/vpu math!)
	float * pValue = &flValue;
	Assert( pValue );
	Assert( ((unsigned int)pValue & 3) == 0);	// the address must be 4-byte aligned
	return __vspltw( __lvlx( pValue, 0 ), 0 );
}

// Same as above, but reads the value to replicate from memory at pValue.
FORCEINLINE fltx4 ReplicateX4( const float *pValue )					//  a,a,a,a
{
	Assert( pValue );
	return __vspltw( __lvlx( pValue, 0 ), 0 );
}

/// replicate a single 32 bit integer value to all 4 components of an m128
FORCEINLINE fltx4 ReplicateIX4( int nValue )
{
	// NOTE: if nValue comes from a register, this causes a Load-Hit-Store stall (should not mix ints with fltx4s!)
	int * pValue = &nValue;
	Assert( pValue );
	Assert( ((unsigned int)pValue & 3) == 0);	// the address must be 4-byte aligned
	return __vspltw( __lvlx( pValue, 0 ), 0 );
}

// Round towards positive infinity
FORCEINLINE fltx4 CeilSIMD( const fltx4 &a )
{
	return __vrfip(a);
}

// Round towards nearest integer
FORCEINLINE fltx4 RoundSIMD( const fltx4 &a )
{
	return __vrfin(a);
}

// Round towards negative infinity
FORCEINLINE fltx4 FloorSIMD( const fltx4 &a )
{
	return __vrfim(a);
}

FORCEINLINE fltx4 SqrtEstSIMD( const fltx4 & a )				// sqrt(a), more or less
{
	// This is emulated from rsqrt
	return XMVectorSqrtEst( a );
}

FORCEINLINE fltx4 SqrtSIMD( const fltx4 & a )					// sqrt(a)
{
	// This is emulated from rsqrt
	return XMVectorSqrt( a );
}

FORCEINLINE fltx4 ReciprocalSqrtEstSIMD( const fltx4 & a )		// 1/sqrt(a), more or less
{
	return __vrsqrtefp( a );
}

// 1/sqrt(a) estimate, with zero inputs first replaced by Four_Epsilons.
FORCEINLINE fltx4 ReciprocalSqrtEstSaturateSIMD( const fltx4 & a )
{
	// Convert zeros to epsilons
	fltx4 zero_mask = CmpEqSIMD( a, Four_Zeros );
	fltx4 a_safe = OrSIMD( a, AndSIMD( Four_Epsilons, zero_mask ) );
	return ReciprocalSqrtEstSIMD( a_safe );
}

FORCEINLINE fltx4 ReciprocalSqrtSIMD( const fltx4 & a )			// 1/sqrt(a)
{
	// This uses Newton-Raphson to improve the HW result
	return XMVectorReciprocalSqrt( a );
}

FORCEINLINE fltx4 ReciprocalEstSIMD( const fltx4 & a )			// 1/a, more or less
{
	return __vrefp( a );
}

/// 1/x for all 4 values. uses reciprocal approximation instruction plus newton iteration.
/// No error checking!
FORCEINLINE fltx4 ReciprocalSIMD( const fltx4 & a )				// 1/a
{
	// This uses Newton-Raphson to improve the HW result
	return XMVectorReciprocal( a );
}

// FIXME: on 360, this is very slow, since it uses ReciprocalSIMD (do we need DivEstSIMD?)
FORCEINLINE fltx4 DivSIMD( const fltx4 & a, const fltx4 & b )	// a/b
{
	return MulSIMD( ReciprocalSIMD( b ), a );
}

/// 1/x for all 4 values.
/// 1/0 will result in a big but NOT infinite result
FORCEINLINE fltx4 ReciprocalEstSaturateSIMD( const fltx4 & a )
{
	// Convert zeros to epsilons
	fltx4 zero_mask = CmpEqSIMD( a, Four_Zeros );
	fltx4 a_safe = OrSIMD( a, AndSIMD( Four_Epsilons, zero_mask ) );
	return ReciprocalEstSIMD( a_safe );
}

// Full-precision 1/a (ReciprocalSIMD), with zero inputs first replaced by Four_Epsilons.
FORCEINLINE fltx4 ReciprocalSaturateSIMD( const fltx4 & a )
{
	// Convert zeros to epsilons
	fltx4 zero_mask = CmpEqSIMD( a, Four_Zeros );
	fltx4 a_safe = OrSIMD( a, AndSIMD( Four_Epsilons, zero_mask ) );
	return ReciprocalSIMD( a_safe );

	// FIXME: This could be faster (BUT: it doesn't preserve the sign of -0.0, whereas the above does)
	// fltx4 zeroMask = CmpEqSIMD( Four_Zeros, a );
	// fltx4 a_safe = XMVectorSelect( a, Four_Epsilons, zeroMask );
	// return ReciprocalSIMD( a_safe );
}

// CHRISG: is it worth doing integer bitfiddling for this?
// 2^x for all values (the antilog)
FORCEINLINE fltx4 ExpSIMD( const fltx4 &toPower )
{
	return XMVectorExp(toPower);
}

// Clamps the components of a vector to a specified minimum and maximum range.
FORCEINLINE fltx4 ClampVectorSIMD( FLTX4 in, FLTX4 min, FLTX4 max)
{
	return XMVectorClamp(in, min, max);
}

// Load four floats from memory that need not be 16-byte aligned.
FORCEINLINE fltx4 LoadUnalignedSIMD( const void *pSIMD )
{
	return XMLoadVector4( pSIMD );
}

// load a 3-vector (as opposed to LoadUnalignedSIMD, which loads a 4-vec).
FORCEINLINE fltx4 LoadUnaligned3SIMD( const void *pSIMD )
{
	return XMLoadVector3( pSIMD );
}

// Load a fltx4 by dereferencing pSIMD directly as a fltx4.
FORCEINLINE fltx4 LoadAlignedSIMD( const void *pSIMD )
{
	return *( reinterpret_cast< const fltx4 *> ( pSIMD ) );
}

// for the transitional class -- load a 3-by VectorAligned and squash its w component
FORCEINLINE fltx4 LoadAlignedSIMD( const VectorAligned & pSIMD )
{
	fltx4 out = XMLoadVector3A(pSIMD.Base());
	// squelch w
	return __vrlimi( out, __vzero(), 1, 0 );
}

// for the transitional class -- load a 3-by VectorAligned and squash its w component
// (pointer overload; the pointer itself is handed to XMLoadVector3A)
FORCEINLINE fltx4 LoadAlignedSIMD( const VectorAligned * RESTRICT pSIMD )
{
	fltx4 out = XMLoadVector3A(pSIMD);
	// squelch w
	return __vrlimi( out, __vzero(), 1, 0 );
}

// Store all four lanes of a by writing through pSIMD as a fltx4.
FORCEINLINE void StoreAlignedSIMD( float *pSIMD, const fltx4 & a )
{
	*( reinterpret_cast< fltx4 *> ( pSIMD ) ) = a;
}

// Store four floats (XMStoreVector4).
FORCEINLINE void StoreUnalignedSIMD( float *pSIMD, const fltx4 & a )
{
	XMStoreVector4( pSIMD, a );
}

// Store three floats (XMStoreVector3).
FORCEINLINE void StoreUnaligned3SIMD( float *pSIMD, const fltx4 & a )
{
	XMStoreVector3( pSIMD, a );
}


// strongly typed -- for typechecking as we transition to SIMD
FORCEINLINE void StoreAligned3SIMD( VectorAligned * RESTRICT pSIMD, const fltx4 & a )
{
	XMStoreVector3A(pSIMD->Base(),a);
}


// Fixed-point conversion and save as SIGNED INTS.
// pDest->x = Int (vSrc.x)
// note: some architectures have means of doing
// fixed point conversion when the fix depth is
// specified as an immediate.. but there is no way
// to guarantee an immediate as a parameter to function
// like this.
FORCEINLINE void ConvertStoreAsIntsSIMD(intx4 * RESTRICT pDest, const fltx4 &vSrc)
{
	// convert with a fixed-point shift of 0, then store to pDest->Base()
	fltx4 asInt = __vctsxs( vSrc, 0 );
	XMStoreVector4A(pDest->Base(), asInt);
}

// Transpose, in place, the 4x4 matrix whose rows are x, y, z, w.
FORCEINLINE void TransposeSIMD( fltx4 & x, fltx4 & y, fltx4 & z, fltx4 & w )
{
	XMMATRIX xyzwMatrix = _XMMATRIX( x, y, z, w );
	xyzwMatrix = XMMatrixTranspose( xyzwMatrix );
	x = xyzwMatrix.r[0];
	y = xyzwMatrix.r[1];
	z = xyzwMatrix.r[2];
	w = xyzwMatrix.r[3];
}

// Return zero in the fastest way -- faster even than loading.
FORCEINLINE fltx4 LoadZeroSIMD( void )
{
	return XMVectorZero();
}

// Return one in the fastest way -- faster even than loading.
FORCEINLINE fltx4 LoadOneSIMD( void )
{
	return XMVectorSplatOne();
}

// Replicate a's x component into every lane.
FORCEINLINE fltx4 SplatXSIMD( fltx4 a )
{
	return XMVectorSplatX( a );
}

// Replicate a's y component into every lane.
FORCEINLINE fltx4 SplatYSIMD( fltx4 a )
{
	return XMVectorSplatY( a );
}

// Replicate a's z component into every lane.
FORCEINLINE fltx4 SplatZSIMD( fltx4 a )
{
	return XMVectorSplatZ( a );
}

// Replicate a's w component into every lane.
FORCEINLINE fltx4 SplatWSIMD( fltx4 a )
{
	return XMVectorSplatW( a );
}

// SetX/Y/Z/WSIMD: return a with one component taken from the second argument.
// The __vrlimi mask immediates used are 8, 4, 2, 1 for x, y, z, w respectively.
FORCEINLINE fltx4 SetXSIMD( const fltx4& a, const fltx4& x )
{
	fltx4 result = __vrlimi(a, x, 8, 0);
	return result;
}

FORCEINLINE fltx4 SetYSIMD( const fltx4& a, const fltx4& y )
{
	fltx4 result = __vrlimi(a, y, 4, 0);
	return result;
}

FORCEINLINE fltx4 SetZSIMD( const fltx4& a, const fltx4& z )
{
	fltx4 result = __vrlimi(a, z, 2, 0);
	return result;
}

FORCEINLINE fltx4 SetWSIMD( const fltx4& a, const fltx4& w )
{
	fltx4 result = __vrlimi(a, w, 1, 0);
	return result;
}

// Return a with component nComponent (0..3 indexes the mask table) replaced by flValue.
// NOTE(review): the mask passed to __vrlimi here is a runtime table lookup, while every
// other call site passes a literal -- confirm the intrinsic accepts a non-immediate.
FORCEINLINE fltx4 SetComponentSIMD( const fltx4& a, int nComponent, float flValue )
{
	static int s_nVrlimiMask[4] = { 8, 4, 2, 1 };
	fltx4 val = ReplicateX4( flValue );
	fltx4 result = __vrlimi(a, val, s_nVrlimiMask[nComponent], 0);
	return result;
}

// Rotate a's components left by one position (all four mask bits set, rotate amount 1).
FORCEINLINE fltx4 RotateLeft( const fltx4 & a )
{
	fltx4 compareOne = a;
	return __vrlimi( compareOne, a, 8 | 4 | 2 | 1, 1 );
}

// Rotate a's components left by two positions (all four mask bits set, rotate amount 2).
FORCEINLINE fltx4 RotateLeft2( const fltx4 & a )
{
	fltx4 compareOne = a;
	return __vrlimi( compareOne, a, 8 | 4 | 2 | 1, 2 );
}



// find the lowest component of a.x, a.y, a.z,
// and replicate it to the whole return value.
// ignores a.w.
// Though this is only five instructions long,
// they are all dependent, making this stall city.
// Forcing this inline should hopefully help with scheduling.
FORCEINLINE fltx4 FindLowestSIMD3( const fltx4 & a )
{
	// a is [x,y,z,G] (where G is garbage)
	// rotate left by one
	fltx4 compareOne = a ;
	compareOne = __vrlimi( compareOne, a, 8 | 4 , 1 );
	// compareOne is [y,z,G,G]
	fltx4 retval = MinSIMD( a, compareOne );
	// retVal is [min(x,y), min(y,z), G, G]
	compareOne = __vrlimi( compareOne, a, 8 , 2);
	// compareOne is [z, G, G, G]
	retval = MinSIMD( retval, compareOne );
	// retVal = [ min(min(x,y),z), G, G, G ]

	// splat the x component out to the whole vector and return
	return SplatXSIMD( retval );
}

// find the highest component of a.x, a.y, a.z,
// and replicate it to the whole return value.
// ignores a.w.
// Though this is only five instructions long,
// they are all dependent, making this stall city.
// Forcing this inline should hopefully help with scheduling.
FORCEINLINE fltx4 FindHighestSIMD3( const fltx4 & a )
{
	// a is [x,y,z,G] (where G is garbage)
	// rotate left by one
	fltx4 compareOne = a ;
	compareOne = __vrlimi( compareOne, a, 8 | 4 , 1 );
	// compareOne is [y,z,G,G]
	fltx4 retval = MaxSIMD( a, compareOne );
	// retVal is [max(x,y), max(y,z), G, G]
	compareOne = __vrlimi( compareOne, a, 8 , 2);
	// compareOne is [z, G, G, G]
	retval = MaxSIMD( retval, compareOne );
	// retVal = [ max(max(x,y),z), G, G, G ]

	// splat the x component out to the whole vector and return
	return SplatXSIMD( retval );
}


// Transform many (horizontal) points in-place by a 3x4 matrix,
// here already loaded onto three fltx4 registers.
// The points must be stored as 16-byte aligned. They are points
// and not vectors because we assume the w-component to be 1.
// To spare yourself the annoyance of loading the matrix yourself,
// use one of the overloads below.
void TransformManyPointsBy(VectorAligned * RESTRICT pVectors, unsigned int numVectors, FLTX4 mRow1, FLTX4 mRow2, FLTX4 mRow3);

// Transform many (horizontal) points in-place by a 3x4 matrix.
// The points must be stored as 16-byte aligned. They are points
// and not vectors because we assume the w-component to be 1.
// In this function, the matrix need not be aligned.
FORCEINLINE void TransformManyPointsBy(VectorAligned * RESTRICT pVectors, unsigned int numVectors, const matrix3x4_t &pMatrix)
{
	// each matrix row is loaded with an unaligned load, then forwarded to the register overload
	return TransformManyPointsBy(pVectors, numVectors,
		LoadUnalignedSIMD( pMatrix[0] ), LoadUnalignedSIMD( pMatrix[1] ), LoadUnalignedSIMD( pMatrix[2] ) );
}

// Transform many (horizontal) points in-place by a 3x4 matrix.
// The points must be stored as 16-byte aligned. They are points
// and not vectors because we assume the w-component to be 1.
// In this function, the matrix must itself be aligned on a 16-byte
// boundary.
FORCEINLINE void TransformManyPointsByA(VectorAligned * RESTRICT pVectors, unsigned int numVectors, const matrix3x4_t &pMatrix)
{
	// each matrix row is loaded with an aligned load, then forwarded to the register overload
	return TransformManyPointsBy(pVectors, numVectors,
		LoadAlignedSIMD( pMatrix[0] ), LoadAlignedSIMD( pMatrix[1] ), LoadAlignedSIMD( pMatrix[2] ) );
}

// ------------------------------------
// INTEGER SIMD OPERATIONS.
// ------------------------------------

// Load 4 aligned words into a SIMD register
FORCEINLINE i32x4 LoadAlignedIntSIMD( const void * RESTRICT pSIMD)
{
	return XMLoadVector4A(pSIMD);
}

// Load 4 unaligned words into a SIMD register
FORCEINLINE i32x4 LoadUnalignedIntSIMD(const void * RESTRICT pSIMD)
{
	return XMLoadVector4( pSIMD );
}

// save into four words, 16-byte aligned
FORCEINLINE void StoreAlignedIntSIMD( int32 *pSIMD, const fltx4 & a )
{
	*( reinterpret_cast< i32x4 *> ( pSIMD ) ) = a;
}

// Same as above, but the destination is an intx4; writes through pSIMD.Base().
FORCEINLINE void StoreAlignedIntSIMD( intx4 &pSIMD, const fltx4 & a )
{
	*( reinterpret_cast< i32x4 *> ( pSIMD.Base() ) ) = a;
}

// save into four words via XMStoreVector4 (the unaligned counterpart of the stores above)
FORCEINLINE void StoreUnalignedIntSIMD( int32 *pSIMD, const fltx4 & a )
{
	XMStoreVector4(pSIMD, a);
}


// Take a fltx4 containing fixed-point uints and
// return them as single precision floats. No
// fixed point conversion is done.
FORCEINLINE fltx4 UnsignedIntConvertToFltSIMD( const i32x4 &vSrcA )
{
	return __vcfux( vSrcA, 0 );
}


// Take a fltx4 containing fixed-point sints and
// return them as single precision floats. No
// fixed point conversion is done.
FORCEINLINE fltx4 SignedIntConvertToFltSIMD( const i32x4 &vSrcA )
{
	return __vcfsx( vSrcA, 0 );
}

// Take a fltx4 containing fixed-point uints and
// return them as single precision floats. Each uint
// will be divided by 2^immed after conversion
// (eg, this is fixed point math).
/* as if:
   FORCEINLINE fltx4 UnsignedFixedIntConvertToFltSIMD( const i32x4 &vSrcA, unsigned int uImmed )
   {
   return __vcfux( vSrcA, uImmed );
   }
*/
#define UnsignedFixedIntConvertToFltSIMD(vSrcA, uImmed) (__vcfux( (vSrcA), (uImmed) ))

// Take a fltx4 containing fixed-point sints and
// return them as single precision floats. Each int
// will be divided by 2^immed (eg, this is fixed point
// math).
+/* as if: + FORCEINLINE fltx4 SignedIntConvertToFltSIMD( const i32x4 &vSrcA, unsigned int uImmed ) + { + return __vcfsx( vSrcA, uImmed ); + } +*/ +#define SignedFixedIntConvertToFltSIMD(vSrcA, uImmed) (__vcfsx( (vSrcA), (uImmed) )) + +// set all components of a vector to a signed immediate int number. +/* as if: + FORCEINLINE fltx4 IntSetImmediateSIMD(int toImmediate) + { + return __vspltisw( toImmediate ); + } +*/ +#define IntSetImmediateSIMD(x) (__vspltisw(x)) + +/* + works on fltx4's as if they are four uints. + the first parameter contains the words to be shifted, + the second contains the amount to shift by AS INTS + + for i = 0 to 3 + shift = vSrcB_i*32:(i*32)+4 + vReturned_i*32:(i*32)+31 = vSrcA_i*32:(i*32)+31 << shift +*/ +FORCEINLINE fltx4 IntShiftLeftWordSIMD(fltx4 vSrcA, fltx4 vSrcB) +{ + return __vslw(vSrcA, vSrcB); +} + +FORCEINLINE float SubFloat( const fltx4 & a, int idx ) +{ + // NOTE: if the output goes into a register, this causes a Load-Hit-Store stall (don't mix fpu/vpu math!) 
+ const fltx4_union & a_union = (const fltx4_union &)a; + return a_union.m128_f32[ idx ]; +} + +FORCEINLINE float & SubFloat( fltx4 & a, int idx ) +{ + fltx4_union & a_union = (fltx4_union &)a; + return a_union.m128_f32[idx]; +} + +FORCEINLINE uint32 SubFloatConvertToInt( const fltx4 & a, int idx ) +{ + fltx4 t = __vctuxs( a, 0 ); + const fltx4_union & a_union = (const fltx4_union &)t; + return a_union.m128_u32[idx]; +} + + +FORCEINLINE uint32 SubInt( const fltx4 & a, int idx ) +{ + const fltx4_union & a_union = (const fltx4_union &)a; + return a_union.m128_u32[idx]; +} + +FORCEINLINE uint32 & SubInt( fltx4 & a, int idx ) +{ + fltx4_union & a_union = (fltx4_union &)a; + return a_union.m128_u32[idx]; +} + +#else + +//--------------------------------------------------------------------- +// Intel/SSE implementation +//--------------------------------------------------------------------- + +FORCEINLINE void StoreAlignedSIMD( float * RESTRICT pSIMD, const fltx4 & a ) +{ + _mm_store_ps( pSIMD, a ); +} + +FORCEINLINE void StoreUnalignedSIMD( float * RESTRICT pSIMD, const fltx4 & a ) +{ + _mm_storeu_ps( pSIMD, a ); +} + + +FORCEINLINE fltx4 RotateLeft( const fltx4 & a ); +FORCEINLINE fltx4 RotateLeft2( const fltx4 & a ); + +FORCEINLINE void StoreUnaligned3SIMD( float *pSIMD, const fltx4 & a ) +{ + _mm_store_ss(pSIMD, a); + _mm_store_ss(pSIMD+1, RotateLeft(a)); + _mm_store_ss(pSIMD+2, RotateLeft2(a)); +} + +// strongly typed -- syntactic castor oil used for typechecking as we transition to SIMD +FORCEINLINE void StoreAligned3SIMD( VectorAligned * RESTRICT pSIMD, const fltx4 & a ) +{ + StoreAlignedSIMD( pSIMD->Base(),a ); +} + +FORCEINLINE fltx4 LoadAlignedSIMD( const void *pSIMD ) +{ + return _mm_load_ps( reinterpret_cast< const float *> ( pSIMD ) ); +} + +FORCEINLINE fltx4 AndSIMD( const fltx4 & a, const fltx4 & b ) // a & b +{ + return _mm_and_ps( a, b ); +} + +FORCEINLINE fltx4 AndNotSIMD( const fltx4 & a, const fltx4 & b ) // ~a & b +{ + return _mm_andnot_ps( a, b ); 
+} + +FORCEINLINE fltx4 XorSIMD( const fltx4 & a, const fltx4 & b ) // a ^ b +{ + return _mm_xor_ps( a, b ); +} + +FORCEINLINE fltx4 OrSIMD( const fltx4 & a, const fltx4 & b ) // a | b +{ + return _mm_or_ps( a, b ); +} + +// Squelch the w component of a vector to +0.0. +// Most efficient when you say a = SetWToZeroSIMD(a) (avoids a copy) +FORCEINLINE fltx4 SetWToZeroSIMD( const fltx4 & a ) +{ + return AndSIMD( a, LoadAlignedSIMD( g_SIMD_clear_wmask ) ); +} + +// for the transitional class -- load a 3-by VectorAligned and squash its w component +FORCEINLINE fltx4 LoadAlignedSIMD( const VectorAligned & pSIMD ) +{ + return SetWToZeroSIMD( LoadAlignedSIMD(pSIMD.Base()) ); +} + +FORCEINLINE fltx4 LoadUnalignedSIMD( const void *pSIMD ) +{ + return _mm_loadu_ps( reinterpret_cast<const float *>( pSIMD ) ); +} + +FORCEINLINE fltx4 LoadUnaligned3SIMD( const void *pSIMD ) +{ + return _mm_loadu_ps( reinterpret_cast<const float *>( pSIMD ) ); +} + +/// replicate a single 32 bit integer value to all 4 components of an m128 +FORCEINLINE fltx4 ReplicateIX4( int i ) +{ + fltx4 value = _mm_set_ss( * ( ( float *) &i ) );; + return _mm_shuffle_ps( value, value, 0); +} + + +FORCEINLINE fltx4 ReplicateX4( float flValue ) +{ + __m128 value = _mm_set_ss( flValue ); + return _mm_shuffle_ps( value, value, 0 ); +} + + +FORCEINLINE float SubFloat( const fltx4 & a, int idx ) +{ + // NOTE: if the output goes into a register, this causes a Load-Hit-Store stall (don't mix fpu/vpu math!) 
+#ifndef POSIX + return a.m128_f32[ idx ]; +#else + return (reinterpret_cast<float const *>(&a))[idx]; +#endif +} + +FORCEINLINE float & SubFloat( fltx4 & a, int idx ) +{ +#ifndef POSIX + return a.m128_f32[ idx ]; +#else + return (reinterpret_cast<float *>(&a))[idx]; +#endif +} + +FORCEINLINE uint32 SubFloatConvertToInt( const fltx4 & a, int idx ) +{ + return (uint32)SubFloat(a,idx); +} + +FORCEINLINE uint32 SubInt( const fltx4 & a, int idx ) +{ +#ifndef POSIX + return a.m128_u32[idx]; +#else + return (reinterpret_cast<uint32 const *>(&a))[idx]; +#endif +} + +FORCEINLINE uint32 & SubInt( fltx4 & a, int idx ) +{ +#ifndef POSIX + return a.m128_u32[idx]; +#else + return (reinterpret_cast<uint32 *>(&a))[idx]; +#endif +} + +// Return one in the fastest way -- on the x360, faster even than loading. +FORCEINLINE fltx4 LoadZeroSIMD( void ) +{ + return Four_Zeros; +} + +// Return one in the fastest way -- on the x360, faster even than loading. +FORCEINLINE fltx4 LoadOneSIMD( void ) +{ + return Four_Ones; +} + +FORCEINLINE fltx4 MaskedAssign( const fltx4 & ReplacementMask, const fltx4 & NewValue, const fltx4 & OldValue ) +{ + return OrSIMD( + AndSIMD( ReplacementMask, NewValue ), + AndNotSIMD( ReplacementMask, OldValue ) ); +} + +// remember, the SSE numbers its words 3 2 1 0 +// The way we want to specify shuffles is backwards from the default +// MM_SHUFFLE_REV is in array index order (default is reversed) +#define MM_SHUFFLE_REV(a,b,c,d) _MM_SHUFFLE(d,c,b,a) + +FORCEINLINE fltx4 SplatXSIMD( fltx4 const & a ) +{ + return _mm_shuffle_ps( a, a, MM_SHUFFLE_REV( 0, 0, 0, 0 ) ); +} + +FORCEINLINE fltx4 SplatYSIMD( fltx4 const &a ) +{ + return _mm_shuffle_ps( a, a, MM_SHUFFLE_REV( 1, 1, 1, 1 ) ); +} + +FORCEINLINE fltx4 SplatZSIMD( fltx4 const &a ) +{ + return _mm_shuffle_ps( a, a, MM_SHUFFLE_REV( 2, 2, 2, 2 ) ); +} + +FORCEINLINE fltx4 SplatWSIMD( fltx4 const &a ) +{ + return _mm_shuffle_ps( a, a, _MM_SHUFFLE( 3, 3, 3, 3 ) ); +} + +FORCEINLINE fltx4 SetXSIMD( const fltx4& a, 
const fltx4& x ) +{ + fltx4 result = MaskedAssign( LoadAlignedSIMD( g_SIMD_ComponentMask[0] ), x, a ); + return result; +} + +FORCEINLINE fltx4 SetYSIMD( const fltx4& a, const fltx4& y ) +{ + fltx4 result = MaskedAssign( LoadAlignedSIMD( g_SIMD_ComponentMask[1] ), y, a ); + return result; +} + +FORCEINLINE fltx4 SetZSIMD( const fltx4& a, const fltx4& z ) +{ + fltx4 result = MaskedAssign( LoadAlignedSIMD( g_SIMD_ComponentMask[2] ), z, a ); + return result; +} + +FORCEINLINE fltx4 SetWSIMD( const fltx4& a, const fltx4& w ) +{ + fltx4 result = MaskedAssign( LoadAlignedSIMD( g_SIMD_ComponentMask[3] ), w, a ); + return result; +} + +FORCEINLINE fltx4 SetComponentSIMD( const fltx4& a, int nComponent, float flValue ) +{ + fltx4 val = ReplicateX4( flValue ); + fltx4 result = MaskedAssign( LoadAlignedSIMD( g_SIMD_ComponentMask[nComponent] ), val, a ); + return result; +} + +// a b c d -> b c d a +FORCEINLINE fltx4 RotateLeft( const fltx4 & a ) +{ + return _mm_shuffle_ps( a, a, MM_SHUFFLE_REV( 1, 2, 3, 0 ) ); +} + +// a b c d -> c d a b +FORCEINLINE fltx4 RotateLeft2( const fltx4 & a ) +{ + return _mm_shuffle_ps( a, a, MM_SHUFFLE_REV( 2, 3, 0, 1 ) ); +} + +// a b c d -> d a b c +FORCEINLINE fltx4 RotateRight( const fltx4 & a ) +{ + return _mm_shuffle_ps( a, a, _MM_SHUFFLE( 0, 3, 2, 1) ); +} + +// a b c d -> c d a b +FORCEINLINE fltx4 RotateRight2( const fltx4 & a ) +{ + return _mm_shuffle_ps( a, a, _MM_SHUFFLE( 1, 0, 3, 2 ) ); +} + + +FORCEINLINE fltx4 AddSIMD( const fltx4 & a, const fltx4 & b ) // a+b +{ + return _mm_add_ps( a, b ); +}; + +FORCEINLINE fltx4 SubSIMD( const fltx4 & a, const fltx4 & b ) // a-b +{ + return _mm_sub_ps( a, b ); +}; + +FORCEINLINE fltx4 MulSIMD( const fltx4 & a, const fltx4 & b ) // a*b +{ + return _mm_mul_ps( a, b ); +}; + +FORCEINLINE fltx4 DivSIMD( const fltx4 & a, const fltx4 & b ) // a/b +{ + return _mm_div_ps( a, b ); +}; + +FORCEINLINE fltx4 MaddSIMD( const fltx4 & a, const fltx4 & b, const fltx4 & c ) // a*b + c +{ + return AddSIMD( 
MulSIMD(a,b), c ); +} + +FORCEINLINE fltx4 MsubSIMD( const fltx4 & a, const fltx4 & b, const fltx4 & c ) // c - a*b +{ + return SubSIMD( c, MulSIMD(a,b) ); +}; + +FORCEINLINE fltx4 Dot3SIMD( const fltx4 &a, const fltx4 &b ) +{ + fltx4 m = MulSIMD( a, b ); + float flDot = SubFloat( m, 0 ) + SubFloat( m, 1 ) + SubFloat( m, 2 ); + return ReplicateX4( flDot ); +} + +FORCEINLINE fltx4 Dot4SIMD( const fltx4 &a, const fltx4 &b ) +{ + fltx4 m = MulSIMD( a, b ); + float flDot = SubFloat( m, 0 ) + SubFloat( m, 1 ) + SubFloat( m, 2 ) + SubFloat( m, 3 ); + return ReplicateX4( flDot ); +} + +//TODO: implement as four-way Taylor series (see xbox implementation) +FORCEINLINE fltx4 SinSIMD( const fltx4 &radians ) +{ + fltx4 result; + SubFloat( result, 0 ) = sin( SubFloat( radians, 0 ) ); + SubFloat( result, 1 ) = sin( SubFloat( radians, 1 ) ); + SubFloat( result, 2 ) = sin( SubFloat( radians, 2 ) ); + SubFloat( result, 3 ) = sin( SubFloat( radians, 3 ) ); + return result; +} + +FORCEINLINE void SinCos3SIMD( fltx4 &sine, fltx4 &cosine, const fltx4 &radians ) +{ + // FIXME: Make a fast SSE version + SinCos( SubFloat( radians, 0 ), &SubFloat( sine, 0 ), &SubFloat( cosine, 0 ) ); + SinCos( SubFloat( radians, 1 ), &SubFloat( sine, 1 ), &SubFloat( cosine, 1 ) ); + SinCos( SubFloat( radians, 2 ), &SubFloat( sine, 2 ), &SubFloat( cosine, 2 ) ); +} + +FORCEINLINE void SinCosSIMD( fltx4 &sine, fltx4 &cosine, const fltx4 &radians ) // a*b + c +{ + // FIXME: Make a fast SSE version + SinCos( SubFloat( radians, 0 ), &SubFloat( sine, 0 ), &SubFloat( cosine, 0 ) ); + SinCos( SubFloat( radians, 1 ), &SubFloat( sine, 1 ), &SubFloat( cosine, 1 ) ); + SinCos( SubFloat( radians, 2 ), &SubFloat( sine, 2 ), &SubFloat( cosine, 2 ) ); + SinCos( SubFloat( radians, 3 ), &SubFloat( sine, 3 ), &SubFloat( cosine, 3 ) ); +} + +//TODO: implement as four-way Taylor series (see xbox implementation) +FORCEINLINE fltx4 ArcSinSIMD( const fltx4 &sine ) +{ + // FIXME: Make a fast SSE version + fltx4 result; + 
SubFloat( result, 0 ) = asin( SubFloat( sine, 0 ) ); + SubFloat( result, 1 ) = asin( SubFloat( sine, 1 ) ); + SubFloat( result, 2 ) = asin( SubFloat( sine, 2 ) ); + SubFloat( result, 3 ) = asin( SubFloat( sine, 3 ) ); + return result; +} + +FORCEINLINE fltx4 ArcCosSIMD( const fltx4 &cs ) +{ + fltx4 result; + SubFloat( result, 0 ) = acos( SubFloat( cs, 0 ) ); + SubFloat( result, 1 ) = acos( SubFloat( cs, 1 ) ); + SubFloat( result, 2 ) = acos( SubFloat( cs, 2 ) ); + SubFloat( result, 3 ) = acos( SubFloat( cs, 3 ) ); + return result; +} + +// tan^1(a/b) .. ie, pass sin in as a and cos in as b +FORCEINLINE fltx4 ArcTan2SIMD( const fltx4 &a, const fltx4 &b ) +{ + fltx4 result; + SubFloat( result, 0 ) = atan2( SubFloat( a, 0 ), SubFloat( b, 0 ) ); + SubFloat( result, 1 ) = atan2( SubFloat( a, 1 ), SubFloat( b, 1 ) ); + SubFloat( result, 2 ) = atan2( SubFloat( a, 2 ), SubFloat( b, 2 ) ); + SubFloat( result, 3 ) = atan2( SubFloat( a, 3 ), SubFloat( b, 3 ) ); + return result; +} + +FORCEINLINE fltx4 NegSIMD(const fltx4 &a) // negate: -a +{ + return SubSIMD(LoadZeroSIMD(),a); +} + +FORCEINLINE int TestSignSIMD( const fltx4 & a ) // mask of which floats have the high bit set +{ + return _mm_movemask_ps( a ); +} + +FORCEINLINE bool IsAnyNegative( const fltx4 & a ) // (a.x < 0) || (a.y < 0) || (a.z < 0) || (a.w < 0) +{ + return (0 != TestSignSIMD( a )); +} + +FORCEINLINE fltx4 CmpEqSIMD( const fltx4 & a, const fltx4 & b ) // (a==b) ? ~0:0 +{ + return _mm_cmpeq_ps( a, b ); +} + +FORCEINLINE fltx4 CmpGtSIMD( const fltx4 & a, const fltx4 & b ) // (a>b) ? ~0:0 +{ + return _mm_cmpgt_ps( a, b ); +} + +FORCEINLINE fltx4 CmpGeSIMD( const fltx4 & a, const fltx4 & b ) // (a>=b) ? ~0:0 +{ + return _mm_cmpge_ps( a, b ); +} + +FORCEINLINE fltx4 CmpLtSIMD( const fltx4 & a, const fltx4 & b ) // (a<b) ? ~0:0 +{ + return _mm_cmplt_ps( a, b ); +} + +FORCEINLINE fltx4 CmpLeSIMD( const fltx4 & a, const fltx4 & b ) // (a<=b) ? 
~0:0 +{ + return _mm_cmple_ps( a, b ); +} + +// for branching when a.xyzw > b.xyzw +FORCEINLINE bool IsAllGreaterThan( const fltx4 &a, const fltx4 &b ) +{ + return TestSignSIMD( CmpLeSIMD( a, b ) ) == 0; +} + +// for branching when a.xyzw >= b.xyzw +FORCEINLINE bool IsAllGreaterThanOrEq( const fltx4 &a, const fltx4 &b ) +{ + return TestSignSIMD( CmpLtSIMD( a, b ) ) == 0; +} + +// For branching if all a.xyzw == b.xyzw +FORCEINLINE bool IsAllEqual( const fltx4 & a, const fltx4 & b ) +{ + return TestSignSIMD( CmpEqSIMD( a, b ) ) == 0xf; +} + +FORCEINLINE fltx4 CmpInBoundsSIMD( const fltx4 & a, const fltx4 & b ) // (a <= b && a >= -b) ? ~0 : 0 +{ + return AndSIMD( CmpLeSIMD(a,b), CmpGeSIMD(a, NegSIMD(b)) ); +} + +FORCEINLINE fltx4 MinSIMD( const fltx4 & a, const fltx4 & b ) // min(a,b) +{ + return _mm_min_ps( a, b ); +} + +FORCEINLINE fltx4 MaxSIMD( const fltx4 & a, const fltx4 & b ) // max(a,b) +{ + return _mm_max_ps( a, b ); +} + + + +// SSE lacks rounding operations. +// Really. +// You can emulate them by setting the rounding mode for the +// whole processor and then converting to int, and then back again. +// But every time you set the rounding mode, you clear out the +// entire pipeline. So, I can't do them per operation. You +// have to do it once, before the loop that would call these. +// Round towards positive infinity +FORCEINLINE fltx4 CeilSIMD( const fltx4 &a ) +{ + fltx4 retVal; + SubFloat( retVal, 0 ) = ceil( SubFloat( a, 0 ) ); + SubFloat( retVal, 1 ) = ceil( SubFloat( a, 1 ) ); + SubFloat( retVal, 2 ) = ceil( SubFloat( a, 2 ) ); + SubFloat( retVal, 3 ) = ceil( SubFloat( a, 3 ) ); + return retVal; + +} + +fltx4 fabs( const fltx4 & x ); +// Round towards negative infinity +// This is the implementation that was here before; it assumes +// you are in round-to-floor mode, which I guess is usually the +// case for us vis-a-vis SSE. It's totally unnecessary on +// VMX, which has a native floor op. 
+FORCEINLINE fltx4 FloorSIMD( const fltx4 &val ) +{ + fltx4 fl4Abs = fabs( val ); + fltx4 ival = SubSIMD( AddSIMD( fl4Abs, Four_2ToThe23s ), Four_2ToThe23s ); + ival = MaskedAssign( CmpGtSIMD( ival, fl4Abs ), SubSIMD( ival, Four_Ones ), ival ); + return XorSIMD( ival, XorSIMD( val, fl4Abs ) ); // restore sign bits +} + + + +inline bool IsAllZeros( const fltx4 & var ) +{ + return TestSignSIMD( CmpEqSIMD( var, Four_Zeros ) ) == 0xF; +} + +FORCEINLINE fltx4 SqrtEstSIMD( const fltx4 & a ) // sqrt(a), more or less +{ + return _mm_sqrt_ps( a ); +} + +FORCEINLINE fltx4 SqrtSIMD( const fltx4 & a ) // sqrt(a) +{ + return _mm_sqrt_ps( a ); +} + +FORCEINLINE fltx4 ReciprocalSqrtEstSIMD( const fltx4 & a ) // 1/sqrt(a), more or less +{ + return _mm_rsqrt_ps( a ); +} + +FORCEINLINE fltx4 ReciprocalSqrtEstSaturateSIMD( const fltx4 & a ) +{ + fltx4 zero_mask = CmpEqSIMD( a, Four_Zeros ); + fltx4 ret = OrSIMD( a, AndSIMD( Four_Epsilons, zero_mask ) ); + ret = ReciprocalSqrtEstSIMD( ret ); + return ret; +} + +/// uses newton iteration for higher precision results than ReciprocalSqrtEstSIMD +FORCEINLINE fltx4 ReciprocalSqrtSIMD( const fltx4 & a ) // 1/sqrt(a) +{ + fltx4 guess = ReciprocalSqrtEstSIMD( a ); + // newton iteration for 1/sqrt(a) : y(n+1) = 1/2 (y(n)*(3-a*y(n)^2)); + guess = MulSIMD( guess, SubSIMD( Four_Threes, MulSIMD( a, MulSIMD( guess, guess )))); + guess = MulSIMD( Four_PointFives, guess); + return guess; +} + +FORCEINLINE fltx4 ReciprocalEstSIMD( const fltx4 & a ) // 1/a, more or less +{ + return _mm_rcp_ps( a ); +} + +/// 1/x for all 4 values, more or less +/// 1/0 will result in a big but NOT infinite result +FORCEINLINE fltx4 ReciprocalEstSaturateSIMD( const fltx4 & a ) +{ + fltx4 zero_mask = CmpEqSIMD( a, Four_Zeros ); + fltx4 ret = OrSIMD( a, AndSIMD( Four_Epsilons, zero_mask ) ); + ret = ReciprocalEstSIMD( ret ); + return ret; +} + +/// 1/x for all 4 values. uses reciprocal approximation instruction plus newton iteration. +/// No error checking! 
+FORCEINLINE fltx4 ReciprocalSIMD( const fltx4 & a ) // 1/a +{ + fltx4 ret = ReciprocalEstSIMD( a ); + // newton iteration is: Y(n+1) = 2*Y(n)-a*Y(n)^2 + ret = SubSIMD( AddSIMD( ret, ret ), MulSIMD( a, MulSIMD( ret, ret ) ) ); + return ret; +} + +/// 1/x for all 4 values. +/// 1/0 will result in a big but NOT infinite result +FORCEINLINE fltx4 ReciprocalSaturateSIMD( const fltx4 & a ) +{ + fltx4 zero_mask = CmpEqSIMD( a, Four_Zeros ); + fltx4 ret = OrSIMD( a, AndSIMD( Four_Epsilons, zero_mask ) ); + ret = ReciprocalSIMD( ret ); + return ret; +} + +// CHRISG: is it worth doing integer bitfiddling for this? +// 2^x for all values (the antilog) +FORCEINLINE fltx4 ExpSIMD( const fltx4 &toPower ) +{ + fltx4 retval; + SubFloat( retval, 0 ) = powf( 2, SubFloat(toPower, 0) ); + SubFloat( retval, 1 ) = powf( 2, SubFloat(toPower, 1) ); + SubFloat( retval, 2 ) = powf( 2, SubFloat(toPower, 2) ); + SubFloat( retval, 3 ) = powf( 2, SubFloat(toPower, 3) ); + + return retval; +} + +// Clamps the components of a vector to a specified minimum and maximum range. +FORCEINLINE fltx4 ClampVectorSIMD( FLTX4 in, FLTX4 min, FLTX4 max) +{ + return MaxSIMD( min, MinSIMD( max, in ) ); +} + +FORCEINLINE void TransposeSIMD( fltx4 & x, fltx4 & y, fltx4 & z, fltx4 & w) +{ + _MM_TRANSPOSE4_PS( x, y, z, w ); +} + +FORCEINLINE fltx4 FindLowestSIMD3( const fltx4 &a ) +{ + // a is [x,y,z,G] (where G is garbage) + // rotate left by one + fltx4 compareOne = RotateLeft( a ); + // compareOne is [y,z,G,x] + fltx4 retval = MinSIMD( a, compareOne ); + // retVal is [min(x,y), ... ] + compareOne = RotateLeft2( a ); + // compareOne is [z, G, x, y] + retval = MinSIMD( retval, compareOne ); + // retVal = [ min(min(x,y),z)..] 
+ // splat the x component out to the whole vector and return + return SplatXSIMD( retval ); + +} + +FORCEINLINE fltx4 FindHighestSIMD3( const fltx4 &a ) +{ + // a is [x,y,z,G] (where G is garbage) + // rotate left by one + fltx4 compareOne = RotateLeft( a ); + // compareOne is [y,z,G,x] + fltx4 retval = MaxSIMD( a, compareOne ); + // retVal is [max(x,y), ... ] + compareOne = RotateLeft2( a ); + // compareOne is [z, G, x, y] + retval = MaxSIMD( retval, compareOne ); + // retVal = [ max(max(x,y),z)..] + // splat the x component out to the whole vector and return + return SplatXSIMD( retval ); + +} + +// ------------------------------------ +// INTEGER SIMD OPERATIONS. +// ------------------------------------ + + +#if 0 /* pc does not have these ops */ +// splat all components of a vector to a signed immediate int number. +FORCEINLINE fltx4 IntSetImmediateSIMD(int to) +{ + //CHRISG: SSE2 has this, but not SSE1. What to do? + fltx4 retval; + SubInt( retval, 0 ) = to; + SubInt( retval, 1 ) = to; + SubInt( retval, 2 ) = to; + SubInt( retval, 3 ) = to; + return retval; +} +#endif + +// Load 4 aligned words into a SIMD register +FORCEINLINE i32x4 LoadAlignedIntSIMD( const void * RESTRICT pSIMD) +{ + return _mm_load_ps( reinterpret_cast<const float *>(pSIMD) ); +} + +// Load 4 unaligned words into a SIMD register +FORCEINLINE i32x4 LoadUnalignedIntSIMD( const void * RESTRICT pSIMD) +{ + return _mm_loadu_ps( reinterpret_cast<const float *>(pSIMD) ); +} + +// save into four words, 16-byte aligned +FORCEINLINE void StoreAlignedIntSIMD( int32 * RESTRICT pSIMD, const fltx4 & a ) +{ + _mm_store_ps( reinterpret_cast<float *>(pSIMD), a ); +} + +FORCEINLINE void StoreAlignedIntSIMD( intx4 &pSIMD, const fltx4 & a ) +{ + _mm_store_ps( reinterpret_cast<float *>(pSIMD.Base()), a ); +} + +FORCEINLINE void StoreUnalignedIntSIMD( int32 * RESTRICT pSIMD, const fltx4 & a ) +{ + _mm_storeu_ps( reinterpret_cast<float *>(pSIMD), a ); +} + + +// CHRISG: the conversion functions all seem to 
operate on m64's only... +// how do we make them work here? + +// Take a fltx4 containing fixed-point uints and +// return them as single precision floats. No +// fixed point conversion is done. +FORCEINLINE fltx4 UnsignedIntConvertToFltSIMD( const u32x4 &vSrcA ) +{ + fltx4 retval; + SubFloat( retval, 0 ) = ( (float) SubInt( retval, 0 ) ); + SubFloat( retval, 1 ) = ( (float) SubInt( retval, 1 ) ); + SubFloat( retval, 2 ) = ( (float) SubInt( retval, 2 ) ); + SubFloat( retval, 3 ) = ( (float) SubInt( retval, 3 ) ); + return retval; +} + + +// Take a fltx4 containing fixed-point sints and +// return them as single precision floats. No +// fixed point conversion is done. +FORCEINLINE fltx4 SignedIntConvertToFltSIMD( const i32x4 &vSrcA ) +{ + fltx4 retval; + SubFloat( retval, 0 ) = ( (float) (reinterpret_cast<const int32 *>(&vSrcA)[0])); + SubFloat( retval, 1 ) = ( (float) (reinterpret_cast<const int32 *>(&vSrcA)[1])); + SubFloat( retval, 2 ) = ( (float) (reinterpret_cast<const int32 *>(&vSrcA)[2])); + SubFloat( retval, 3 ) = ( (float) (reinterpret_cast<const int32 *>(&vSrcA)[3])); + return retval; +} + +/* + works on fltx4's as if they are four uints. + the first parameter contains the words to be shifted, + the second contains the amount to shift by AS INTS + + for i = 0 to 3 + shift = vSrcB_i*32:(i*32)+4 + vReturned_i*32:(i*32)+31 = vSrcA_i*32:(i*32)+31 << shift +*/ +FORCEINLINE i32x4 IntShiftLeftWordSIMD(const i32x4 &vSrcA, const i32x4 &vSrcB) +{ + i32x4 retval; + SubInt(retval, 0) = SubInt(vSrcA, 0) << SubInt(vSrcB, 0); + SubInt(retval, 1) = SubInt(vSrcA, 1) << SubInt(vSrcB, 1); + SubInt(retval, 2) = SubInt(vSrcA, 2) << SubInt(vSrcB, 2); + SubInt(retval, 3) = SubInt(vSrcA, 3) << SubInt(vSrcB, 3); + + + return retval; +} + + +// Fixed-point conversion and save as SIGNED INTS. +// pDest->x = Int (vSrc.x) +// note: some architectures have means of doing +// fixed point conversion when the fix depth is +// specified as an immediate.. 
but there is no way +// to guarantee an immediate as a parameter to function +// like this. +FORCEINLINE void ConvertStoreAsIntsSIMD(intx4 * RESTRICT pDest, const fltx4 &vSrc) +{ + __m64 bottom = _mm_cvttps_pi32( vSrc ); + __m64 top = _mm_cvttps_pi32( _mm_movehl_ps(vSrc,vSrc) ); + + *reinterpret_cast<__m64 *>(&(*pDest)[0]) = bottom; + *reinterpret_cast<__m64 *>(&(*pDest)[2]) = top; + + _mm_empty(); +} + + + +#endif + + + +/// class FourVectors stores 4 independent vectors for use in SIMD processing. These vectors are +/// stored in the format x x x x y y y y z z z z so that they can be efficiently SIMD-accelerated. +class ALIGN16 FourVectors +{ +public: + fltx4 x, y, z; + + FORCEINLINE void DuplicateVector(Vector const &v) //< set all 4 vectors to the same vector value + { + x=ReplicateX4(v.x); + y=ReplicateX4(v.y); + z=ReplicateX4(v.z); + } + + FORCEINLINE fltx4 const & operator[](int idx) const + { + return *((&x)+idx); + } + + FORCEINLINE fltx4 & operator[](int idx) + { + return *((&x)+idx); + } + + FORCEINLINE void operator+=(FourVectors const &b) //< add 4 vectors to another 4 vectors + { + x=AddSIMD(x,b.x); + y=AddSIMD(y,b.y); + z=AddSIMD(z,b.z); + } + + FORCEINLINE void operator-=(FourVectors const &b) //< subtract 4 vectors from another 4 + { + x=SubSIMD(x,b.x); + y=SubSIMD(y,b.y); + z=SubSIMD(z,b.z); + } + + FORCEINLINE void operator*=(FourVectors const &b) //< scale all four vectors per component scale + { + x=MulSIMD(x,b.x); + y=MulSIMD(y,b.y); + z=MulSIMD(z,b.z); + } + + FORCEINLINE void operator*=(const fltx4 & scale) //< scale + { + x=MulSIMD(x,scale); + y=MulSIMD(y,scale); + z=MulSIMD(z,scale); + } + + FORCEINLINE void operator*=(float scale) //< uniformly scale all 4 vectors + { + fltx4 scalepacked = ReplicateX4(scale); + *this *= scalepacked; + } + + FORCEINLINE fltx4 operator*(FourVectors const &b) const //< 4 dot products + { + fltx4 dot=MulSIMD(x,b.x); + dot=MaddSIMD(y,b.y,dot); + dot=MaddSIMD(z,b.z,dot); + return dot; + } + + FORCEINLINE fltx4 
operator*(Vector const &b) const //< dot product all 4 vectors with 1 vector + { + fltx4 dot=MulSIMD(x,ReplicateX4(b.x)); + dot=MaddSIMD(y,ReplicateX4(b.y), dot); + dot=MaddSIMD(z,ReplicateX4(b.z), dot); + return dot; + } + + FORCEINLINE void VProduct(FourVectors const &b) //< component by component mul + { + x=MulSIMD(x,b.x); + y=MulSIMD(y,b.y); + z=MulSIMD(z,b.z); + } + FORCEINLINE void MakeReciprocal(void) //< (x,y,z)=(1/x,1/y,1/z) + { + x=ReciprocalSIMD(x); + y=ReciprocalSIMD(y); + z=ReciprocalSIMD(z); + } + + FORCEINLINE void MakeReciprocalSaturate(void) //< (x,y,z)=(1/x,1/y,1/z), 1/0=1.0e23 + { + x=ReciprocalSaturateSIMD(x); + y=ReciprocalSaturateSIMD(y); + z=ReciprocalSaturateSIMD(z); + } + + // Assume the given matrix is a rotation, and rotate these vectors by it. + // If you have a long list of FourVectors structures that you all want + // to rotate by the same matrix, use FourVectors::RotateManyBy() instead. + inline void RotateBy(const matrix3x4_t& matrix); + + /// You can use this to rotate a long array of FourVectors all by the same + /// matrix. The first parameter is the head of the array. The second is the + /// number of vectors to rotate. The third is the matrix. + static void RotateManyBy(FourVectors * RESTRICT pVectors, unsigned int numVectors, const matrix3x4_t& rotationMatrix ); + + /// Assume the vectors are points, and transform them in place by the matrix. + inline void TransformBy(const matrix3x4_t& matrix); + + /// You can use this to Transform a long array of FourVectors all by the same + /// matrix. The first parameter is the head of the array. The second is the + /// number of vectors to rotate. The third is the matrix. The fourth is the + /// output buffer, which must not overlap the pVectors buffer. This is not + /// an in-place transformation. 
+ static void TransformManyBy(FourVectors * RESTRICT pVectors, unsigned int numVectors, const matrix3x4_t& rotationMatrix, FourVectors * RESTRICT pOut ); + + /// You can use this to Transform a long array of FourVectors all by the same + /// matrix. The first parameter is the head of the array. The second is the + /// number of vectors to rotate. The third is the matrix. The fourth is the + /// output buffer, which must not overlap the pVectors buffer. + /// This is an in-place transformation. + static void TransformManyBy(FourVectors * RESTRICT pVectors, unsigned int numVectors, const matrix3x4_t& rotationMatrix ); + + // X(),Y(),Z() - get at the desired component of the i'th (0..3) vector. + FORCEINLINE const float & X(int idx) const + { + // NOTE: if the output goes into a register, this causes a Load-Hit-Store stall (don't mix fpu/vpu math!) + return SubFloat( (fltx4 &)x, idx ); + } + + FORCEINLINE const float & Y(int idx) const + { + return SubFloat( (fltx4 &)y, idx ); + } + + FORCEINLINE const float & Z(int idx) const + { + return SubFloat( (fltx4 &)z, idx ); + } + + FORCEINLINE float & X(int idx) + { + return SubFloat( x, idx ); + } + + FORCEINLINE float & Y(int idx) + { + return SubFloat( y, idx ); + } + + FORCEINLINE float & Z(int idx) + { + return SubFloat( z, idx ); + } + + FORCEINLINE Vector Vec(int idx) const //< unpack one of the vectors + { + return Vector( X(idx), Y(idx), Z(idx) ); + } + + FourVectors(void) + { + } + + FourVectors( FourVectors const &src ) + { + x=src.x; + y=src.y; + z=src.z; + } + + FORCEINLINE void operator=( FourVectors const &src ) + { + x=src.x; + y=src.y; + z=src.z; + } + + /// LoadAndSwizzle - load 4 Vectors into a FourVectors, performing transpose op + FORCEINLINE void LoadAndSwizzle(Vector const &a, Vector const &b, Vector const &c, Vector const &d) + { + // TransposeSIMD has large sub-expressions that the compiler can't eliminate on x360 + // use an unfolded implementation here +#if _X360 + fltx4 tx = LoadUnalignedSIMD( 
&a.x ); + fltx4 ty = LoadUnalignedSIMD( &b.x ); + fltx4 tz = LoadUnalignedSIMD( &c.x ); + fltx4 tw = LoadUnalignedSIMD( &d.x ); + fltx4 r0 = __vmrghw(tx, tz); + fltx4 r1 = __vmrghw(ty, tw); + fltx4 r2 = __vmrglw(tx, tz); + fltx4 r3 = __vmrglw(ty, tw); + + x = __vmrghw(r0, r1); + y = __vmrglw(r0, r1); + z = __vmrghw(r2, r3); +#else + x = LoadUnalignedSIMD( &( a.x )); + y = LoadUnalignedSIMD( &( b.x )); + z = LoadUnalignedSIMD( &( c.x )); + fltx4 w = LoadUnalignedSIMD( &( d.x )); + // now, matrix is: + // x y z ? + // x y z ? + // x y z ? + // x y z ? + TransposeSIMD(x, y, z, w); +#endif + } + + /// LoadAndSwizzleAligned - load 4 Vectors into a FourVectors, performing transpose op. + /// all 4 vectors must be 128 bit boundary + FORCEINLINE void LoadAndSwizzleAligned(const float *RESTRICT a, const float *RESTRICT b, const float *RESTRICT c, const float *RESTRICT d) + { +#if _X360 + fltx4 tx = LoadAlignedSIMD(a); + fltx4 ty = LoadAlignedSIMD(b); + fltx4 tz = LoadAlignedSIMD(c); + fltx4 tw = LoadAlignedSIMD(d); + fltx4 r0 = __vmrghw(tx, tz); + fltx4 r1 = __vmrghw(ty, tw); + fltx4 r2 = __vmrglw(tx, tz); + fltx4 r3 = __vmrglw(ty, tw); + + x = __vmrghw(r0, r1); + y = __vmrglw(r0, r1); + z = __vmrghw(r2, r3); +#else + x = LoadAlignedSIMD( a ); + y = LoadAlignedSIMD( b ); + z = LoadAlignedSIMD( c ); + fltx4 w = LoadAlignedSIMD( d ); + // now, matrix is: + // x y z ? + // x y z ? + // x y z ? + // x y z ? + TransposeSIMD( x, y, z, w ); +#endif + } + + FORCEINLINE void LoadAndSwizzleAligned(Vector const &a, Vector const &b, Vector const &c, Vector const &d) + { + LoadAndSwizzleAligned( &a.x, &b.x, &c.x, &d.x ); + } + + /// return the squared length of all 4 vectors + FORCEINLINE fltx4 length2(void) const + { + return (*this)*(*this); + } + + /// return the approximate length of all 4 vectors. uses the sqrt approximation instruction + FORCEINLINE fltx4 length(void) const + { + return SqrtEstSIMD(length2()); + } + + /// normalize all 4 vectors in place. 
not mega-accurate (uses reciprocal approximation instruction) + FORCEINLINE void VectorNormalizeFast(void) + { + fltx4 mag_sq=(*this)*(*this); // length^2 + (*this) *= ReciprocalSqrtEstSIMD(mag_sq); // *(1.0/sqrt(length^2)) + } + + /// normalize all 4 vectors in place. + FORCEINLINE void VectorNormalize(void) + { + fltx4 mag_sq=(*this)*(*this); // length^2 + (*this) *= ReciprocalSqrtSIMD(mag_sq); // *(1.0/sqrt(length^2)) + } + + /// construct a FourVectors from 4 separate Vectors + FORCEINLINE FourVectors(Vector const &a, Vector const &b, Vector const &c, Vector const &d) + { + LoadAndSwizzle(a,b,c,d); + } + + /// construct a FourVectors from 4 separate Vectors + FORCEINLINE FourVectors(VectorAligned const &a, VectorAligned const &b, VectorAligned const &c, VectorAligned const &d) + { + LoadAndSwizzleAligned(a,b,c,d); + } + + FORCEINLINE fltx4 DistToSqr( FourVectors const &pnt ) + { + fltx4 fl4dX = SubSIMD( pnt.x, x ); + fltx4 fl4dY = SubSIMD( pnt.y, y ); + fltx4 fl4dZ = SubSIMD( pnt.z, z ); + return AddSIMD( MulSIMD( fl4dX, fl4dX), AddSIMD( MulSIMD( fl4dY, fl4dY ), MulSIMD( fl4dZ, fl4dZ ) ) ); + + } + + FORCEINLINE fltx4 TValueOfClosestPointOnLine( FourVectors const &p0, FourVectors const &p1 ) const + { + FourVectors lineDelta = p1; + lineDelta -= p0; + fltx4 OOlineDirDotlineDir = ReciprocalSIMD( p1 * p1 ); + FourVectors v4OurPnt = *this; + v4OurPnt -= p0; + return MulSIMD( OOlineDirDotlineDir, v4OurPnt * lineDelta ); + } + + FORCEINLINE fltx4 DistSqrToLineSegment( FourVectors const &p0, FourVectors const &p1 ) const + { + FourVectors lineDelta = p1; + FourVectors v4OurPnt = *this; + v4OurPnt -= p0; + lineDelta -= p0; + + fltx4 OOlineDirDotlineDir = ReciprocalSIMD( lineDelta * lineDelta ); + + fltx4 fl4T = MulSIMD( OOlineDirDotlineDir, v4OurPnt * lineDelta ); + + fl4T = MinSIMD( fl4T, Four_Ones ); + fl4T = MaxSIMD( fl4T, Four_Zeros ); + lineDelta *= fl4T; + return v4OurPnt.DistToSqr( lineDelta ); + } + +}; + +/// form 4 cross products +inline FourVectors operator 
^(const FourVectors &a, const FourVectors &b) +{ + FourVectors ret; + ret.x=SubSIMD(MulSIMD(a.y,b.z),MulSIMD(a.z,b.y)); + ret.y=SubSIMD(MulSIMD(a.z,b.x),MulSIMD(a.x,b.z)); + ret.z=SubSIMD(MulSIMD(a.x,b.y),MulSIMD(a.y,b.x)); + return ret; +} + +/// component-by-componentwise MAX operator +inline FourVectors maximum(const FourVectors &a, const FourVectors &b) +{ + FourVectors ret; + ret.x=MaxSIMD(a.x,b.x); + ret.y=MaxSIMD(a.y,b.y); + ret.z=MaxSIMD(a.z,b.z); + return ret; +} + +/// component-by-componentwise MIN operator +inline FourVectors minimum(const FourVectors &a, const FourVectors &b) +{ + FourVectors ret; + ret.x=MinSIMD(a.x,b.x); + ret.y=MinSIMD(a.y,b.y); + ret.z=MinSIMD(a.z,b.z); + return ret; +} + +/// calculate reflection vector. incident and normal dir assumed normalized +FORCEINLINE FourVectors VectorReflect( const FourVectors &incident, const FourVectors &normal ) +{ + FourVectors ret = incident; + fltx4 iDotNx2 = incident * normal; + iDotNx2 = AddSIMD( iDotNx2, iDotNx2 ); + FourVectors nPart = normal; + nPart *= iDotNx2; + ret -= nPart; // i-2(n*i)n + return ret; +} + +/// calculate slide vector. removes all components of a vector which are perpendicular to a normal vector. +FORCEINLINE FourVectors VectorSlide( const FourVectors &incident, const FourVectors &normal ) +{ + FourVectors ret = incident; + fltx4 iDotN = incident * normal; + FourVectors nPart = normal; + nPart *= iDotN; + ret -= nPart; // i-(n*i)n + return ret; +} + + +// Assume the given matrix is a rotation, and rotate these vectors by it. +// If you have a long list of FourVectors structures that you all want +// to rotate by the same matrix, use FourVectors::RotateManyBy() instead. +void FourVectors::RotateBy(const matrix3x4_t& matrix) +{ + // Splat out each of the entries in the matrix to a fltx4. Do this + // in the order that we will need them, to hide latency. I'm + // avoiding making an array of them, so that they'll remain in + // registers. 
+ fltx4 matSplat00, matSplat01, matSplat02, + matSplat10, matSplat11, matSplat12, + matSplat20, matSplat21, matSplat22; + + { + // Load the matrix into local vectors. Sadly, matrix3x4_ts are + // often unaligned. The w components will be the tranpose row of + // the matrix, but we don't really care about that. + fltx4 matCol0 = LoadUnalignedSIMD( matrix[0] ); + fltx4 matCol1 = LoadUnalignedSIMD( matrix[1] ); + fltx4 matCol2 = LoadUnalignedSIMD( matrix[2] ); + + matSplat00 = SplatXSIMD( matCol0 ); + matSplat01 = SplatYSIMD( matCol0 ); + matSplat02 = SplatZSIMD( matCol0 ); + + matSplat10 = SplatXSIMD( matCol1 ); + matSplat11 = SplatYSIMD( matCol1 ); + matSplat12 = SplatZSIMD( matCol1 ); + + matSplat20 = SplatXSIMD( matCol2 ); + matSplat21 = SplatYSIMD( matCol2 ); + matSplat22 = SplatZSIMD( matCol2 ); + } + + // Trust in the compiler to schedule these operations correctly: + fltx4 outX, outY, outZ; + outX = AddSIMD( AddSIMD( MulSIMD( x, matSplat00 ), MulSIMD( y, matSplat01 ) ), MulSIMD( z, matSplat02 ) ); + outY = AddSIMD( AddSIMD( MulSIMD( x, matSplat10 ), MulSIMD( y, matSplat11 ) ), MulSIMD( z, matSplat12 ) ); + outZ = AddSIMD( AddSIMD( MulSIMD( x, matSplat20 ), MulSIMD( y, matSplat21 ) ), MulSIMD( z, matSplat22 ) ); + + x = outX; + y = outY; + z = outZ; +} + +// Assume the given matrix is a rotation, and rotate these vectors by it. +// If you have a long list of FourVectors structures that you all want +// to rotate by the same matrix, use FourVectors::RotateManyBy() instead. +void FourVectors::TransformBy(const matrix3x4_t& matrix) +{ + // Splat out each of the entries in the matrix to a fltx4. Do this + // in the order that we will need them, to hide latency. I'm + // avoiding making an array of them, so that they'll remain in + // registers. + fltx4 matSplat00, matSplat01, matSplat02, + matSplat10, matSplat11, matSplat12, + matSplat20, matSplat21, matSplat22; + + { + // Load the matrix into local vectors. Sadly, matrix3x4_ts are + // often unaligned. 
The w components will be the tranpose row of + // the matrix, but we don't really care about that. + fltx4 matCol0 = LoadUnalignedSIMD( matrix[0] ); + fltx4 matCol1 = LoadUnalignedSIMD( matrix[1] ); + fltx4 matCol2 = LoadUnalignedSIMD( matrix[2] ); + + matSplat00 = SplatXSIMD( matCol0 ); + matSplat01 = SplatYSIMD( matCol0 ); + matSplat02 = SplatZSIMD( matCol0 ); + + matSplat10 = SplatXSIMD( matCol1 ); + matSplat11 = SplatYSIMD( matCol1 ); + matSplat12 = SplatZSIMD( matCol1 ); + + matSplat20 = SplatXSIMD( matCol2 ); + matSplat21 = SplatYSIMD( matCol2 ); + matSplat22 = SplatZSIMD( matCol2 ); + } + + // Trust in the compiler to schedule these operations correctly: + fltx4 outX, outY, outZ; + + outX = MaddSIMD( z, matSplat02, AddSIMD( MulSIMD( x, matSplat00 ), MulSIMD( y, matSplat01 ) ) ); + outY = MaddSIMD( z, matSplat12, AddSIMD( MulSIMD( x, matSplat10 ), MulSIMD( y, matSplat11 ) ) ); + outZ = MaddSIMD( z, matSplat22, AddSIMD( MulSIMD( x, matSplat20 ), MulSIMD( y, matSplat21 ) ) ); + + x = AddSIMD( outX, ReplicateX4( matrix[0][3] )); + y = AddSIMD( outY, ReplicateX4( matrix[1][3] )); + z = AddSIMD( outZ, ReplicateX4( matrix[2][3] )); +} + + + +/// quick, low quality perlin-style noise() function suitable for real time use. +/// return value is -1..1. Only reliable around +/- 1 million or so. +fltx4 NoiseSIMD( const fltx4 & x, const fltx4 & y, const fltx4 & z ); +fltx4 NoiseSIMD( FourVectors const &v ); + +// vector valued noise direction +FourVectors DNoiseSIMD( FourVectors const &v ); + +// vector value "curl" noise function. 
see http://hyperphysics.phy-astr.gsu.edu/hbase/curl.html +FourVectors CurlNoiseSIMD( FourVectors const &v ); + + +/// calculate the absolute value of a packed single +inline fltx4 fabs( const fltx4 & x ) +{ + return AndSIMD( x, LoadAlignedSIMD( g_SIMD_clear_signmask ) ); +} + +/// negate all four components of a SIMD packed single +inline fltx4 fnegate( const fltx4 & x ) +{ + return XorSIMD( x, LoadAlignedSIMD( g_SIMD_signmask ) ); +} + + +fltx4 Pow_FixedPoint_Exponent_SIMD( const fltx4 & x, int exponent); + +// PowSIMD - raise a SIMD register to a power. This is analogous to the C pow() function, with some +// restictions: fractional exponents are only handled with 2 bits of precision. Basically, +// fractions of 0,.25,.5, and .75 are handled. PowSIMD(x,.30) will be the same as PowSIMD(x,.25). +// negative and fractional powers are handled by the SIMD reciprocal and square root approximation +// instructions and so are not especially accurate ----Note that this routine does not raise +// numeric exceptions because it uses SIMD--- This routine is O(log2(exponent)). +inline fltx4 PowSIMD( const fltx4 & x, float exponent ) +{ + return Pow_FixedPoint_Exponent_SIMD(x,(int) (4.0*exponent)); +} + + + +// random number generation - generate 4 random numbers quickly. 
+ +void SeedRandSIMD(uint32 seed); // seed the random # generator +fltx4 RandSIMD( int nContext = 0 ); // return 4 numbers in the 0..1 range + +// for multithreaded, you need to use these and use the argument form of RandSIMD: +int GetSIMDRandContext( void ); +void ReleaseSIMDRandContext( int nContext ); + +FORCEINLINE fltx4 RandSignedSIMD( void ) // -1..1 +{ + return SubSIMD( MulSIMD( Four_Twos, RandSIMD() ), Four_Ones ); +} + + +// SIMD versions of mathlib simplespline functions +// hermite basis function for smooth interpolation +// Similar to Gain() above, but very cheap to call +// value should be between 0 & 1 inclusive +inline fltx4 SimpleSpline( const fltx4 & value ) +{ + // Arranged to avoid a data dependency between these two MULs: + fltx4 valueDoubled = MulSIMD( value, Four_Twos ); + fltx4 valueSquared = MulSIMD( value, value ); + + // Nice little ease-in, ease-out spline-like curve + return SubSIMD( + MulSIMD( Four_Threes, valueSquared ), + MulSIMD( valueDoubled, valueSquared ) ); +} + +// remaps a value in [startInterval, startInterval+rangeInterval] from linear to +// spline using SimpleSpline +inline fltx4 SimpleSplineRemapValWithDeltas( const fltx4 & val, + const fltx4 & A, const fltx4 & BMinusA, + const fltx4 & OneOverBMinusA, const fltx4 & C, + const fltx4 & DMinusC ) +{ +// if ( A == B ) +// return val >= B ? D : C; + fltx4 cVal = MulSIMD( SubSIMD( val, A), OneOverBMinusA ); + return AddSIMD( C, MulSIMD( DMinusC, SimpleSpline( cVal ) ) ); +} + +inline fltx4 SimpleSplineRemapValWithDeltasClamped( const fltx4 & val, + const fltx4 & A, const fltx4 & BMinusA, + const fltx4 & OneOverBMinusA, const fltx4 & C, + const fltx4 & DMinusC ) +{ +// if ( A == B ) +// return val >= B ? 
D : C; + fltx4 cVal = MulSIMD( SubSIMD( val, A), OneOverBMinusA ); + cVal = MinSIMD( Four_Ones, MaxSIMD( Four_Zeros, cVal ) ); + return AddSIMD( C, MulSIMD( DMinusC, SimpleSpline( cVal ) ) ); +} + +FORCEINLINE fltx4 FracSIMD( const fltx4 &val ) +{ + fltx4 fl4Abs = fabs( val ); + fltx4 ival = SubSIMD( AddSIMD( fl4Abs, Four_2ToThe23s ), Four_2ToThe23s ); + ival = MaskedAssign( CmpGtSIMD( ival, fl4Abs ), SubSIMD( ival, Four_Ones ), ival ); + return XorSIMD( SubSIMD( fl4Abs, ival ), XorSIMD( val, fl4Abs ) ); // restore sign bits +} + +FORCEINLINE fltx4 Mod2SIMD( const fltx4 &val ) +{ + fltx4 fl4Abs = fabs( val ); + fltx4 ival = SubSIMD( AndSIMD( LoadAlignedSIMD( (float *) g_SIMD_lsbmask ), AddSIMD( fl4Abs, Four_2ToThe23s ) ), Four_2ToThe23s ); + ival = MaskedAssign( CmpGtSIMD( ival, fl4Abs ), SubSIMD( ival, Four_Twos ), ival ); + return XorSIMD( SubSIMD( fl4Abs, ival ), XorSIMD( val, fl4Abs ) ); // restore sign bits +} + +FORCEINLINE fltx4 Mod2SIMDPositiveInput( const fltx4 &val ) +{ + fltx4 ival = SubSIMD( AndSIMD( LoadAlignedSIMD( g_SIMD_lsbmask ), AddSIMD( val, Four_2ToThe23s ) ), Four_2ToThe23s ); + ival = MaskedAssign( CmpGtSIMD( ival, val ), SubSIMD( ival, Four_Twos ), ival ); + return SubSIMD( val, ival ); +} + + +// approximate sin of an angle, with -1..1 representing the whole sin wave period instead of -pi..pi. +// no range reduction is done - for values outside of 0..1 you won't like the results +FORCEINLINE fltx4 _SinEst01SIMD( const fltx4 &val ) +{ + // really rough approximation - x*(4-x*4) - a parabola. s(0) = 0, s(.5) = 1, s(1)=0, smooth in-between. + // sufficient for simple oscillation. + return MulSIMD( val, SubSIMD( Four_Fours, MulSIMD( val, Four_Fours ) ) ); +} + +FORCEINLINE fltx4 _Sin01SIMD( const fltx4 &val ) +{ + // not a bad approximation : parabola always over-estimates. Squared parabola always + // underestimates. 
So lets blend between them: goodsin = badsin + .225*( badsin^2-badsin) + fltx4 fl4BadEst = MulSIMD( val, SubSIMD( Four_Fours, MulSIMD( val, Four_Fours ) ) ); + return AddSIMD( MulSIMD( Four_Point225s, SubSIMD( MulSIMD( fl4BadEst, fl4BadEst ), fl4BadEst ) ), fl4BadEst ); +} + +// full range useable implementations +FORCEINLINE fltx4 SinEst01SIMD( const fltx4 &val ) +{ + fltx4 fl4Abs = fabs( val ); + fltx4 fl4Reduced2 = Mod2SIMDPositiveInput( fl4Abs ); + fltx4 fl4OddMask = CmpGeSIMD( fl4Reduced2, Four_Ones ); + fltx4 fl4val = SubSIMD( fl4Reduced2, AndSIMD( Four_Ones, fl4OddMask ) ); + fltx4 fl4Sin = _SinEst01SIMD( fl4val ); + fl4Sin = XorSIMD( fl4Sin, AndSIMD( LoadAlignedSIMD( g_SIMD_signmask ), XorSIMD( val, fl4OddMask ) ) ); + return fl4Sin; + +} + +FORCEINLINE fltx4 Sin01SIMD( const fltx4 &val ) +{ + fltx4 fl4Abs = fabs( val ); + fltx4 fl4Reduced2 = Mod2SIMDPositiveInput( fl4Abs ); + fltx4 fl4OddMask = CmpGeSIMD( fl4Reduced2, Four_Ones ); + fltx4 fl4val = SubSIMD( fl4Reduced2, AndSIMD( Four_Ones, fl4OddMask ) ); + fltx4 fl4Sin = _Sin01SIMD( fl4val ); + fl4Sin = XorSIMD( fl4Sin, AndSIMD( LoadAlignedSIMD( g_SIMD_signmask ), XorSIMD( val, fl4OddMask ) ) ); + return fl4Sin; + +} + +// Schlick style Bias approximation see graphics gems 4 : bias(t,a)= t/( (1/a-2)*(1-t)+1) + +FORCEINLINE fltx4 PreCalcBiasParameter( const fltx4 &bias_parameter ) +{ + // convert perlin-style-bias parameter to the value right for the approximation + return SubSIMD( ReciprocalSIMD( bias_parameter ), Four_Twos ); +} + +FORCEINLINE fltx4 BiasSIMD( const fltx4 &val, const fltx4 &precalc_param ) +{ + // similar to bias function except pass precalced bias value from calling PreCalcBiasParameter. + + //!!speed!! use reciprocal est? + //!!speed!! 
could save one op by precalcing _2_ values + return DivSIMD( val, AddSIMD( MulSIMD( precalc_param, SubSIMD( Four_Ones, val ) ), Four_Ones ) ); +} + +//----------------------------------------------------------------------------- +// Box/plane test +// NOTE: The w component of emins + emaxs must be 1 for this to work +//----------------------------------------------------------------------------- +FORCEINLINE int BoxOnPlaneSideSIMD( const fltx4& emins, const fltx4& emaxs, const cplane_t *p, float tolerance = 0.f ) +{ + fltx4 corners[2]; + fltx4 normal = LoadUnalignedSIMD( p->normal.Base() ); + fltx4 dist = ReplicateX4( -p->dist ); + normal = SetWSIMD( normal, dist ); + fltx4 t4 = ReplicateX4( tolerance ); + fltx4 negt4 = ReplicateX4( -tolerance ); + fltx4 cmp = CmpGeSIMD( normal, Four_Zeros ); + corners[0] = MaskedAssign( cmp, emaxs, emins ); + corners[1] = MaskedAssign( cmp, emins, emaxs ); + fltx4 dot1 = Dot4SIMD( normal, corners[0] ); + fltx4 dot2 = Dot4SIMD( normal, corners[1] ); + cmp = CmpGeSIMD( dot1, t4 ); + fltx4 cmp2 = CmpGtSIMD( negt4, dot2 ); + fltx4 result = MaskedAssign( cmp, Four_Ones, Four_Zeros ); + fltx4 result2 = MaskedAssign( cmp2, Four_Twos, Four_Zeros ); + result = AddSIMD( result, result2 ); + intx4 sides; + ConvertStoreAsIntsSIMD( &sides, result ); + return sides[0]; +} + +#endif // _ssemath_h |