aboutsummaryrefslogtreecommitdiff
path: root/sp/src/mathlib/sse.cpp
diff options
context:
space:
mode:
authorJørgen P. Tjernø <[email protected]>2013-12-02 19:31:46 -0800
committerJørgen P. Tjernø <[email protected]>2013-12-02 19:46:31 -0800
commitf56bb35301836e56582a575a75864392a0177875 (patch)
treede61ddd39de3e7df52759711950b4c288592f0dc /sp/src/mathlib/sse.cpp
parentMark some more files as text. (diff)
downloadsource-sdk-2013-f56bb35301836e56582a575a75864392a0177875.tar.xz
source-sdk-2013-f56bb35301836e56582a575a75864392a0177875.zip
Fix line endings. WHAMMY.
Diffstat (limited to 'sp/src/mathlib/sse.cpp')
-rw-r--r--sp/src/mathlib/sse.cpp2190
1 files changed, 1095 insertions, 1095 deletions
diff --git a/sp/src/mathlib/sse.cpp b/sp/src/mathlib/sse.cpp
index 6e41683f..a2445c79 100644
--- a/sp/src/mathlib/sse.cpp
+++ b/sp/src/mathlib/sse.cpp
@@ -1,1095 +1,1095 @@
-//========= Copyright Valve Corporation, All rights reserved. ============//
-//
-// Purpose: SSE Math primitives.
-//
-//=====================================================================================//
-
-#include <math.h>
-#include <float.h> // Needed for FLT_EPSILON
-#include "basetypes.h"
-#include <memory.h>
-#include "tier0/dbg.h"
-#include "mathlib/mathlib.h"
-#include "mathlib/vector.h"
-#include "sse.h"
-
-// memdbgon must be the last include file in a .cpp file!!!
-#include "tier0/memdbgon.h"
-
-#ifndef COMPILER_MSVC64
-// Implement for 64-bit Windows if needed.
-
-static const uint32 _sincos_masks[] = { (uint32)0x0, (uint32)~0x0 };
-static const uint32 _sincos_inv_masks[] = { (uint32)~0x0, (uint32)0x0 };
-
-//-----------------------------------------------------------------------------
-// Macros and constants required by some of the SSE assembly:
-//-----------------------------------------------------------------------------
-
-#ifdef _WIN32
- #define _PS_EXTERN_CONST(Name, Val) \
- const __declspec(align(16)) float _ps_##Name[4] = { Val, Val, Val, Val }
-
- #define _PS_EXTERN_CONST_TYPE(Name, Type, Val) \
- const __declspec(align(16)) Type _ps_##Name[4] = { Val, Val, Val, Val }; \
-
- #define _EPI32_CONST(Name, Val) \
- static const __declspec(align(16)) __int32 _epi32_##Name[4] = { Val, Val, Val, Val }
-
- #define _PS_CONST(Name, Val) \
- static const __declspec(align(16)) float _ps_##Name[4] = { Val, Val, Val, Val }
-#elif POSIX
- #define _PS_EXTERN_CONST(Name, Val) \
- const float _ps_##Name[4] __attribute__((aligned(16))) = { Val, Val, Val, Val }
-
- #define _PS_EXTERN_CONST_TYPE(Name, Type, Val) \
- const Type _ps_##Name[4] __attribute__((aligned(16))) = { Val, Val, Val, Val }; \
-
- #define _EPI32_CONST(Name, Val) \
- static const int32 _epi32_##Name[4] __attribute__((aligned(16))) = { Val, Val, Val, Val }
-
- #define _PS_CONST(Name, Val) \
- static const float _ps_##Name[4] __attribute__((aligned(16))) = { Val, Val, Val, Val }
-#endif
-
-_PS_EXTERN_CONST(am_0, 0.0f);
-_PS_EXTERN_CONST(am_1, 1.0f);
-_PS_EXTERN_CONST(am_m1, -1.0f);
-_PS_EXTERN_CONST(am_0p5, 0.5f);
-_PS_EXTERN_CONST(am_1p5, 1.5f);
-_PS_EXTERN_CONST(am_pi, (float)M_PI);
-_PS_EXTERN_CONST(am_pi_o_2, (float)(M_PI / 2.0));
-_PS_EXTERN_CONST(am_2_o_pi, (float)(2.0 / M_PI));
-_PS_EXTERN_CONST(am_pi_o_4, (float)(M_PI / 4.0));
-_PS_EXTERN_CONST(am_4_o_pi, (float)(4.0 / M_PI));
-_PS_EXTERN_CONST_TYPE(am_sign_mask, int32, 0x80000000);
-_PS_EXTERN_CONST_TYPE(am_inv_sign_mask, int32, ~0x80000000);
-_PS_EXTERN_CONST_TYPE(am_min_norm_pos,int32, 0x00800000);
-_PS_EXTERN_CONST_TYPE(am_mant_mask, int32, 0x7f800000);
-_PS_EXTERN_CONST_TYPE(am_inv_mant_mask, int32, ~0x7f800000);
-
-_EPI32_CONST(1, 1);
-_EPI32_CONST(2, 2);
-
-_PS_CONST(sincos_p0, 0.15707963267948963959e1f);
-_PS_CONST(sincos_p1, -0.64596409750621907082e0f);
-_PS_CONST(sincos_p2, 0.7969262624561800806e-1f);
-_PS_CONST(sincos_p3, -0.468175413106023168e-2f);
-
-#ifdef PFN_VECTORMA
-void __cdecl _SSE_VectorMA( const float *start, float scale, const float *direction, float *dest );
-#endif
-
-//-----------------------------------------------------------------------------
-// SSE implementations of optimized routines:
-//-----------------------------------------------------------------------------
-float _SSE_Sqrt(float x)
-{
- Assert( s_bMathlibInitialized );
- float root = 0.f;
-#ifdef _WIN32
- _asm
- {
- sqrtss xmm0, x
- movss root, xmm0
- }
-#elif POSIX
- _mm_store_ss( &root, _mm_sqrt_ss( _mm_load_ss( &x ) ) );
-#endif
- return root;
-}
-
-// Single iteration NewtonRaphson reciprocal square root:
-// 0.5 * rsqrtps * (3 - x * rsqrtps(x) * rsqrtps(x))
-// Very low error, and fine to use in place of 1.f / sqrtf(x).
-#if 0
-float _SSE_RSqrtAccurate(float x)
-{
- Assert( s_bMathlibInitialized );
-
- float rroot;
- _asm
- {
- rsqrtss xmm0, x
- movss rroot, xmm0
- }
-
- return (0.5f * rroot) * (3.f - (x * rroot) * rroot);
-}
-#else
-
-#ifdef POSIX
-const __m128 f3 = _mm_set_ss(3.0f); // 3 as SSE value
-const __m128 f05 = _mm_set_ss(0.5f); // 0.5 as SSE value
-#endif
-
-// Intel / Kipps SSE RSqrt. Significantly faster than above.
-float _SSE_RSqrtAccurate(float a)
-{
-
-#ifdef _WIN32
- float x;
- float half = 0.5f;
- float three = 3.f;
-
- __asm
- {
- movss xmm3, a;
- movss xmm1, half;
- movss xmm2, three;
- rsqrtss xmm0, xmm3;
-
- mulss xmm3, xmm0;
- mulss xmm1, xmm0;
- mulss xmm3, xmm0;
- subss xmm2, xmm3;
- mulss xmm1, xmm2;
-
- movss x, xmm1;
- }
-
- return x;
-#elif POSIX
- __m128 xx = _mm_load_ss( &a );
- __m128 xr = _mm_rsqrt_ss( xx );
- __m128 xt;
-
- xt = _mm_mul_ss( xr, xr );
- xt = _mm_mul_ss( xt, xx );
- xt = _mm_sub_ss( f3, xt );
- xt = _mm_mul_ss( xt, f05 );
- xr = _mm_mul_ss( xr, xt );
-
- _mm_store_ss( &a, xr );
- return a;
-#else
- #error "Not Implemented"
-#endif
-
-}
-#endif
-
-// Simple SSE rsqrt. Usually accurate to around 6 (relative) decimal places
-// or so, so ok for closed transforms. (ie, computing lighting normals)
-float _SSE_RSqrtFast(float x)
-{
- Assert( s_bMathlibInitialized );
-
- float rroot;
-#ifdef _WIN32
- _asm
- {
- rsqrtss xmm0, x
- movss rroot, xmm0
- }
-#elif POSIX
- __asm__ __volatile__( "rsqrtss %0, %1" : "=x" (rroot) : "x" (x) );
-#else
-#error
-#endif
-
- return rroot;
-}
-
-float FASTCALL _SSE_VectorNormalize (Vector& vec)
-{
- Assert( s_bMathlibInitialized );
-
- // NOTE: This is necessary to prevent an memory overwrite...
- // sice vec only has 3 floats, we can't "movaps" directly into it.
-#ifdef _WIN32
- __declspec(align(16)) float result[4];
-#elif POSIX
- float result[4] __attribute__((aligned(16)));
-#endif
-
- float *v = &vec[0];
- float *r = &result[0];
-
- float radius = 0.f;
- // Blah, get rid of these comparisons ... in reality, if you have all 3 as zero, it shouldn't
- // be much of a performance win, considering you will very likely miss 3 branch predicts in a row.
- if ( v[0] || v[1] || v[2] )
- {
-#ifdef _WIN32
- _asm
- {
- mov eax, v
- mov edx, r
-#ifdef ALIGNED_VECTOR
- movaps xmm4, [eax] // r4 = vx, vy, vz, X
- movaps xmm1, xmm4 // r1 = r4
-#else
- movups xmm4, [eax] // r4 = vx, vy, vz, X
- movaps xmm1, xmm4 // r1 = r4
-#endif
- mulps xmm1, xmm4 // r1 = vx * vx, vy * vy, vz * vz, X
- movhlps xmm3, xmm1 // r3 = vz * vz, X, X, X
- movaps xmm2, xmm1 // r2 = r1
- shufps xmm2, xmm2, 1 // r2 = vy * vy, X, X, X
- addss xmm1, xmm2 // r1 = (vx * vx) + (vy * vy), X, X, X
- addss xmm1, xmm3 // r1 = (vx * vx) + (vy * vy) + (vz * vz), X, X, X
- sqrtss xmm1, xmm1 // r1 = sqrt((vx * vx) + (vy * vy) + (vz * vz)), X, X, X
- movss radius, xmm1 // radius = sqrt((vx * vx) + (vy * vy) + (vz * vz))
- rcpss xmm1, xmm1 // r1 = 1/radius, X, X, X
- shufps xmm1, xmm1, 0 // r1 = 1/radius, 1/radius, 1/radius, X
- mulps xmm4, xmm1 // r4 = vx * 1/radius, vy * 1/radius, vz * 1/radius, X
- movaps [edx], xmm4 // v = vx * 1/radius, vy * 1/radius, vz * 1/radius, X
- }
-#elif POSIX
- __asm__ __volatile__(
-#ifdef ALIGNED_VECTOR
- "movaps %2, %%xmm4 \n\t"
- "movaps %%xmm4, %%xmm1 \n\t"
-#else
- "movups %2, %%xmm4 \n\t"
- "movaps %%xmm4, %%xmm1 \n\t"
-#endif
- "mulps %%xmm4, %%xmm1 \n\t"
- "movhlps %%xmm1, %%xmm3 \n\t"
- "movaps %%xmm1, %%xmm2 \n\t"
- "shufps $1, %%xmm2, %%xmm2 \n\t"
- "addss %%xmm2, %%xmm1 \n\t"
- "addss %%xmm3, %%xmm1 \n\t"
- "sqrtss %%xmm1, %%xmm1 \n\t"
- "movss %%xmm1, %0 \n\t"
- "rcpss %%xmm1, %%xmm1 \n\t"
- "shufps $0, %%xmm1, %%xmm1 \n\t"
- "mulps %%xmm1, %%xmm4 \n\t"
- "movaps %%xmm4, %1 \n\t"
- : "=m" (radius), "=m" (result)
- : "m" (*v)
- );
-#else
- #error "Not Implemented"
-#endif
- vec.x = result[0];
- vec.y = result[1];
- vec.z = result[2];
-
- }
-
- return radius;
-}
-
-void FASTCALL _SSE_VectorNormalizeFast (Vector& vec)
-{
- float ool = _SSE_RSqrtAccurate( FLT_EPSILON + vec.x * vec.x + vec.y * vec.y + vec.z * vec.z );
-
- vec.x *= ool;
- vec.y *= ool;
- vec.z *= ool;
-}
-
-float _SSE_InvRSquared(const float* v)
-{
- float inv_r2 = 1.f;
-#ifdef _WIN32
- _asm { // Intel SSE only routine
- mov eax, v
- movss xmm5, inv_r2 // x5 = 1.0, 0, 0, 0
-#ifdef ALIGNED_VECTOR
- movaps xmm4, [eax] // x4 = vx, vy, vz, X
-#else
- movups xmm4, [eax] // x4 = vx, vy, vz, X
-#endif
- movaps xmm1, xmm4 // x1 = x4
- mulps xmm1, xmm4 // x1 = vx * vx, vy * vy, vz * vz, X
- movhlps xmm3, xmm1 // x3 = vz * vz, X, X, X
- movaps xmm2, xmm1 // x2 = x1
- shufps xmm2, xmm2, 1 // x2 = vy * vy, X, X, X
- addss xmm1, xmm2 // x1 = (vx * vx) + (vy * vy), X, X, X
- addss xmm1, xmm3 // x1 = (vx * vx) + (vy * vy) + (vz * vz), X, X, X
- maxss xmm1, xmm5 // x1 = max( 1.0, x1 )
- rcpss xmm0, xmm1 // x0 = 1 / max( 1.0, x1 )
- movss inv_r2, xmm0 // inv_r2 = x0
- }
-#elif POSIX
- __asm__ __volatile__(
- "movss %0, %%xmm5 \n\t"
-#ifdef ALIGNED_VECTOR
- "movaps %1, %%xmm4 \n\t"
-#else
- "movups %1, %%xmm4 \n\t"
-#endif
- "movaps %%xmm4, %%xmm1 \n\t"
- "mulps %%xmm4, %%xmm1 \n\t"
- "movhlps %%xmm1, %%xmm3 \n\t"
- "movaps %%xmm1, %%xmm2 \n\t"
- "shufps $1, %%xmm2, %%xmm2 \n\t"
- "addss %%xmm2, %%xmm1 \n\t"
- "addss %%xmm3, %%xmm1 \n\t"
- "maxss %%xmm5, %%xmm1 \n\t"
- "rcpss %%xmm1, %%xmm0 \n\t"
- "movss %%xmm0, %0 \n\t"
- : "=m" (inv_r2)
- : "m" (*v), "0" (inv_r2)
- );
-#else
- #error "Not Implemented"
-#endif
-
- return inv_r2;
-}
-
-
-#ifdef POSIX
-// #define _PS_CONST(Name, Val) static const ALIGN16 float _ps_##Name[4] ALIGN16_POST = { Val, Val, Val, Val }
-#define _PS_CONST_TYPE(Name, Type, Val) static const ALIGN16 Type _ps_##Name[4] ALIGN16_POST = { Val, Val, Val, Val }
-
-_PS_CONST_TYPE(sign_mask, int, 0x80000000);
-_PS_CONST_TYPE(inv_sign_mask, int, ~0x80000000);
-
-
-#define _PI32_CONST(Name, Val) static const ALIGN16 int _pi32_##Name[4] ALIGN16_POST = { Val, Val, Val, Val }
-
-_PI32_CONST(1, 1);
-_PI32_CONST(inv1, ~1);
-_PI32_CONST(2, 2);
-_PI32_CONST(4, 4);
-_PI32_CONST(0x7f, 0x7f);
-_PS_CONST(1 , 1.0f);
-_PS_CONST(0p5, 0.5f);
-
-_PS_CONST(minus_cephes_DP1, -0.78515625);
-_PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
-_PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
-_PS_CONST(sincof_p0, -1.9515295891E-4);
-_PS_CONST(sincof_p1, 8.3321608736E-3);
-_PS_CONST(sincof_p2, -1.6666654611E-1);
-_PS_CONST(coscof_p0, 2.443315711809948E-005);
-_PS_CONST(coscof_p1, -1.388731625493765E-003);
-_PS_CONST(coscof_p2, 4.166664568298827E-002);
-_PS_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI
-
-typedef union xmm_mm_union {
- __m128 xmm;
- __m64 mm[2];
-} xmm_mm_union;
-
-#define COPY_MM_TO_XMM(mm0_, mm1_, xmm_) { xmm_mm_union u; u.mm[0]=mm0_; u.mm[1]=mm1_; xmm_ = u.xmm; }
-
-typedef __m128 v4sf; // vector of 4 float (sse1)
-typedef __m64 v2si; // vector of 2 int (mmx)
-
-#endif
-
-void _SSE_SinCos(float x, float* s, float* c)
-{
-#ifdef _WIN32
- float t4, t8, t12;
-
- __asm
- {
- movss xmm0, x
- movss t12, xmm0
- movss xmm1, _ps_am_inv_sign_mask
- mov eax, t12
- mulss xmm0, _ps_am_2_o_pi
- andps xmm0, xmm1
- and eax, 0x80000000
-
- cvttss2si edx, xmm0
- mov ecx, edx
- mov t12, esi
- mov esi, edx
- add edx, 0x1
- shl ecx, (31 - 1)
- shl edx, (31 - 1)
-
- movss xmm4, _ps_am_1
- cvtsi2ss xmm3, esi
- mov t8, eax
- and esi, 0x1
-
- subss xmm0, xmm3
- movss xmm3, _sincos_inv_masks[esi * 4]
- minss xmm0, xmm4
-
- subss xmm4, xmm0
-
- movss xmm6, xmm4
- andps xmm4, xmm3
- and ecx, 0x80000000
- movss xmm2, xmm3
- andnps xmm3, xmm0
- and edx, 0x80000000
- movss xmm7, t8
- andps xmm0, xmm2
- mov t8, ecx
- mov t4, edx
- orps xmm4, xmm3
-
- mov eax, s //mov eax, [esp + 4 + 16]
- mov edx, c //mov edx, [esp + 4 + 16 + 4]
-
- andnps xmm2, xmm6
- orps xmm0, xmm2
-
- movss xmm2, t8
- movss xmm1, xmm0
- movss xmm5, xmm4
- xorps xmm7, xmm2
- movss xmm3, _ps_sincos_p3
- mulss xmm0, xmm0
- mulss xmm4, xmm4
- movss xmm2, xmm0
- movss xmm6, xmm4
- orps xmm1, xmm7
- movss xmm7, _ps_sincos_p2
- mulss xmm0, xmm3
- mulss xmm4, xmm3
- movss xmm3, _ps_sincos_p1
- addss xmm0, xmm7
- addss xmm4, xmm7
- movss xmm7, _ps_sincos_p0
- mulss xmm0, xmm2
- mulss xmm4, xmm6
- addss xmm0, xmm3
- addss xmm4, xmm3
- movss xmm3, t4
- mulss xmm0, xmm2
- mulss xmm4, xmm6
- orps xmm5, xmm3
- mov esi, t12
- addss xmm0, xmm7
- addss xmm4, xmm7
- mulss xmm0, xmm1
- mulss xmm4, xmm5
-
- // use full stores since caller might reload with full loads
- movss [eax], xmm0
- movss [edx], xmm4
- }
-#elif POSIX
-
- Assert( "Needs testing, verify impl!\n" );
-
- v4sf xx = _mm_load_ss( &x );
-
- v4sf xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
- v2si mm0, mm1, mm2, mm3, mm4, mm5;
- sign_bit_sin = xx;
- /* take the absolute value */
- xx = _mm_and_ps(xx, *(v4sf*)_ps_inv_sign_mask);
- /* extract the sign bit (upper one) */
- sign_bit_sin = _mm_and_ps(sign_bit_sin, *(v4sf*)_ps_sign_mask);
-
- /* scale by 4/Pi */
- y = _mm_mul_ps(xx, *(v4sf*)_ps_cephes_FOPI);
-
- /* store the integer part of y in mm2:mm3 */
- xmm3 = _mm_movehl_ps(xmm3, y);
- mm2 = _mm_cvttps_pi32(y);
- mm3 = _mm_cvttps_pi32(xmm3);
-
- /* j=(j+1) & (~1) (see the cephes sources) */
- mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
- mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
- mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
- mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
-
- y = _mm_cvtpi32x2_ps(mm2, mm3);
-
- mm4 = mm2;
- mm5 = mm3;
-
- /* get the swap sign flag for the sine */
- mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
- mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
- mm0 = _mm_slli_pi32(mm0, 29);
- mm1 = _mm_slli_pi32(mm1, 29);
- v4sf swap_sign_bit_sin;
- COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit_sin);
-
- /* get the polynom selection mask for the sine */
-
- mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
- mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
- mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
- mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
- v4sf poly_mask;
- COPY_MM_TO_XMM(mm2, mm3, poly_mask);
-
- /* The magic pass: "Extended precision modular arithmetic"
- x = ((x - y * DP1) - y * DP2) - y * DP3; */
- xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
- xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
- xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
- xmm1 = _mm_mul_ps(y, xmm1);
- xmm2 = _mm_mul_ps(y, xmm2);
- xmm3 = _mm_mul_ps(y, xmm3);
- xx = _mm_add_ps(xx, xmm1);
- xx = _mm_add_ps(xx, xmm2);
- xx = _mm_add_ps(xx, xmm3);
-
- /* get the sign flag for the cosine */
- mm4 = _mm_sub_pi32(mm4, *(v2si*)_pi32_2);
- mm5 = _mm_sub_pi32(mm5, *(v2si*)_pi32_2);
- mm4 = _mm_andnot_si64(mm4, *(v2si*)_pi32_4);
- mm5 = _mm_andnot_si64(mm5, *(v2si*)_pi32_4);
- mm4 = _mm_slli_pi32(mm4, 29);
- mm5 = _mm_slli_pi32(mm5, 29);
- v4sf sign_bit_cos;
- COPY_MM_TO_XMM(mm4, mm5, sign_bit_cos);
- _mm_empty(); /* good-bye mmx */
-
- sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);
-
-
- /* Evaluate the first polynom (0 <= x <= Pi/4) */
- v4sf z = _mm_mul_ps(xx,xx);
- y = *(v4sf*)_ps_coscof_p0;
-
- y = _mm_mul_ps(y, z);
- y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
- y = _mm_mul_ps(y, z);
- y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
- y = _mm_mul_ps(y, z);
- y = _mm_mul_ps(y, z);
- v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
- y = _mm_sub_ps(y, tmp);
- y = _mm_add_ps(y, *(v4sf*)_ps_1);
-
- /* Evaluate the second polynom (Pi/4 <= x <= 0) */
-
- v4sf y2 = *(v4sf*)_ps_sincof_p0;
- y2 = _mm_mul_ps(y2, z);
- y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
- y2 = _mm_mul_ps(y2, z);
- y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
- y2 = _mm_mul_ps(y2, z);
- y2 = _mm_mul_ps(y2, xx);
- y2 = _mm_add_ps(y2, xx);
-
- /* select the correct result from the two polynoms */
- xmm3 = poly_mask;
- v4sf ysin2 = _mm_and_ps(xmm3, y2);
- v4sf ysin1 = _mm_andnot_ps(xmm3, y);
- y2 = _mm_sub_ps(y2,ysin2);
- y = _mm_sub_ps(y, ysin1);
-
- xmm1 = _mm_add_ps(ysin1,ysin2);
- xmm2 = _mm_add_ps(y,y2);
-
- /* update the sign */
- _mm_store_ss( s, _mm_xor_ps(xmm1, sign_bit_sin) );
- _mm_store_ss( c, _mm_xor_ps(xmm2, sign_bit_cos) );
-
-#else
- #error "Not Implemented"
-#endif
-}
-
-float _SSE_cos( float x )
-{
-#ifdef _WIN32
- float temp;
- __asm
- {
- movss xmm0, x
- movss xmm1, _ps_am_inv_sign_mask
- andps xmm0, xmm1
- addss xmm0, _ps_am_pi_o_2
- mulss xmm0, _ps_am_2_o_pi
-
- cvttss2si ecx, xmm0
- movss xmm5, _ps_am_1
- mov edx, ecx
- shl edx, (31 - 1)
- cvtsi2ss xmm1, ecx
- and edx, 0x80000000
- and ecx, 0x1
-
- subss xmm0, xmm1
- movss xmm6, _sincos_masks[ecx * 4]
- minss xmm0, xmm5
-
- movss xmm1, _ps_sincos_p3
- subss xmm5, xmm0
-
- andps xmm5, xmm6
- movss xmm7, _ps_sincos_p2
- andnps xmm6, xmm0
- mov temp, edx
- orps xmm5, xmm6
- movss xmm0, xmm5
-
- mulss xmm5, xmm5
- movss xmm4, _ps_sincos_p1
- movss xmm2, xmm5
- mulss xmm5, xmm1
- movss xmm1, _ps_sincos_p0
- addss xmm5, xmm7
- mulss xmm5, xmm2
- movss xmm3, temp
- addss xmm5, xmm4
- mulss xmm5, xmm2
- orps xmm0, xmm3
- addss xmm5, xmm1
- mulss xmm0, xmm5
-
- movss x, xmm0
-
- }
-#elif POSIX
-
- Assert( "Needs testing, verify impl!\n" );
-
- v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, y;
- v2si mm0, mm1, mm2, mm3;
- /* take the absolute value */
- v4sf xx = _mm_load_ss( &x );
-
- xx = _mm_and_ps(xx, *(v4sf*)_ps_inv_sign_mask);
-
- /* scale by 4/Pi */
- y = _mm_mul_ps(xx, *(v4sf*)_ps_cephes_FOPI);
-
- /* store the integer part of y in mm0:mm1 */
- xmm2 = _mm_movehl_ps(xmm2, y);
- mm2 = _mm_cvttps_pi32(y);
- mm3 = _mm_cvttps_pi32(xmm2);
-
- /* j=(j+1) & (~1) (see the cephes sources) */
- mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
- mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
- mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
- mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
-
- y = _mm_cvtpi32x2_ps(mm2, mm3);
-
-
- mm2 = _mm_sub_pi32(mm2, *(v2si*)_pi32_2);
- mm3 = _mm_sub_pi32(mm3, *(v2si*)_pi32_2);
-
- /* get the swap sign flag in mm0:mm1 and the
- polynom selection mask in mm2:mm3 */
-
- mm0 = _mm_andnot_si64(mm2, *(v2si*)_pi32_4);
- mm1 = _mm_andnot_si64(mm3, *(v2si*)_pi32_4);
- mm0 = _mm_slli_pi32(mm0, 29);
- mm1 = _mm_slli_pi32(mm1, 29);
-
- mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
- mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
-
- mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
- mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
-
- v4sf sign_bit, poly_mask;
- COPY_MM_TO_XMM(mm0, mm1, sign_bit);
- COPY_MM_TO_XMM(mm2, mm3, poly_mask);
- _mm_empty(); /* good-bye mmx */
-
- /* The magic pass: "Extended precision modular arithmetic"
- x = ((x - y * DP1) - y * DP2) - y * DP3; */
- xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
- xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
- xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
- xmm1 = _mm_mul_ps(y, xmm1);
- xmm2 = _mm_mul_ps(y, xmm2);
- xmm3 = _mm_mul_ps(y, xmm3);
- xx = _mm_add_ps(xx, xmm1);
- xx = _mm_add_ps(xx, xmm2);
- xx = _mm_add_ps(xx, xmm3);
-
- /* Evaluate the first polynom (0 <= x <= Pi/4) */
- y = *(v4sf*)_ps_coscof_p0;
- v4sf z = _mm_mul_ps(xx,xx);
-
- y = _mm_mul_ps(y, z);
- y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
- y = _mm_mul_ps(y, z);
- y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
- y = _mm_mul_ps(y, z);
- y = _mm_mul_ps(y, z);
- v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
- y = _mm_sub_ps(y, tmp);
- y = _mm_add_ps(y, *(v4sf*)_ps_1);
-
- /* Evaluate the second polynom (Pi/4 <= x <= 0) */
-
- v4sf y2 = *(v4sf*)_ps_sincof_p0;
- y2 = _mm_mul_ps(y2, z);
- y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
- y2 = _mm_mul_ps(y2, z);
- y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
- y2 = _mm_mul_ps(y2, z);
- y2 = _mm_mul_ps(y2, xx);
- y2 = _mm_add_ps(y2, xx);
-
- /* select the correct result from the two polynoms */
- xmm3 = poly_mask;
- y2 = _mm_and_ps(xmm3, y2); //, xmm3);
- y = _mm_andnot_ps(xmm3, y);
- y = _mm_add_ps(y,y2);
- /* update the sign */
-
- _mm_store_ss( &x, _mm_xor_ps(y, sign_bit) );
-
-#else
- #error "Not Implemented"
-#endif
-
- return x;
-}
-
-//-----------------------------------------------------------------------------
-// SSE2 implementations of optimized routines:
-//-----------------------------------------------------------------------------
-void _SSE2_SinCos(float x, float* s, float* c) // any x
-{
-#ifdef _WIN32
- __asm
- {
- movss xmm0, x
- movaps xmm7, xmm0
- movss xmm1, _ps_am_inv_sign_mask
- movss xmm2, _ps_am_sign_mask
- movss xmm3, _ps_am_2_o_pi
- andps xmm0, xmm1
- andps xmm7, xmm2
- mulss xmm0, xmm3
-
- pxor xmm3, xmm3
- movd xmm5, _epi32_1
- movss xmm4, _ps_am_1
-
- cvttps2dq xmm2, xmm0
- pand xmm5, xmm2
- movd xmm1, _epi32_2
- pcmpeqd xmm5, xmm3
- movd xmm3, _epi32_1
- cvtdq2ps xmm6, xmm2
- paddd xmm3, xmm2
- pand xmm2, xmm1
- pand xmm3, xmm1
- subss xmm0, xmm6
- pslld xmm2, (31 - 1)
- minss xmm0, xmm4
-
- mov eax, s // mov eax, [esp + 4 + 16]
- mov edx, c // mov edx, [esp + 4 + 16 + 4]
-
- subss xmm4, xmm0
- pslld xmm3, (31 - 1)
-
- movaps xmm6, xmm4
- xorps xmm2, xmm7
- movaps xmm7, xmm5
- andps xmm6, xmm7
- andnps xmm7, xmm0
- andps xmm0, xmm5
- andnps xmm5, xmm4
- movss xmm4, _ps_sincos_p3
- orps xmm6, xmm7
- orps xmm0, xmm5
- movss xmm5, _ps_sincos_p2
-
- movaps xmm1, xmm0
- movaps xmm7, xmm6
- mulss xmm0, xmm0
- mulss xmm6, xmm6
- orps xmm1, xmm2
- orps xmm7, xmm3
- movaps xmm2, xmm0
- movaps xmm3, xmm6
- mulss xmm0, xmm4
- mulss xmm6, xmm4
- movss xmm4, _ps_sincos_p1
- addss xmm0, xmm5
- addss xmm6, xmm5
- movss xmm5, _ps_sincos_p0
- mulss xmm0, xmm2
- mulss xmm6, xmm3
- addss xmm0, xmm4
- addss xmm6, xmm4
- mulss xmm0, xmm2
- mulss xmm6, xmm3
- addss xmm0, xmm5
- addss xmm6, xmm5
- mulss xmm0, xmm1
- mulss xmm6, xmm7
-
- // use full stores since caller might reload with full loads
- movss [eax], xmm0
- movss [edx], xmm6
- }
-#elif POSIX
- #warning "_SSE2_SinCos NOT implemented!"
- Assert( 0 );
-#else
- #error "Not Implemented"
-#endif
-}
-
-float _SSE2_cos(float x)
-{
-#ifdef _WIN32
- __asm
- {
- movss xmm0, x
- movss xmm1, _ps_am_inv_sign_mask
- movss xmm2, _ps_am_pi_o_2
- movss xmm3, _ps_am_2_o_pi
- andps xmm0, xmm1
- addss xmm0, xmm2
- mulss xmm0, xmm3
-
- pxor xmm3, xmm3
- movd xmm5, _epi32_1
- movss xmm4, _ps_am_1
- cvttps2dq xmm2, xmm0
- pand xmm5, xmm2
- movd xmm1, _epi32_2
- pcmpeqd xmm5, xmm3
- cvtdq2ps xmm6, xmm2
- pand xmm2, xmm1
- pslld xmm2, (31 - 1)
-
- subss xmm0, xmm6
- movss xmm3, _ps_sincos_p3
- minss xmm0, xmm4
- subss xmm4, xmm0
- andps xmm0, xmm5
- andnps xmm5, xmm4
- orps xmm0, xmm5
-
- movaps xmm1, xmm0
- movss xmm4, _ps_sincos_p2
- mulss xmm0, xmm0
- movss xmm5, _ps_sincos_p1
- orps xmm1, xmm2
- movaps xmm7, xmm0
- mulss xmm0, xmm3
- movss xmm6, _ps_sincos_p0
- addss xmm0, xmm4
- mulss xmm0, xmm7
- addss xmm0, xmm5
- mulss xmm0, xmm7
- addss xmm0, xmm6
- mulss xmm0, xmm1
- movss x, xmm0
- }
-#elif POSIX
- #warning "_SSE2_cos NOT implemented!"
- Assert( 0 );
-#else
- #error "Not Implemented"
-#endif
-
- return x;
-}
-
-// SSE Version of VectorTransform
-void VectorTransformSSE(const float *in1, const matrix3x4_t& in2, float *out1)
-{
- Assert( s_bMathlibInitialized );
- Assert( in1 != out1 );
-
-#ifdef _WIN32
- __asm
- {
- mov eax, in1;
- mov ecx, in2;
- mov edx, out1;
-
- movss xmm0, [eax];
- mulss xmm0, [ecx];
- movss xmm1, [eax+4];
- mulss xmm1, [ecx+4];
- movss xmm2, [eax+8];
- mulss xmm2, [ecx+8];
- addss xmm0, xmm1;
- addss xmm0, xmm2;
- addss xmm0, [ecx+12]
- movss [edx], xmm0;
- add ecx, 16;
-
- movss xmm0, [eax];
- mulss xmm0, [ecx];
- movss xmm1, [eax+4];
- mulss xmm1, [ecx+4];
- movss xmm2, [eax+8];
- mulss xmm2, [ecx+8];
- addss xmm0, xmm1;
- addss xmm0, xmm2;
- addss xmm0, [ecx+12]
- movss [edx+4], xmm0;
- add ecx, 16;
-
- movss xmm0, [eax];
- mulss xmm0, [ecx];
- movss xmm1, [eax+4];
- mulss xmm1, [ecx+4];
- movss xmm2, [eax+8];
- mulss xmm2, [ecx+8];
- addss xmm0, xmm1;
- addss xmm0, xmm2;
- addss xmm0, [ecx+12]
- movss [edx+8], xmm0;
- }
-#elif POSIX
- #warning "VectorTransformSSE C implementation only"
- out1[0] = DotProduct(in1, in2[0]) + in2[0][3];
- out1[1] = DotProduct(in1, in2[1]) + in2[1][3];
- out1[2] = DotProduct(in1, in2[2]) + in2[2][3];
-#else
- #error "Not Implemented"
-#endif
-}
-
-void VectorRotateSSE( const float *in1, const matrix3x4_t& in2, float *out1 )
-{
- Assert( s_bMathlibInitialized );
- Assert( in1 != out1 );
-
-#ifdef _WIN32
- __asm
- {
- mov eax, in1;
- mov ecx, in2;
- mov edx, out1;
-
- movss xmm0, [eax];
- mulss xmm0, [ecx];
- movss xmm1, [eax+4];
- mulss xmm1, [ecx+4];
- movss xmm2, [eax+8];
- mulss xmm2, [ecx+8];
- addss xmm0, xmm1;
- addss xmm0, xmm2;
- movss [edx], xmm0;
- add ecx, 16;
-
- movss xmm0, [eax];
- mulss xmm0, [ecx];
- movss xmm1, [eax+4];
- mulss xmm1, [ecx+4];
- movss xmm2, [eax+8];
- mulss xmm2, [ecx+8];
- addss xmm0, xmm1;
- addss xmm0, xmm2;
- movss [edx+4], xmm0;
- add ecx, 16;
-
- movss xmm0, [eax];
- mulss xmm0, [ecx];
- movss xmm1, [eax+4];
- mulss xmm1, [ecx+4];
- movss xmm2, [eax+8];
- mulss xmm2, [ecx+8];
- addss xmm0, xmm1;
- addss xmm0, xmm2;
- movss [edx+8], xmm0;
- }
-#elif POSIX
- #warning "VectorRotateSSE C implementation only"
- out1[0] = DotProduct( in1, in2[0] );
- out1[1] = DotProduct( in1, in2[1] );
- out1[2] = DotProduct( in1, in2[2] );
-#else
- #error "Not Implemented"
-#endif
-}
-
-#ifdef _WIN32
-void _declspec(naked) _SSE_VectorMA( const float *start, float scale, const float *direction, float *dest )
-{
- // FIXME: This don't work!! It will overwrite memory in the write to dest
- Assert(0);
-
- Assert( s_bMathlibInitialized );
- _asm { // Intel SSE only routine
- mov eax, DWORD PTR [esp+0x04] ; *start, s0..s2
- mov ecx, DWORD PTR [esp+0x0c] ; *direction, d0..d2
- mov edx, DWORD PTR [esp+0x10] ; *dest
- movss xmm2, [esp+0x08] ; x2 = scale, 0, 0, 0
-#ifdef ALIGNED_VECTOR
- movaps xmm3, [ecx] ; x3 = dir0,dir1,dir2,X
- pshufd xmm2, xmm2, 0 ; x2 = scale, scale, scale, scale
- movaps xmm1, [eax] ; x1 = start1, start2, start3, X
- mulps xmm3, xmm2 ; x3 *= x2
- addps xmm3, xmm1 ; x3 += x1
- movaps [edx], xmm3 ; *dest = x3
-#else
- movups xmm3, [ecx] ; x3 = dir0,dir1,dir2,X
- pshufd xmm2, xmm2, 0 ; x2 = scale, scale, scale, scale
- movups xmm1, [eax] ; x1 = start1, start2, start3, X
- mulps xmm3, xmm2 ; x3 *= x2
- addps xmm3, xmm1 ; x3 += x1
- movups [edx], xmm3 ; *dest = x3
-#endif
- }
-}
-#endif
-
-#ifdef _WIN32
-#ifdef PFN_VECTORMA
-void _declspec(naked) __cdecl _SSE_VectorMA( const Vector &start, float scale, const Vector &direction, Vector &dest )
-{
- // FIXME: This don't work!! It will overwrite memory in the write to dest
- Assert(0);
-
- Assert( s_bMathlibInitialized );
- _asm
- {
- // Intel SSE only routine
- mov eax, DWORD PTR [esp+0x04] ; *start, s0..s2
- mov ecx, DWORD PTR [esp+0x0c] ; *direction, d0..d2
- mov edx, DWORD PTR [esp+0x10] ; *dest
- movss xmm2, [esp+0x08] ; x2 = scale, 0, 0, 0
-#ifdef ALIGNED_VECTOR
- movaps xmm3, [ecx] ; x3 = dir0,dir1,dir2,X
- pshufd xmm2, xmm2, 0 ; x2 = scale, scale, scale, scale
- movaps xmm1, [eax] ; x1 = start1, start2, start3, X
- mulps xmm3, xmm2 ; x3 *= x2
- addps xmm3, xmm1 ; x3 += x1
- movaps [edx], xmm3 ; *dest = x3
-#else
- movups xmm3, [ecx] ; x3 = dir0,dir1,dir2,X
- pshufd xmm2, xmm2, 0 ; x2 = scale, scale, scale, scale
- movups xmm1, [eax] ; x1 = start1, start2, start3, X
- mulps xmm3, xmm2 ; x3 *= x2
- addps xmm3, xmm1 ; x3 += x1
- movups [edx], xmm3 ; *dest = x3
-#endif
- }
-}
-float (__cdecl *pfVectorMA)(Vector& v) = _VectorMA;
-#endif
-#endif
-
-
-// SSE DotProduct -- it's a smidgen faster than the asm DotProduct...
-// Should be validated too! :)
-// NJS: (Nov 1 2002) -NOT- faster. may time a couple cycles faster in a single function like
-// this, but when inlined, and instruction scheduled, the C version is faster.
-// Verified this via VTune
-/*
-vec_t DotProduct (const vec_t *a, const vec_t *c)
-{
- vec_t temp;
-
- __asm
- {
- mov eax, a;
- mov ecx, c;
- mov edx, DWORD PTR [temp]
- movss xmm0, [eax];
- mulss xmm0, [ecx];
- movss xmm1, [eax+4];
- mulss xmm1, [ecx+4];
- movss xmm2, [eax+8];
- mulss xmm2, [ecx+8];
- addss xmm0, xmm1;
- addss xmm0, xmm2;
- movss [edx], xmm0;
- fld DWORD PTR [edx];
- ret
- }
-}
-*/
-
-#endif // COMPILER_MSVC64
+//========= Copyright Valve Corporation, All rights reserved. ============//
+//
+// Purpose: SSE Math primitives.
+//
+//=====================================================================================//
+
+#include <math.h>
+#include <float.h> // Needed for FLT_EPSILON
+#include "basetypes.h"
+#include <memory.h>
+#include "tier0/dbg.h"
+#include "mathlib/mathlib.h"
+#include "mathlib/vector.h"
+#include "sse.h"
+
+// memdbgon must be the last include file in a .cpp file!!!
+#include "tier0/memdbgon.h"
+
+#ifndef COMPILER_MSVC64
+// Implement for 64-bit Windows if needed.
+
+static const uint32 _sincos_masks[] = { (uint32)0x0, (uint32)~0x0 };
+static const uint32 _sincos_inv_masks[] = { (uint32)~0x0, (uint32)0x0 };
+
+//-----------------------------------------------------------------------------
+// Macros and constants required by some of the SSE assembly:
+//-----------------------------------------------------------------------------
+
+#ifdef _WIN32
+ #define _PS_EXTERN_CONST(Name, Val) \
+ const __declspec(align(16)) float _ps_##Name[4] = { Val, Val, Val, Val }
+
+ #define _PS_EXTERN_CONST_TYPE(Name, Type, Val) \
+ const __declspec(align(16)) Type _ps_##Name[4] = { Val, Val, Val, Val }; \
+
+ #define _EPI32_CONST(Name, Val) \
+ static const __declspec(align(16)) __int32 _epi32_##Name[4] = { Val, Val, Val, Val }
+
+ #define _PS_CONST(Name, Val) \
+ static const __declspec(align(16)) float _ps_##Name[4] = { Val, Val, Val, Val }
+#elif POSIX
+ #define _PS_EXTERN_CONST(Name, Val) \
+ const float _ps_##Name[4] __attribute__((aligned(16))) = { Val, Val, Val, Val }
+
+ #define _PS_EXTERN_CONST_TYPE(Name, Type, Val) \
+ const Type _ps_##Name[4] __attribute__((aligned(16))) = { Val, Val, Val, Val }; \
+
+ #define _EPI32_CONST(Name, Val) \
+ static const int32 _epi32_##Name[4] __attribute__((aligned(16))) = { Val, Val, Val, Val }
+
+ #define _PS_CONST(Name, Val) \
+ static const float _ps_##Name[4] __attribute__((aligned(16))) = { Val, Val, Val, Val }
+#endif
+
+_PS_EXTERN_CONST(am_0, 0.0f);
+_PS_EXTERN_CONST(am_1, 1.0f);
+_PS_EXTERN_CONST(am_m1, -1.0f);
+_PS_EXTERN_CONST(am_0p5, 0.5f);
+_PS_EXTERN_CONST(am_1p5, 1.5f);
+_PS_EXTERN_CONST(am_pi, (float)M_PI);
+_PS_EXTERN_CONST(am_pi_o_2, (float)(M_PI / 2.0));
+_PS_EXTERN_CONST(am_2_o_pi, (float)(2.0 / M_PI));
+_PS_EXTERN_CONST(am_pi_o_4, (float)(M_PI / 4.0));
+_PS_EXTERN_CONST(am_4_o_pi, (float)(4.0 / M_PI));
+_PS_EXTERN_CONST_TYPE(am_sign_mask, int32, 0x80000000);
+_PS_EXTERN_CONST_TYPE(am_inv_sign_mask, int32, ~0x80000000);
+_PS_EXTERN_CONST_TYPE(am_min_norm_pos,int32, 0x00800000);
+_PS_EXTERN_CONST_TYPE(am_mant_mask, int32, 0x7f800000);
+_PS_EXTERN_CONST_TYPE(am_inv_mant_mask, int32, ~0x7f800000);
+
+_EPI32_CONST(1, 1);
+_EPI32_CONST(2, 2);
+
+_PS_CONST(sincos_p0, 0.15707963267948963959e1f);
+_PS_CONST(sincos_p1, -0.64596409750621907082e0f);
+_PS_CONST(sincos_p2, 0.7969262624561800806e-1f);
+_PS_CONST(sincos_p3, -0.468175413106023168e-2f);
+
+#ifdef PFN_VECTORMA
+void __cdecl _SSE_VectorMA( const float *start, float scale, const float *direction, float *dest );
+#endif
+
+//-----------------------------------------------------------------------------
+// SSE implementations of optimized routines:
+//-----------------------------------------------------------------------------
+float _SSE_Sqrt(float x)
+{
+ Assert( s_bMathlibInitialized );
+ float root = 0.f;
+#ifdef _WIN32
+ _asm
+ {
+ sqrtss xmm0, x
+ movss root, xmm0
+ }
+#elif POSIX
+ _mm_store_ss( &root, _mm_sqrt_ss( _mm_load_ss( &x ) ) );
+#endif
+ return root;
+}
+
+// Single iteration NewtonRaphson reciprocal square root:
+// 0.5 * rsqrtps * (3 - x * rsqrtps(x) * rsqrtps(x))
+// Very low error, and fine to use in place of 1.f / sqrtf(x).
+#if 0
+float _SSE_RSqrtAccurate(float x)
+{
+ Assert( s_bMathlibInitialized );
+
+ float rroot;
+ _asm
+ {
+ rsqrtss xmm0, x
+ movss rroot, xmm0
+ }
+
+ return (0.5f * rroot) * (3.f - (x * rroot) * rroot);
+}
+#else
+
+#ifdef POSIX
+const __m128 f3 = _mm_set_ss(3.0f); // 3 as SSE value
+const __m128 f05 = _mm_set_ss(0.5f); // 0.5 as SSE value
+#endif
+
+// Intel / Kipps SSE RSqrt. Significantly faster than above.
+float _SSE_RSqrtAccurate(float a)
+{
+
+#ifdef _WIN32
+ float x;
+ float half = 0.5f;
+ float three = 3.f;
+
+ __asm
+ {
+ movss xmm3, a;
+ movss xmm1, half;
+ movss xmm2, three;
+ rsqrtss xmm0, xmm3;
+
+ mulss xmm3, xmm0;
+ mulss xmm1, xmm0;
+ mulss xmm3, xmm0;
+ subss xmm2, xmm3;
+ mulss xmm1, xmm2;
+
+ movss x, xmm1;
+ }
+
+ return x;
+#elif POSIX
+ __m128 xx = _mm_load_ss( &a );
+ __m128 xr = _mm_rsqrt_ss( xx );
+ __m128 xt;
+
+ xt = _mm_mul_ss( xr, xr );
+ xt = _mm_mul_ss( xt, xx );
+ xt = _mm_sub_ss( f3, xt );
+ xt = _mm_mul_ss( xt, f05 );
+ xr = _mm_mul_ss( xr, xt );
+
+ _mm_store_ss( &a, xr );
+ return a;
+#else
+ #error "Not Implemented"
+#endif
+
+}
+#endif
+
+// Simple SSE rsqrt. Usually accurate to around 6 (relative) decimal places
+// or so, so ok for closed transforms. (ie, computing lighting normals)
+float _SSE_RSqrtFast(float x)
+{
+ Assert( s_bMathlibInitialized );
+
+ float rroot;
+#ifdef _WIN32
+ _asm
+ {
+ rsqrtss xmm0, x
+ movss rroot, xmm0
+ }
+#elif POSIX
+ __asm__ __volatile__( "rsqrtss %0, %1" : "=x" (rroot) : "x" (x) );
+#else
+#error
+#endif
+
+ return rroot;
+}
+
+float FASTCALL _SSE_VectorNormalize (Vector& vec)
+{
+ Assert( s_bMathlibInitialized );
+
+ // NOTE: This is necessary to prevent an memory overwrite...
+ // sice vec only has 3 floats, we can't "movaps" directly into it.
+#ifdef _WIN32
+ __declspec(align(16)) float result[4];
+#elif POSIX
+ float result[4] __attribute__((aligned(16)));
+#endif
+
+ float *v = &vec[0];
+ float *r = &result[0];
+
+ float radius = 0.f;
+ // Blah, get rid of these comparisons ... in reality, if you have all 3 as zero, it shouldn't
+ // be much of a performance win, considering you will very likely miss 3 branch predicts in a row.
+ if ( v[0] || v[1] || v[2] )
+ {
+#ifdef _WIN32
+ _asm
+ {
+ mov eax, v
+ mov edx, r
+#ifdef ALIGNED_VECTOR
+ movaps xmm4, [eax] // r4 = vx, vy, vz, X
+ movaps xmm1, xmm4 // r1 = r4
+#else
+ movups xmm4, [eax] // r4 = vx, vy, vz, X
+ movaps xmm1, xmm4 // r1 = r4
+#endif
+ mulps xmm1, xmm4 // r1 = vx * vx, vy * vy, vz * vz, X
+ movhlps xmm3, xmm1 // r3 = vz * vz, X, X, X
+ movaps xmm2, xmm1 // r2 = r1
+ shufps xmm2, xmm2, 1 // r2 = vy * vy, X, X, X
+ addss xmm1, xmm2 // r1 = (vx * vx) + (vy * vy), X, X, X
+ addss xmm1, xmm3 // r1 = (vx * vx) + (vy * vy) + (vz * vz), X, X, X
+ sqrtss xmm1, xmm1 // r1 = sqrt((vx * vx) + (vy * vy) + (vz * vz)), X, X, X
+ movss radius, xmm1 // radius = sqrt((vx * vx) + (vy * vy) + (vz * vz))
+ rcpss xmm1, xmm1 // r1 = 1/radius, X, X, X
+ shufps xmm1, xmm1, 0 // r1 = 1/radius, 1/radius, 1/radius, X
+ mulps xmm4, xmm1 // r4 = vx * 1/radius, vy * 1/radius, vz * 1/radius, X
+ movaps [edx], xmm4 // v = vx * 1/radius, vy * 1/radius, vz * 1/radius, X
+ }
+#elif POSIX
+ __asm__ __volatile__(
+#ifdef ALIGNED_VECTOR
+ "movaps %2, %%xmm4 \n\t"
+ "movaps %%xmm4, %%xmm1 \n\t"
+#else
+ "movups %2, %%xmm4 \n\t"
+ "movaps %%xmm4, %%xmm1 \n\t"
+#endif
+ "mulps %%xmm4, %%xmm1 \n\t"
+ "movhlps %%xmm1, %%xmm3 \n\t"
+ "movaps %%xmm1, %%xmm2 \n\t"
+ "shufps $1, %%xmm2, %%xmm2 \n\t"
+ "addss %%xmm2, %%xmm1 \n\t"
+ "addss %%xmm3, %%xmm1 \n\t"
+ "sqrtss %%xmm1, %%xmm1 \n\t"
+ "movss %%xmm1, %0 \n\t"
+ "rcpss %%xmm1, %%xmm1 \n\t"
+ "shufps $0, %%xmm1, %%xmm1 \n\t"
+ "mulps %%xmm1, %%xmm4 \n\t"
+ "movaps %%xmm4, %1 \n\t"
+ : "=m" (radius), "=m" (result)
+ : "m" (*v)
+ );
+#else
+ #error "Not Implemented"
+#endif
+ vec.x = result[0];
+ vec.y = result[1];
+ vec.z = result[2];
+
+ }
+
+ return radius;
+}
+
+void FASTCALL _SSE_VectorNormalizeFast (Vector& vec)
+{
+ float ool = _SSE_RSqrtAccurate( FLT_EPSILON + vec.x * vec.x + vec.y * vec.y + vec.z * vec.z );
+
+ vec.x *= ool;
+ vec.y *= ool;
+ vec.z *= ool;
+}
+
+float _SSE_InvRSquared(const float* v)
+{
+ float inv_r2 = 1.f;
+#ifdef _WIN32
+ _asm { // Intel SSE only routine
+ mov eax, v
+ movss xmm5, inv_r2 // x5 = 1.0, 0, 0, 0
+#ifdef ALIGNED_VECTOR
+ movaps xmm4, [eax] // x4 = vx, vy, vz, X
+#else
+ movups xmm4, [eax] // x4 = vx, vy, vz, X
+#endif
+ movaps xmm1, xmm4 // x1 = x4
+ mulps xmm1, xmm4 // x1 = vx * vx, vy * vy, vz * vz, X
+ movhlps xmm3, xmm1 // x3 = vz * vz, X, X, X
+ movaps xmm2, xmm1 // x2 = x1
+ shufps xmm2, xmm2, 1 // x2 = vy * vy, X, X, X
+ addss xmm1, xmm2 // x1 = (vx * vx) + (vy * vy), X, X, X
+ addss xmm1, xmm3 // x1 = (vx * vx) + (vy * vy) + (vz * vz), X, X, X
+ maxss xmm1, xmm5 // x1 = max( 1.0, x1 )
+ rcpss xmm0, xmm1 // x0 = 1 / max( 1.0, x1 )
+ movss inv_r2, xmm0 // inv_r2 = x0
+ }
+#elif POSIX
+ __asm__ __volatile__(
+ "movss %0, %%xmm5 \n\t"
+#ifdef ALIGNED_VECTOR
+ "movaps %1, %%xmm4 \n\t"
+#else
+ "movups %1, %%xmm4 \n\t"
+#endif
+ "movaps %%xmm4, %%xmm1 \n\t"
+ "mulps %%xmm4, %%xmm1 \n\t"
+ "movhlps %%xmm1, %%xmm3 \n\t"
+ "movaps %%xmm1, %%xmm2 \n\t"
+ "shufps $1, %%xmm2, %%xmm2 \n\t"
+ "addss %%xmm2, %%xmm1 \n\t"
+ "addss %%xmm3, %%xmm1 \n\t"
+ "maxss %%xmm5, %%xmm1 \n\t"
+ "rcpss %%xmm1, %%xmm0 \n\t"
+ "movss %%xmm0, %0 \n\t"
+ : "=m" (inv_r2)
+ : "m" (*v), "0" (inv_r2)
+ );
+#else
+ #error "Not Implemented"
+#endif
+
+ return inv_r2;
+}
+
+
+#ifdef POSIX
+// #define _PS_CONST(Name, Val) static const ALIGN16 float _ps_##Name[4] ALIGN16_POST = { Val, Val, Val, Val }
+#define _PS_CONST_TYPE(Name, Type, Val) static const ALIGN16 Type _ps_##Name[4] ALIGN16_POST = { Val, Val, Val, Val }
+
+_PS_CONST_TYPE(sign_mask, int, 0x80000000);
+_PS_CONST_TYPE(inv_sign_mask, int, ~0x80000000);
+
+
+#define _PI32_CONST(Name, Val) static const ALIGN16 int _pi32_##Name[4] ALIGN16_POST = { Val, Val, Val, Val }
+
+_PI32_CONST(1, 1);
+_PI32_CONST(inv1, ~1);
+_PI32_CONST(2, 2);
+_PI32_CONST(4, 4);
+_PI32_CONST(0x7f, 0x7f);
+_PS_CONST(1 , 1.0f);
+_PS_CONST(0p5, 0.5f);
+
+_PS_CONST(minus_cephes_DP1, -0.78515625);
+_PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
+_PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
+_PS_CONST(sincof_p0, -1.9515295891E-4);
+_PS_CONST(sincof_p1, 8.3321608736E-3);
+_PS_CONST(sincof_p2, -1.6666654611E-1);
+_PS_CONST(coscof_p0, 2.443315711809948E-005);
+_PS_CONST(coscof_p1, -1.388731625493765E-003);
+_PS_CONST(coscof_p2, 4.166664568298827E-002);
+_PS_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI
+
+typedef union xmm_mm_union {
+ __m128 xmm;
+ __m64 mm[2];
+} xmm_mm_union;
+
+#define COPY_MM_TO_XMM(mm0_, mm1_, xmm_) { xmm_mm_union u; u.mm[0]=mm0_; u.mm[1]=mm1_; xmm_ = u.xmm; }
+
+typedef __m128 v4sf; // vector of 4 float (sse1)
+typedef __m64 v2si; // vector of 2 int (mmx)
+
+#endif
+
+void _SSE_SinCos(float x, float* s, float* c)
+{
+#ifdef _WIN32
+ float t4, t8, t12;
+
+ __asm
+ {
+ movss xmm0, x
+ movss t12, xmm0
+ movss xmm1, _ps_am_inv_sign_mask
+ mov eax, t12
+ mulss xmm0, _ps_am_2_o_pi
+ andps xmm0, xmm1
+ and eax, 0x80000000
+
+ cvttss2si edx, xmm0
+ mov ecx, edx
+ mov t12, esi
+ mov esi, edx
+ add edx, 0x1
+ shl ecx, (31 - 1)
+ shl edx, (31 - 1)
+
+ movss xmm4, _ps_am_1
+ cvtsi2ss xmm3, esi
+ mov t8, eax
+ and esi, 0x1
+
+ subss xmm0, xmm3
+ movss xmm3, _sincos_inv_masks[esi * 4]
+ minss xmm0, xmm4
+
+ subss xmm4, xmm0
+
+ movss xmm6, xmm4
+ andps xmm4, xmm3
+ and ecx, 0x80000000
+ movss xmm2, xmm3
+ andnps xmm3, xmm0
+ and edx, 0x80000000
+ movss xmm7, t8
+ andps xmm0, xmm2
+ mov t8, ecx
+ mov t4, edx
+ orps xmm4, xmm3
+
+ mov eax, s //mov eax, [esp + 4 + 16]
+ mov edx, c //mov edx, [esp + 4 + 16 + 4]
+
+ andnps xmm2, xmm6
+ orps xmm0, xmm2
+
+ movss xmm2, t8
+ movss xmm1, xmm0
+ movss xmm5, xmm4
+ xorps xmm7, xmm2
+ movss xmm3, _ps_sincos_p3
+ mulss xmm0, xmm0
+ mulss xmm4, xmm4
+ movss xmm2, xmm0
+ movss xmm6, xmm4
+ orps xmm1, xmm7
+ movss xmm7, _ps_sincos_p2
+ mulss xmm0, xmm3
+ mulss xmm4, xmm3
+ movss xmm3, _ps_sincos_p1
+ addss xmm0, xmm7
+ addss xmm4, xmm7
+ movss xmm7, _ps_sincos_p0
+ mulss xmm0, xmm2
+ mulss xmm4, xmm6
+ addss xmm0, xmm3
+ addss xmm4, xmm3
+ movss xmm3, t4
+ mulss xmm0, xmm2
+ mulss xmm4, xmm6
+ orps xmm5, xmm3
+ mov esi, t12
+ addss xmm0, xmm7
+ addss xmm4, xmm7
+ mulss xmm0, xmm1
+ mulss xmm4, xmm5
+
+ // use full stores since caller might reload with full loads
+ movss [eax], xmm0
+ movss [edx], xmm4
+ }
+#elif POSIX
+
+ Assert( "Needs testing, verify impl!\n" );
+
+ v4sf xx = _mm_load_ss( &x );
+
+ v4sf xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
+ v2si mm0, mm1, mm2, mm3, mm4, mm5;
+ sign_bit_sin = xx;
+ /* take the absolute value */
+ xx = _mm_and_ps(xx, *(v4sf*)_ps_inv_sign_mask);
+ /* extract the sign bit (upper one) */
+ sign_bit_sin = _mm_and_ps(sign_bit_sin, *(v4sf*)_ps_sign_mask);
+
+ /* scale by 4/Pi */
+ y = _mm_mul_ps(xx, *(v4sf*)_ps_cephes_FOPI);
+
+ /* store the integer part of y in mm2:mm3 */
+ xmm3 = _mm_movehl_ps(xmm3, y);
+ mm2 = _mm_cvttps_pi32(y);
+ mm3 = _mm_cvttps_pi32(xmm3);
+
+ /* j=(j+1) & (~1) (see the cephes sources) */
+ mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
+ mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
+ mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
+ mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
+
+ y = _mm_cvtpi32x2_ps(mm2, mm3);
+
+ mm4 = mm2;
+ mm5 = mm3;
+
+ /* get the swap sign flag for the sine */
+ mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
+ mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
+ mm0 = _mm_slli_pi32(mm0, 29);
+ mm1 = _mm_slli_pi32(mm1, 29);
+ v4sf swap_sign_bit_sin;
+ COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit_sin);
+
+ /* get the polynom selection mask for the sine */
+
+ mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
+ mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
+ mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
+ mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
+ v4sf poly_mask;
+ COPY_MM_TO_XMM(mm2, mm3, poly_mask);
+
+ /* The magic pass: "Extended precision modular arithmetic"
+ x = ((x - y * DP1) - y * DP2) - y * DP3; */
+ xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
+ xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
+ xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
+ xmm1 = _mm_mul_ps(y, xmm1);
+ xmm2 = _mm_mul_ps(y, xmm2);
+ xmm3 = _mm_mul_ps(y, xmm3);
+ xx = _mm_add_ps(xx, xmm1);
+ xx = _mm_add_ps(xx, xmm2);
+ xx = _mm_add_ps(xx, xmm3);
+
+ /* get the sign flag for the cosine */
+ mm4 = _mm_sub_pi32(mm4, *(v2si*)_pi32_2);
+ mm5 = _mm_sub_pi32(mm5, *(v2si*)_pi32_2);
+ mm4 = _mm_andnot_si64(mm4, *(v2si*)_pi32_4);
+ mm5 = _mm_andnot_si64(mm5, *(v2si*)_pi32_4);
+ mm4 = _mm_slli_pi32(mm4, 29);
+ mm5 = _mm_slli_pi32(mm5, 29);
+ v4sf sign_bit_cos;
+ COPY_MM_TO_XMM(mm4, mm5, sign_bit_cos);
+ _mm_empty(); /* good-bye mmx */
+
+ sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);
+
+
+ /* Evaluate the first polynom (0 <= x <= Pi/4) */
+ v4sf z = _mm_mul_ps(xx,xx);
+ y = *(v4sf*)_ps_coscof_p0;
+
+ y = _mm_mul_ps(y, z);
+ y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
+ y = _mm_mul_ps(y, z);
+ y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
+ y = _mm_mul_ps(y, z);
+ y = _mm_mul_ps(y, z);
+ v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
+ y = _mm_sub_ps(y, tmp);
+ y = _mm_add_ps(y, *(v4sf*)_ps_1);
+
+ /* Evaluate the second polynom (Pi/4 <= x <= 0) */
+
+ v4sf y2 = *(v4sf*)_ps_sincof_p0;
+ y2 = _mm_mul_ps(y2, z);
+ y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
+ y2 = _mm_mul_ps(y2, z);
+ y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
+ y2 = _mm_mul_ps(y2, z);
+ y2 = _mm_mul_ps(y2, xx);
+ y2 = _mm_add_ps(y2, xx);
+
+ /* select the correct result from the two polynoms */
+ xmm3 = poly_mask;
+ v4sf ysin2 = _mm_and_ps(xmm3, y2);
+ v4sf ysin1 = _mm_andnot_ps(xmm3, y);
+ y2 = _mm_sub_ps(y2,ysin2);
+ y = _mm_sub_ps(y, ysin1);
+
+ xmm1 = _mm_add_ps(ysin1,ysin2);
+ xmm2 = _mm_add_ps(y,y2);
+
+ /* update the sign */
+ _mm_store_ss( s, _mm_xor_ps(xmm1, sign_bit_sin) );
+ _mm_store_ss( c, _mm_xor_ps(xmm2, sign_bit_cos) );
+
+#else
+ #error "Not Implemented"
+#endif
+}
+
+float _SSE_cos( float x )
+{
+#ifdef _WIN32
+ float temp;
+ __asm
+ {
+ movss xmm0, x
+ movss xmm1, _ps_am_inv_sign_mask
+ andps xmm0, xmm1
+ addss xmm0, _ps_am_pi_o_2
+ mulss xmm0, _ps_am_2_o_pi
+
+ cvttss2si ecx, xmm0
+ movss xmm5, _ps_am_1
+ mov edx, ecx
+ shl edx, (31 - 1)
+ cvtsi2ss xmm1, ecx
+ and edx, 0x80000000
+ and ecx, 0x1
+
+ subss xmm0, xmm1
+ movss xmm6, _sincos_masks[ecx * 4]
+ minss xmm0, xmm5
+
+ movss xmm1, _ps_sincos_p3
+ subss xmm5, xmm0
+
+ andps xmm5, xmm6
+ movss xmm7, _ps_sincos_p2
+ andnps xmm6, xmm0
+ mov temp, edx
+ orps xmm5, xmm6
+ movss xmm0, xmm5
+
+ mulss xmm5, xmm5
+ movss xmm4, _ps_sincos_p1
+ movss xmm2, xmm5
+ mulss xmm5, xmm1
+ movss xmm1, _ps_sincos_p0
+ addss xmm5, xmm7
+ mulss xmm5, xmm2
+ movss xmm3, temp
+ addss xmm5, xmm4
+ mulss xmm5, xmm2
+ orps xmm0, xmm3
+ addss xmm5, xmm1
+ mulss xmm0, xmm5
+
+ movss x, xmm0
+
+ }
+#elif POSIX
+
+ Assert( "Needs testing, verify impl!\n" );
+
+ v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, y;
+ v2si mm0, mm1, mm2, mm3;
+ /* take the absolute value */
+ v4sf xx = _mm_load_ss( &x );
+
+ xx = _mm_and_ps(xx, *(v4sf*)_ps_inv_sign_mask);
+
+ /* scale by 4/Pi */
+ y = _mm_mul_ps(xx, *(v4sf*)_ps_cephes_FOPI);
+
+ /* store the integer part of y in mm0:mm1 */
+ xmm2 = _mm_movehl_ps(xmm2, y);
+ mm2 = _mm_cvttps_pi32(y);
+ mm3 = _mm_cvttps_pi32(xmm2);
+
+ /* j=(j+1) & (~1) (see the cephes sources) */
+ mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
+ mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
+ mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
+ mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
+
+ y = _mm_cvtpi32x2_ps(mm2, mm3);
+
+
+ mm2 = _mm_sub_pi32(mm2, *(v2si*)_pi32_2);
+ mm3 = _mm_sub_pi32(mm3, *(v2si*)_pi32_2);
+
+ /* get the swap sign flag in mm0:mm1 and the
+ polynom selection mask in mm2:mm3 */
+
+ mm0 = _mm_andnot_si64(mm2, *(v2si*)_pi32_4);
+ mm1 = _mm_andnot_si64(mm3, *(v2si*)_pi32_4);
+ mm0 = _mm_slli_pi32(mm0, 29);
+ mm1 = _mm_slli_pi32(mm1, 29);
+
+ mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
+ mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
+
+ mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
+ mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
+
+ v4sf sign_bit, poly_mask;
+ COPY_MM_TO_XMM(mm0, mm1, sign_bit);
+ COPY_MM_TO_XMM(mm2, mm3, poly_mask);
+ _mm_empty(); /* good-bye mmx */
+
+ /* The magic pass: "Extended precision modular arithmetic"
+ x = ((x - y * DP1) - y * DP2) - y * DP3; */
+ xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
+ xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
+ xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
+ xmm1 = _mm_mul_ps(y, xmm1);
+ xmm2 = _mm_mul_ps(y, xmm2);
+ xmm3 = _mm_mul_ps(y, xmm3);
+ xx = _mm_add_ps(xx, xmm1);
+ xx = _mm_add_ps(xx, xmm2);
+ xx = _mm_add_ps(xx, xmm3);
+
+ /* Evaluate the first polynom (0 <= x <= Pi/4) */
+ y = *(v4sf*)_ps_coscof_p0;
+ v4sf z = _mm_mul_ps(xx,xx);
+
+ y = _mm_mul_ps(y, z);
+ y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
+ y = _mm_mul_ps(y, z);
+ y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
+ y = _mm_mul_ps(y, z);
+ y = _mm_mul_ps(y, z);
+ v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
+ y = _mm_sub_ps(y, tmp);
+ y = _mm_add_ps(y, *(v4sf*)_ps_1);
+
+ /* Evaluate the second polynom (Pi/4 <= x <= 0) */
+
+ v4sf y2 = *(v4sf*)_ps_sincof_p0;
+ y2 = _mm_mul_ps(y2, z);
+ y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
+ y2 = _mm_mul_ps(y2, z);
+ y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
+ y2 = _mm_mul_ps(y2, z);
+ y2 = _mm_mul_ps(y2, xx);
+ y2 = _mm_add_ps(y2, xx);
+
+ /* select the correct result from the two polynoms */
+ xmm3 = poly_mask;
+ y2 = _mm_and_ps(xmm3, y2); //, xmm3);
+ y = _mm_andnot_ps(xmm3, y);
+ y = _mm_add_ps(y,y2);
+ /* update the sign */
+
+ _mm_store_ss( &x, _mm_xor_ps(y, sign_bit) );
+
+#else
+ #error "Not Implemented"
+#endif
+
+ return x;
+}
+
+//-----------------------------------------------------------------------------
+// SSE2 implementations of optimized routines:
+//-----------------------------------------------------------------------------
+void _SSE2_SinCos(float x, float* s, float* c) // any x
+{
+#ifdef _WIN32
+ __asm
+ {
+ movss xmm0, x
+ movaps xmm7, xmm0
+ movss xmm1, _ps_am_inv_sign_mask
+ movss xmm2, _ps_am_sign_mask
+ movss xmm3, _ps_am_2_o_pi
+ andps xmm0, xmm1
+ andps xmm7, xmm2
+ mulss xmm0, xmm3
+
+ pxor xmm3, xmm3
+ movd xmm5, _epi32_1
+ movss xmm4, _ps_am_1
+
+ cvttps2dq xmm2, xmm0
+ pand xmm5, xmm2
+ movd xmm1, _epi32_2
+ pcmpeqd xmm5, xmm3
+ movd xmm3, _epi32_1
+ cvtdq2ps xmm6, xmm2
+ paddd xmm3, xmm2
+ pand xmm2, xmm1
+ pand xmm3, xmm1
+ subss xmm0, xmm6
+ pslld xmm2, (31 - 1)
+ minss xmm0, xmm4
+
+ mov eax, s // mov eax, [esp + 4 + 16]
+ mov edx, c // mov edx, [esp + 4 + 16 + 4]
+
+ subss xmm4, xmm0
+ pslld xmm3, (31 - 1)
+
+ movaps xmm6, xmm4
+ xorps xmm2, xmm7
+ movaps xmm7, xmm5
+ andps xmm6, xmm7
+ andnps xmm7, xmm0
+ andps xmm0, xmm5
+ andnps xmm5, xmm4
+ movss xmm4, _ps_sincos_p3
+ orps xmm6, xmm7
+ orps xmm0, xmm5
+ movss xmm5, _ps_sincos_p2
+
+ movaps xmm1, xmm0
+ movaps xmm7, xmm6
+ mulss xmm0, xmm0
+ mulss xmm6, xmm6
+ orps xmm1, xmm2
+ orps xmm7, xmm3
+ movaps xmm2, xmm0
+ movaps xmm3, xmm6
+ mulss xmm0, xmm4
+ mulss xmm6, xmm4
+ movss xmm4, _ps_sincos_p1
+ addss xmm0, xmm5
+ addss xmm6, xmm5
+ movss xmm5, _ps_sincos_p0
+ mulss xmm0, xmm2
+ mulss xmm6, xmm3
+ addss xmm0, xmm4
+ addss xmm6, xmm4
+ mulss xmm0, xmm2
+ mulss xmm6, xmm3
+ addss xmm0, xmm5
+ addss xmm6, xmm5
+ mulss xmm0, xmm1
+ mulss xmm6, xmm7
+
+ // use full stores since caller might reload with full loads
+ movss [eax], xmm0
+ movss [edx], xmm6
+ }
+#elif POSIX
+ #warning "_SSE2_SinCos NOT implemented!"
+ Assert( 0 );
+#else
+ #error "Not Implemented"
+#endif
+}
+
+float _SSE2_cos(float x)
+{
+#ifdef _WIN32
+ __asm
+ {
+ movss xmm0, x
+ movss xmm1, _ps_am_inv_sign_mask
+ movss xmm2, _ps_am_pi_o_2
+ movss xmm3, _ps_am_2_o_pi
+ andps xmm0, xmm1
+ addss xmm0, xmm2
+ mulss xmm0, xmm3
+
+ pxor xmm3, xmm3
+ movd xmm5, _epi32_1
+ movss xmm4, _ps_am_1
+ cvttps2dq xmm2, xmm0
+ pand xmm5, xmm2
+ movd xmm1, _epi32_2
+ pcmpeqd xmm5, xmm3
+ cvtdq2ps xmm6, xmm2
+ pand xmm2, xmm1
+ pslld xmm2, (31 - 1)
+
+ subss xmm0, xmm6
+ movss xmm3, _ps_sincos_p3
+ minss xmm0, xmm4
+ subss xmm4, xmm0
+ andps xmm0, xmm5
+ andnps xmm5, xmm4
+ orps xmm0, xmm5
+
+ movaps xmm1, xmm0
+ movss xmm4, _ps_sincos_p2
+ mulss xmm0, xmm0
+ movss xmm5, _ps_sincos_p1
+ orps xmm1, xmm2
+ movaps xmm7, xmm0
+ mulss xmm0, xmm3
+ movss xmm6, _ps_sincos_p0
+ addss xmm0, xmm4
+ mulss xmm0, xmm7
+ addss xmm0, xmm5
+ mulss xmm0, xmm7
+ addss xmm0, xmm6
+ mulss xmm0, xmm1
+ movss x, xmm0
+ }
+#elif POSIX
+ #warning "_SSE2_cos NOT implemented!"
+ Assert( 0 );
+#else
+ #error "Not Implemented"
+#endif
+
+ return x;
+}
+
+// SSE Version of VectorTransform
+void VectorTransformSSE(const float *in1, const matrix3x4_t& in2, float *out1)
+{
+ Assert( s_bMathlibInitialized );
+ Assert( in1 != out1 );
+
+#ifdef _WIN32
+ __asm
+ {
+ mov eax, in1;
+ mov ecx, in2;
+ mov edx, out1;
+
+ movss xmm0, [eax];
+ mulss xmm0, [ecx];
+ movss xmm1, [eax+4];
+ mulss xmm1, [ecx+4];
+ movss xmm2, [eax+8];
+ mulss xmm2, [ecx+8];
+ addss xmm0, xmm1;
+ addss xmm0, xmm2;
+ addss xmm0, [ecx+12]
+ movss [edx], xmm0;
+ add ecx, 16;
+
+ movss xmm0, [eax];
+ mulss xmm0, [ecx];
+ movss xmm1, [eax+4];
+ mulss xmm1, [ecx+4];
+ movss xmm2, [eax+8];
+ mulss xmm2, [ecx+8];
+ addss xmm0, xmm1;
+ addss xmm0, xmm2;
+ addss xmm0, [ecx+12]
+ movss [edx+4], xmm0;
+ add ecx, 16;
+
+ movss xmm0, [eax];
+ mulss xmm0, [ecx];
+ movss xmm1, [eax+4];
+ mulss xmm1, [ecx+4];
+ movss xmm2, [eax+8];
+ mulss xmm2, [ecx+8];
+ addss xmm0, xmm1;
+ addss xmm0, xmm2;
+ addss xmm0, [ecx+12]
+ movss [edx+8], xmm0;
+ }
+#elif POSIX
+ #warning "VectorTransformSSE C implementation only"
+ out1[0] = DotProduct(in1, in2[0]) + in2[0][3];
+ out1[1] = DotProduct(in1, in2[1]) + in2[1][3];
+ out1[2] = DotProduct(in1, in2[2]) + in2[2][3];
+#else
+ #error "Not Implemented"
+#endif
+}
+
+void VectorRotateSSE( const float *in1, const matrix3x4_t& in2, float *out1 )
+{
+ Assert( s_bMathlibInitialized );
+ Assert( in1 != out1 );
+
+#ifdef _WIN32
+ __asm
+ {
+ mov eax, in1;
+ mov ecx, in2;
+ mov edx, out1;
+
+ movss xmm0, [eax];
+ mulss xmm0, [ecx];
+ movss xmm1, [eax+4];
+ mulss xmm1, [ecx+4];
+ movss xmm2, [eax+8];
+ mulss xmm2, [ecx+8];
+ addss xmm0, xmm1;
+ addss xmm0, xmm2;
+ movss [edx], xmm0;
+ add ecx, 16;
+
+ movss xmm0, [eax];
+ mulss xmm0, [ecx];
+ movss xmm1, [eax+4];
+ mulss xmm1, [ecx+4];
+ movss xmm2, [eax+8];
+ mulss xmm2, [ecx+8];
+ addss xmm0, xmm1;
+ addss xmm0, xmm2;
+ movss [edx+4], xmm0;
+ add ecx, 16;
+
+ movss xmm0, [eax];
+ mulss xmm0, [ecx];
+ movss xmm1, [eax+4];
+ mulss xmm1, [ecx+4];
+ movss xmm2, [eax+8];
+ mulss xmm2, [ecx+8];
+ addss xmm0, xmm1;
+ addss xmm0, xmm2;
+ movss [edx+8], xmm0;
+ }
+#elif POSIX
+ #warning "VectorRotateSSE C implementation only"
+ out1[0] = DotProduct( in1, in2[0] );
+ out1[1] = DotProduct( in1, in2[1] );
+ out1[2] = DotProduct( in1, in2[2] );
+#else
+ #error "Not Implemented"
+#endif
+}
+
+#ifdef _WIN32
+void _declspec(naked) _SSE_VectorMA( const float *start, float scale, const float *direction, float *dest )
+{
+ // FIXME: This don't work!! It will overwrite memory in the write to dest
+ Assert(0);
+
+ Assert( s_bMathlibInitialized );
+ _asm { // Intel SSE only routine
+ mov eax, DWORD PTR [esp+0x04] ; *start, s0..s2
+ mov ecx, DWORD PTR [esp+0x0c] ; *direction, d0..d2
+ mov edx, DWORD PTR [esp+0x10] ; *dest
+ movss xmm2, [esp+0x08] ; x2 = scale, 0, 0, 0
+#ifdef ALIGNED_VECTOR
+ movaps xmm3, [ecx] ; x3 = dir0,dir1,dir2,X
+ pshufd xmm2, xmm2, 0 ; x2 = scale, scale, scale, scale
+ movaps xmm1, [eax] ; x1 = start1, start2, start3, X
+ mulps xmm3, xmm2 ; x3 *= x2
+ addps xmm3, xmm1 ; x3 += x1
+ movaps [edx], xmm3 ; *dest = x3
+#else
+ movups xmm3, [ecx] ; x3 = dir0,dir1,dir2,X
+ pshufd xmm2, xmm2, 0 ; x2 = scale, scale, scale, scale
+ movups xmm1, [eax] ; x1 = start1, start2, start3, X
+ mulps xmm3, xmm2 ; x3 *= x2
+ addps xmm3, xmm1 ; x3 += x1
+ movups [edx], xmm3 ; *dest = x3
+#endif
+ }
+}
+#endif
+
+#ifdef _WIN32
+#ifdef PFN_VECTORMA
+void _declspec(naked) __cdecl _SSE_VectorMA( const Vector &start, float scale, const Vector &direction, Vector &dest )
+{
+ // FIXME: This don't work!! It will overwrite memory in the write to dest
+ Assert(0);
+
+ Assert( s_bMathlibInitialized );
+ _asm
+ {
+ // Intel SSE only routine
+ mov eax, DWORD PTR [esp+0x04] ; *start, s0..s2
+ mov ecx, DWORD PTR [esp+0x0c] ; *direction, d0..d2
+ mov edx, DWORD PTR [esp+0x10] ; *dest
+ movss xmm2, [esp+0x08] ; x2 = scale, 0, 0, 0
+#ifdef ALIGNED_VECTOR
+ movaps xmm3, [ecx] ; x3 = dir0,dir1,dir2,X
+ pshufd xmm2, xmm2, 0 ; x2 = scale, scale, scale, scale
+ movaps xmm1, [eax] ; x1 = start1, start2, start3, X
+ mulps xmm3, xmm2 ; x3 *= x2
+ addps xmm3, xmm1 ; x3 += x1
+ movaps [edx], xmm3 ; *dest = x3
+#else
+ movups xmm3, [ecx] ; x3 = dir0,dir1,dir2,X
+ pshufd xmm2, xmm2, 0 ; x2 = scale, scale, scale, scale
+ movups xmm1, [eax] ; x1 = start1, start2, start3, X
+ mulps xmm3, xmm2 ; x3 *= x2
+ addps xmm3, xmm1 ; x3 += x1
+ movups [edx], xmm3 ; *dest = x3
+#endif
+ }
+}
+float (__cdecl *pfVectorMA)(Vector& v) = _VectorMA;
+#endif
+#endif
+
+
+// SSE DotProduct -- it's a smidgen faster than the asm DotProduct...
+// Should be validated too! :)
+// NJS: (Nov 1 2002) -NOT- faster. may time a couple cycles faster in a single function like
+// this, but when inlined, and instruction scheduled, the C version is faster.
+// Verified this via VTune
+/*
+vec_t DotProduct (const vec_t *a, const vec_t *c)
+{
+ vec_t temp;
+
+ __asm
+ {
+ mov eax, a;
+ mov ecx, c;
+ mov edx, DWORD PTR [temp]
+ movss xmm0, [eax];
+ mulss xmm0, [ecx];
+ movss xmm1, [eax+4];
+ mulss xmm1, [ecx+4];
+ movss xmm2, [eax+8];
+ mulss xmm2, [ecx+8];
+ addss xmm0, xmm1;
+ addss xmm0, xmm2;
+ movss [edx], xmm0;
+ fld DWORD PTR [edx];
+ ret
+ }
+}
+*/
+
+#endif // COMPILER_MSVC64