Diffstat (limited to 'mp/src/mathlib/sse.cpp')
-rw-r--r--   mp/src/mathlib/sse.cpp | 1095
1 file changed, 1095 insertions, 0 deletions
diff --git a/mp/src/mathlib/sse.cpp b/mp/src/mathlib/sse.cpp
new file mode 100644
index 00000000..6e41683f
--- /dev/null
+++ b/mp/src/mathlib/sse.cpp
@@ -0,0 +1,1095 @@
+//========= Copyright Valve Corporation, All rights reserved. ============//
+//
+// Purpose: SSE Math primitives.
+//
+//=====================================================================================//
+
+#include <math.h>
+#include <float.h> // Needed for FLT_EPSILON
+#include "basetypes.h"
+#include <memory.h>
+#include "tier0/dbg.h"
+#include "mathlib/mathlib.h"
+#include "mathlib/vector.h"
+#include "sse.h"
+
+// memdbgon must be the last include file in a .cpp file!!!
+#include "tier0/memdbgon.h"
+
+#ifndef COMPILER_MSVC64
+// Implement for 64-bit Windows if needed.
+
+static const uint32 _sincos_masks[] = { (uint32)0x0, (uint32)~0x0 };
+static const uint32 _sincos_inv_masks[] = { (uint32)~0x0, (uint32)0x0 };
+
+//-----------------------------------------------------------------------------
+// Macros and constants required by some of the SSE assembly:
+//-----------------------------------------------------------------------------
+
+#ifdef _WIN32
+ #define _PS_EXTERN_CONST(Name, Val) \
+ const __declspec(align(16)) float _ps_##Name[4] = { Val, Val, Val, Val }
+
+ #define _PS_EXTERN_CONST_TYPE(Name, Type, Val) \
+ const __declspec(align(16)) Type _ps_##Name[4] = { Val, Val, Val, Val }; \
+
+ #define _EPI32_CONST(Name, Val) \
+ static const __declspec(align(16)) __int32 _epi32_##Name[4] = { Val, Val, Val, Val }
+
+ #define _PS_CONST(Name, Val) \
+ static const __declspec(align(16)) float _ps_##Name[4] = { Val, Val, Val, Val }
+#elif POSIX
+ #define _PS_EXTERN_CONST(Name, Val) \
+ const float _ps_##Name[4] __attribute__((aligned(16))) = { Val, Val, Val, Val }
+
+ #define _PS_EXTERN_CONST_TYPE(Name, Type, Val) \
+ const Type _ps_##Name[4] __attribute__((aligned(16))) = { Val, Val, Val, Val }; \
+
+ #define _EPI32_CONST(Name, Val) \
+ static const int32 _epi32_##Name[4] __attribute__((aligned(16))) = { Val, Val, Val, Val }
+
+ #define _PS_CONST(Name, Val) \
+ static const float _ps_##Name[4] __attribute__((aligned(16))) = { Val, Val, Val, Val }
+#endif
+
+_PS_EXTERN_CONST(am_0, 0.0f);
+_PS_EXTERN_CONST(am_1, 1.0f);
+_PS_EXTERN_CONST(am_m1, -1.0f);
+_PS_EXTERN_CONST(am_0p5, 0.5f);
+_PS_EXTERN_CONST(am_1p5, 1.5f);
+_PS_EXTERN_CONST(am_pi, (float)M_PI);
+_PS_EXTERN_CONST(am_pi_o_2, (float)(M_PI / 2.0));
+_PS_EXTERN_CONST(am_2_o_pi, (float)(2.0 / M_PI));
+_PS_EXTERN_CONST(am_pi_o_4, (float)(M_PI / 4.0));
+_PS_EXTERN_CONST(am_4_o_pi, (float)(4.0 / M_PI));
+_PS_EXTERN_CONST_TYPE(am_sign_mask, int32, 0x80000000);
+_PS_EXTERN_CONST_TYPE(am_inv_sign_mask, int32, ~0x80000000);
+_PS_EXTERN_CONST_TYPE(am_min_norm_pos,int32, 0x00800000);
+_PS_EXTERN_CONST_TYPE(am_mant_mask, int32, 0x7f800000);
+_PS_EXTERN_CONST_TYPE(am_inv_mant_mask, int32, ~0x7f800000);
+
+_EPI32_CONST(1, 1);
+_EPI32_CONST(2, 2);
+
+_PS_CONST(sincos_p0, 0.15707963267948963959e1f);
+_PS_CONST(sincos_p1, -0.64596409750621907082e0f);
+_PS_CONST(sincos_p2, 0.7969262624561800806e-1f);
+_PS_CONST(sincos_p3, -0.468175413106023168e-2f);
+
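The four sincos_p* coefficients above drive the hand-written sin/cos routines further down. Reading that assembly, the argument is first reduced to a fraction z in [0, 1] of a quarter period, and the polynomial is then evaluated on z squared in Horner form. A minimal scalar sketch of that evaluation (an interpretation of the assembly, for illustration only; SinCosPoly is not a function in this file):

// Approximate sin(z * pi/2) for a range-reduced z in [0, 1], using the
// _ps_sincos_p0.._p3 coefficients in the same Horner order as the assembly.
static inline float SinCosPoly( float z )
{
	float z2 = z * z;
	return z * ( 0.15707963267948963959e1f
	           + z2 * ( -0.64596409750621907082e0f
	           + z2 * (  0.7969262624561800806e-1f
	           + z2 * ( -0.468175413106023168e-2f ) ) ) );
}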
+#ifdef PFN_VECTORMA
+void __cdecl _SSE_VectorMA( const float *start, float scale, const float *direction, float *dest );
+#endif
+
+//-----------------------------------------------------------------------------
+// SSE implementations of optimized routines:
+//-----------------------------------------------------------------------------
+float _SSE_Sqrt(float x)
+{
+ Assert( s_bMathlibInitialized );
+ float root = 0.f;
+#ifdef _WIN32
+ _asm
+ {
+ sqrtss xmm0, x
+ movss root, xmm0
+ }
+#elif POSIX
+ _mm_store_ss( &root, _mm_sqrt_ss( _mm_load_ss( &x ) ) );
+#endif
+ return root;
+}
+
+// Single-iteration Newton-Raphson reciprocal square root:
+// 0.5 * rsqrtss(x) * (3 - x * rsqrtss(x) * rsqrtss(x))
+// Very low error, and fine to use in place of 1.f / sqrtf(x).
+#if 0
+float _SSE_RSqrtAccurate(float x)
+{
+ Assert( s_bMathlibInitialized );
+
+ float rroot;
+ _asm
+ {
+ rsqrtss xmm0, x
+ movss rroot, xmm0
+ }
+
+ return (0.5f * rroot) * (3.f - (x * rroot) * rroot);
+}
+#else
+
+#ifdef POSIX
+const __m128 f3 = _mm_set_ss(3.0f); // 3 as SSE value
+const __m128 f05 = _mm_set_ss(0.5f); // 0.5 as SSE value
+#endif
+
+// Intel / Kipps SSE RSqrt. Significantly faster than above.
+float _SSE_RSqrtAccurate(float a)
+{
+
+#ifdef _WIN32
+ float x;
+ float half = 0.5f;
+ float three = 3.f;
+
+ __asm
+ {
+ movss xmm3, a;
+ movss xmm1, half;
+ movss xmm2, three;
+ rsqrtss xmm0, xmm3;
+
+ mulss xmm3, xmm0;
+ mulss xmm1, xmm0;
+ mulss xmm3, xmm0;
+ subss xmm2, xmm3;
+ mulss xmm1, xmm2;
+
+ movss x, xmm1;
+ }
+
+ return x;
+#elif POSIX
+ __m128 xx = _mm_load_ss( &a );
+ __m128 xr = _mm_rsqrt_ss( xx );
+ __m128 xt;
+
+ xt = _mm_mul_ss( xr, xr );
+ xt = _mm_mul_ss( xt, xx );
+ xt = _mm_sub_ss( f3, xt );
+ xt = _mm_mul_ss( xt, f05 );
+ xr = _mm_mul_ss( xr, xt );
+
+ _mm_store_ss( &a, xr );
+ return a;
+#else
+ #error "Not Implemented"
+#endif
+
+}
+#endif
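For reference, the refinement above can be written as a self-contained intrinsics sketch (assuming only <xmmintrin.h>); it applies one Newton-Raphson step, y' = 0.5 * y * (3 - x * y * y), to the roughly 12-bit RSQRTSS estimate, and the result can be compared against 1.f / sqrtf(x):

#include <xmmintrin.h>

// One Newton-Raphson refinement of the hardware reciprocal square root estimate.
static inline float RSqrtNR_Sketch( float x )
{
	__m128 xx  = _mm_set_ss( x );
	__m128 y   = _mm_rsqrt_ss( xx );                          // ~12-bit estimate
	__m128 xyy = _mm_mul_ss( _mm_mul_ss( xx, y ), y );        // x * y * y
	__m128 r   = _mm_mul_ss( _mm_mul_ss( _mm_set_ss( 0.5f ), y ),
	                         _mm_sub_ss( _mm_set_ss( 3.0f ), xyy ) );
	float out;
	_mm_store_ss( &out, r );
	return out;
}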
+
+// Simple SSE rsqrt. Usually accurate to around 6 (relative) decimal places
+// or so, so OK for closed transforms (i.e., computing lighting normals).
+float _SSE_RSqrtFast(float x)
+{
+ Assert( s_bMathlibInitialized );
+
+ float rroot;
+#ifdef _WIN32
+ _asm
+ {
+ rsqrtss xmm0, x
+ movss rroot, xmm0
+ }
+#elif POSIX
+ __asm__ __volatile__( "rsqrtss %0, %1" : "=x" (rroot) : "x" (x) );
+#else
+#error
+#endif
+
+ return rroot;
+}
+
+float FASTCALL _SSE_VectorNormalize (Vector& vec)
+{
+ Assert( s_bMathlibInitialized );
+
+	// NOTE: This is necessary to prevent a memory overwrite...
+	// since vec only has 3 floats, we can't "movaps" directly into it.
+#ifdef _WIN32
+ __declspec(align(16)) float result[4];
+#elif POSIX
+ float result[4] __attribute__((aligned(16)));
+#endif
+
+ float *v = &vec[0];
+ float *r = &result[0];
+
+ float radius = 0.f;
+ // Blah, get rid of these comparisons ... in reality, if you have all 3 as zero, it shouldn't
+ // be much of a performance win, considering you will very likely miss 3 branch predicts in a row.
+ if ( v[0] || v[1] || v[2] )
+ {
+#ifdef _WIN32
+ _asm
+ {
+ mov eax, v
+ mov edx, r
+#ifdef ALIGNED_VECTOR
+ movaps xmm4, [eax] // r4 = vx, vy, vz, X
+ movaps xmm1, xmm4 // r1 = r4
+#else
+ movups xmm4, [eax] // r4 = vx, vy, vz, X
+ movaps xmm1, xmm4 // r1 = r4
+#endif
+ mulps xmm1, xmm4 // r1 = vx * vx, vy * vy, vz * vz, X
+ movhlps xmm3, xmm1 // r3 = vz * vz, X, X, X
+ movaps xmm2, xmm1 // r2 = r1
+ shufps xmm2, xmm2, 1 // r2 = vy * vy, X, X, X
+ addss xmm1, xmm2 // r1 = (vx * vx) + (vy * vy), X, X, X
+ addss xmm1, xmm3 // r1 = (vx * vx) + (vy * vy) + (vz * vz), X, X, X
+ sqrtss xmm1, xmm1 // r1 = sqrt((vx * vx) + (vy * vy) + (vz * vz)), X, X, X
+ movss radius, xmm1 // radius = sqrt((vx * vx) + (vy * vy) + (vz * vz))
+ rcpss xmm1, xmm1 // r1 = 1/radius, X, X, X
+ shufps xmm1, xmm1, 0 // r1 = 1/radius, 1/radius, 1/radius, X
+ mulps xmm4, xmm1 // r4 = vx * 1/radius, vy * 1/radius, vz * 1/radius, X
+ movaps [edx], xmm4 // v = vx * 1/radius, vy * 1/radius, vz * 1/radius, X
+ }
+#elif POSIX
+ __asm__ __volatile__(
+#ifdef ALIGNED_VECTOR
+ "movaps %2, %%xmm4 \n\t"
+ "movaps %%xmm4, %%xmm1 \n\t"
+#else
+ "movups %2, %%xmm4 \n\t"
+ "movaps %%xmm4, %%xmm1 \n\t"
+#endif
+ "mulps %%xmm4, %%xmm1 \n\t"
+ "movhlps %%xmm1, %%xmm3 \n\t"
+ "movaps %%xmm1, %%xmm2 \n\t"
+ "shufps $1, %%xmm2, %%xmm2 \n\t"
+ "addss %%xmm2, %%xmm1 \n\t"
+ "addss %%xmm3, %%xmm1 \n\t"
+ "sqrtss %%xmm1, %%xmm1 \n\t"
+ "movss %%xmm1, %0 \n\t"
+ "rcpss %%xmm1, %%xmm1 \n\t"
+ "shufps $0, %%xmm1, %%xmm1 \n\t"
+ "mulps %%xmm1, %%xmm4 \n\t"
+ "movaps %%xmm4, %1 \n\t"
+ : "=m" (radius), "=m" (result)
+ : "m" (*v)
+ );
+#else
+ #error "Not Implemented"
+#endif
+ vec.x = result[0];
+ vec.y = result[1];
+ vec.z = result[2];
+
+ }
+
+ return radius;
+}
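The assembly above amounts to the following plain C (shown only as a behavioral sketch; the asm uses RCPSS for the reciprocal, which is an approximation, so the exact bits differ from a true divide):

// Scalar equivalent of _SSE_VectorNormalize: returns the length and scales
// the vector by its reciprocal when it is non-zero.
static float VectorNormalizeScalar_Sketch( Vector &vec )
{
	float radius = sqrtf( vec.x * vec.x + vec.y * vec.y + vec.z * vec.z );
	if ( radius > 0.0f )
	{
		float iradius = 1.0f / radius;
		vec.x *= iradius;
		vec.y *= iradius;
		vec.z *= iradius;
	}
	return radius;
}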
+
+void FASTCALL _SSE_VectorNormalizeFast (Vector& vec)
+{
+ float ool = _SSE_RSqrtAccurate( FLT_EPSILON + vec.x * vec.x + vec.y * vec.y + vec.z * vec.z );
+
+ vec.x *= ool;
+ vec.y *= ool;
+ vec.z *= ool;
+}
+
+float _SSE_InvRSquared(const float* v)
+{
+ float inv_r2 = 1.f;
+#ifdef _WIN32
+ _asm { // Intel SSE only routine
+ mov eax, v
+ movss xmm5, inv_r2 // x5 = 1.0, 0, 0, 0
+#ifdef ALIGNED_VECTOR
+ movaps xmm4, [eax] // x4 = vx, vy, vz, X
+#else
+ movups xmm4, [eax] // x4 = vx, vy, vz, X
+#endif
+ movaps xmm1, xmm4 // x1 = x4
+ mulps xmm1, xmm4 // x1 = vx * vx, vy * vy, vz * vz, X
+ movhlps xmm3, xmm1 // x3 = vz * vz, X, X, X
+ movaps xmm2, xmm1 // x2 = x1
+ shufps xmm2, xmm2, 1 // x2 = vy * vy, X, X, X
+ addss xmm1, xmm2 // x1 = (vx * vx) + (vy * vy), X, X, X
+ addss xmm1, xmm3 // x1 = (vx * vx) + (vy * vy) + (vz * vz), X, X, X
+ maxss xmm1, xmm5 // x1 = max( 1.0, x1 )
+ rcpss xmm0, xmm1 // x0 = 1 / max( 1.0, x1 )
+ movss inv_r2, xmm0 // inv_r2 = x0
+ }
+#elif POSIX
+ __asm__ __volatile__(
+ "movss %0, %%xmm5 \n\t"
+#ifdef ALIGNED_VECTOR
+ "movaps %1, %%xmm4 \n\t"
+#else
+ "movups %1, %%xmm4 \n\t"
+#endif
+ "movaps %%xmm4, %%xmm1 \n\t"
+ "mulps %%xmm4, %%xmm1 \n\t"
+ "movhlps %%xmm1, %%xmm3 \n\t"
+ "movaps %%xmm1, %%xmm2 \n\t"
+ "shufps $1, %%xmm2, %%xmm2 \n\t"
+ "addss %%xmm2, %%xmm1 \n\t"
+ "addss %%xmm3, %%xmm1 \n\t"
+ "maxss %%xmm5, %%xmm1 \n\t"
+ "rcpss %%xmm1, %%xmm0 \n\t"
+ "movss %%xmm0, %0 \n\t"
+ : "=m" (inv_r2)
+ : "m" (*v), "0" (inv_r2)
+ );
+#else
+ #error "Not Implemented"
+#endif
+
+ return inv_r2;
+}
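In scalar terms the routine above computes 1 / max(1, |v|^2); the clamp guarantees the result never exceeds 1 and the division never hits zero. A sketch of the equivalent C (exact divide instead of the approximate RCPSS):

// Scalar equivalent of _SSE_InvRSquared.
static float InvRSquaredScalar_Sketch( const float *v )
{
	float r2 = v[0] * v[0] + v[1] * v[1] + v[2] * v[2];
	return 1.0f / ( r2 > 1.0f ? r2 : 1.0f );
}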
+
+
+#ifdef POSIX
+// #define _PS_CONST(Name, Val) static const ALIGN16 float _ps_##Name[4] ALIGN16_POST = { Val, Val, Val, Val }
+#define _PS_CONST_TYPE(Name, Type, Val) static const ALIGN16 Type _ps_##Name[4] ALIGN16_POST = { Val, Val, Val, Val }
+
+_PS_CONST_TYPE(sign_mask, int, 0x80000000);
+_PS_CONST_TYPE(inv_sign_mask, int, ~0x80000000);
+
+
+#define _PI32_CONST(Name, Val) static const ALIGN16 int _pi32_##Name[4] ALIGN16_POST = { Val, Val, Val, Val }
+
+_PI32_CONST(1, 1);
+_PI32_CONST(inv1, ~1);
+_PI32_CONST(2, 2);
+_PI32_CONST(4, 4);
+_PI32_CONST(0x7f, 0x7f);
+_PS_CONST(1 , 1.0f);
+_PS_CONST(0p5, 0.5f);
+
+_PS_CONST(minus_cephes_DP1, -0.78515625);
+_PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
+_PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
+_PS_CONST(sincof_p0, -1.9515295891E-4);
+_PS_CONST(sincof_p1, 8.3321608736E-3);
+_PS_CONST(sincof_p2, -1.6666654611E-1);
+_PS_CONST(coscof_p0, 2.443315711809948E-005);
+_PS_CONST(coscof_p1, -1.388731625493765E-003);
+_PS_CONST(coscof_p2, 4.166664568298827E-002);
+_PS_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI
+
+typedef union xmm_mm_union {
+ __m128 xmm;
+ __m64 mm[2];
+} xmm_mm_union;
+
+#define COPY_MM_TO_XMM(mm0_, mm1_, xmm_) { xmm_mm_union u; u.mm[0]=mm0_; u.mm[1]=mm1_; xmm_ = u.xmm; }
+
+typedef __m128 v4sf; // vector of 4 float (sse1)
+typedef __m64 v2si; // vector of 2 int (mmx)
+
+#endif
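The three minus_cephes_DP constants split Pi/4 into pieces that are exactly representable in float, so the reduction x - y * Pi/4 used below can be done in three steps without catastrophic cancellation (Cephes-style extended-precision reduction; y is the rounded multiple of Pi/4 obtained via cephes_FOPI = 4/Pi). A standalone scalar sketch of that step, shown only to make the constants concrete:

// Subtract y * (Pi/4) from x in three pieces:
// DP1 + DP2 + DP3 == Pi/4, with DP1 and DP2 carrying only a few mantissa bits.
static inline float CephesReduce_Sketch( float x, float y )
{
	x += y * -0.78515625f;                  // y * DP1
	x += y * -2.4187564849853515625e-4f;    // y * DP2
	x += y * -3.77489497744594108e-8f;      // y * DP3
	return x;
}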
+
+void _SSE_SinCos(float x, float* s, float* c)
+{
+#ifdef _WIN32
+ float t4, t8, t12;
+
+ __asm
+ {
+ movss xmm0, x
+ movss t12, xmm0
+ movss xmm1, _ps_am_inv_sign_mask
+ mov eax, t12
+ mulss xmm0, _ps_am_2_o_pi
+ andps xmm0, xmm1
+ and eax, 0x80000000
+
+ cvttss2si edx, xmm0
+ mov ecx, edx
+ mov t12, esi
+ mov esi, edx
+ add edx, 0x1
+ shl ecx, (31 - 1)
+ shl edx, (31 - 1)
+
+ movss xmm4, _ps_am_1
+ cvtsi2ss xmm3, esi
+ mov t8, eax
+ and esi, 0x1
+
+ subss xmm0, xmm3
+ movss xmm3, _sincos_inv_masks[esi * 4]
+ minss xmm0, xmm4
+
+ subss xmm4, xmm0
+
+ movss xmm6, xmm4
+ andps xmm4, xmm3
+ and ecx, 0x80000000
+ movss xmm2, xmm3
+ andnps xmm3, xmm0
+ and edx, 0x80000000
+ movss xmm7, t8
+ andps xmm0, xmm2
+ mov t8, ecx
+ mov t4, edx
+ orps xmm4, xmm3
+
+ mov eax, s //mov eax, [esp + 4 + 16]
+ mov edx, c //mov edx, [esp + 4 + 16 + 4]
+
+ andnps xmm2, xmm6
+ orps xmm0, xmm2
+
+ movss xmm2, t8
+ movss xmm1, xmm0
+ movss xmm5, xmm4
+ xorps xmm7, xmm2
+ movss xmm3, _ps_sincos_p3
+ mulss xmm0, xmm0
+ mulss xmm4, xmm4
+ movss xmm2, xmm0
+ movss xmm6, xmm4
+ orps xmm1, xmm7
+ movss xmm7, _ps_sincos_p2
+ mulss xmm0, xmm3
+ mulss xmm4, xmm3
+ movss xmm3, _ps_sincos_p1
+ addss xmm0, xmm7
+ addss xmm4, xmm7
+ movss xmm7, _ps_sincos_p0
+ mulss xmm0, xmm2
+ mulss xmm4, xmm6
+ addss xmm0, xmm3
+ addss xmm4, xmm3
+ movss xmm3, t4
+ mulss xmm0, xmm2
+ mulss xmm4, xmm6
+ orps xmm5, xmm3
+ mov esi, t12
+ addss xmm0, xmm7
+ addss xmm4, xmm7
+ mulss xmm0, xmm1
+ mulss xmm4, xmm5
+
+ // use full stores since caller might reload with full loads
+ movss [eax], xmm0
+ movss [edx], xmm4
+ }
+#elif POSIX
+
+ Assert( "Needs testing, verify impl!\n" );
+
+ v4sf xx = _mm_load_ss( &x );
+
+ v4sf xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
+ v2si mm0, mm1, mm2, mm3, mm4, mm5;
+ sign_bit_sin = xx;
+ /* take the absolute value */
+ xx = _mm_and_ps(xx, *(v4sf*)_ps_inv_sign_mask);
+ /* extract the sign bit (upper one) */
+ sign_bit_sin = _mm_and_ps(sign_bit_sin, *(v4sf*)_ps_sign_mask);
+
+ /* scale by 4/Pi */
+ y = _mm_mul_ps(xx, *(v4sf*)_ps_cephes_FOPI);
+
+ /* store the integer part of y in mm2:mm3 */
+ xmm3 = _mm_movehl_ps(xmm3, y);
+ mm2 = _mm_cvttps_pi32(y);
+ mm3 = _mm_cvttps_pi32(xmm3);
+
+ /* j=(j+1) & (~1) (see the cephes sources) */
+ mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
+ mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
+ mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
+ mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
+
+ y = _mm_cvtpi32x2_ps(mm2, mm3);
+
+ mm4 = mm2;
+ mm5 = mm3;
+
+ /* get the swap sign flag for the sine */
+ mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
+ mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
+ mm0 = _mm_slli_pi32(mm0, 29);
+ mm1 = _mm_slli_pi32(mm1, 29);
+ v4sf swap_sign_bit_sin;
+ COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit_sin);
+
+	/* get the polynomial selection mask for the sine */
+
+ mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
+ mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
+ mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
+ mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
+ v4sf poly_mask;
+ COPY_MM_TO_XMM(mm2, mm3, poly_mask);
+
+ /* The magic pass: "Extended precision modular arithmetic"
+ x = ((x - y * DP1) - y * DP2) - y * DP3; */
+ xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
+ xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
+ xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
+ xmm1 = _mm_mul_ps(y, xmm1);
+ xmm2 = _mm_mul_ps(y, xmm2);
+ xmm3 = _mm_mul_ps(y, xmm3);
+ xx = _mm_add_ps(xx, xmm1);
+ xx = _mm_add_ps(xx, xmm2);
+ xx = _mm_add_ps(xx, xmm3);
+
+ /* get the sign flag for the cosine */
+ mm4 = _mm_sub_pi32(mm4, *(v2si*)_pi32_2);
+ mm5 = _mm_sub_pi32(mm5, *(v2si*)_pi32_2);
+ mm4 = _mm_andnot_si64(mm4, *(v2si*)_pi32_4);
+ mm5 = _mm_andnot_si64(mm5, *(v2si*)_pi32_4);
+ mm4 = _mm_slli_pi32(mm4, 29);
+ mm5 = _mm_slli_pi32(mm5, 29);
+ v4sf sign_bit_cos;
+ COPY_MM_TO_XMM(mm4, mm5, sign_bit_cos);
+ _mm_empty(); /* good-bye mmx */
+
+ sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);
+
+
+	/* Evaluate the first polynomial (the cosine approximation, valid for |x| <= Pi/4) */
+ v4sf z = _mm_mul_ps(xx,xx);
+ y = *(v4sf*)_ps_coscof_p0;
+
+ y = _mm_mul_ps(y, z);
+ y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
+ y = _mm_mul_ps(y, z);
+ y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
+ y = _mm_mul_ps(y, z);
+ y = _mm_mul_ps(y, z);
+ v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
+ y = _mm_sub_ps(y, tmp);
+ y = _mm_add_ps(y, *(v4sf*)_ps_1);
+
+	/* Evaluate the second polynomial (the sine approximation, valid for |x| <= Pi/4) */
+
+ v4sf y2 = *(v4sf*)_ps_sincof_p0;
+ y2 = _mm_mul_ps(y2, z);
+ y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
+ y2 = _mm_mul_ps(y2, z);
+ y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
+ y2 = _mm_mul_ps(y2, z);
+ y2 = _mm_mul_ps(y2, xx);
+ y2 = _mm_add_ps(y2, xx);
+
+	/* select the correct result from the two polynomials */
+ xmm3 = poly_mask;
+ v4sf ysin2 = _mm_and_ps(xmm3, y2);
+ v4sf ysin1 = _mm_andnot_ps(xmm3, y);
+ y2 = _mm_sub_ps(y2,ysin2);
+ y = _mm_sub_ps(y, ysin1);
+
+ xmm1 = _mm_add_ps(ysin1,ysin2);
+ xmm2 = _mm_add_ps(y,y2);
+
+ /* update the sign */
+ _mm_store_ss( s, _mm_xor_ps(xmm1, sign_bit_sin) );
+ _mm_store_ss( c, _mm_xor_ps(xmm2, sign_bit_cos) );
+
+#else
+ #error "Not Implemented"
+#endif
+}
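The "Needs testing" assert in the POSIX path above never actually fires (a non-null string literal always passes Assert), so an explicit sweep against the CRT is one way to validate the port. A throwaway harness of that kind might look like this (illustrative test scaffolding only, not part of the library; assumes <stdio.h>):

#include <stdio.h>

// Sweep a range of inputs and report the worst deviation from sinf/cosf.
static void SinCosMaxError_Sketch()
{
	float maxErr = 0.0f;
	for ( float x = -10.0f; x <= 10.0f; x += 0.001f )
	{
		float s, c;
		_SSE_SinCos( x, &s, &c );
		float es = fabsf( s - sinf( x ) );
		float ec = fabsf( c - cosf( x ) );
		if ( es > maxErr ) maxErr = es;
		if ( ec > maxErr ) maxErr = ec;
	}
	printf( "_SSE_SinCos max abs error: %g\n", maxErr );
}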
+
+float _SSE_cos( float x )
+{
+#ifdef _WIN32
+ float temp;
+ __asm
+ {
+ movss xmm0, x
+ movss xmm1, _ps_am_inv_sign_mask
+ andps xmm0, xmm1
+ addss xmm0, _ps_am_pi_o_2
+ mulss xmm0, _ps_am_2_o_pi
+
+ cvttss2si ecx, xmm0
+ movss xmm5, _ps_am_1
+ mov edx, ecx
+ shl edx, (31 - 1)
+ cvtsi2ss xmm1, ecx
+ and edx, 0x80000000
+ and ecx, 0x1
+
+ subss xmm0, xmm1
+ movss xmm6, _sincos_masks[ecx * 4]
+ minss xmm0, xmm5
+
+ movss xmm1, _ps_sincos_p3
+ subss xmm5, xmm0
+
+ andps xmm5, xmm6
+ movss xmm7, _ps_sincos_p2
+ andnps xmm6, xmm0
+ mov temp, edx
+ orps xmm5, xmm6
+ movss xmm0, xmm5
+
+ mulss xmm5, xmm5
+ movss xmm4, _ps_sincos_p1
+ movss xmm2, xmm5
+ mulss xmm5, xmm1
+ movss xmm1, _ps_sincos_p0
+ addss xmm5, xmm7
+ mulss xmm5, xmm2
+ movss xmm3, temp
+ addss xmm5, xmm4
+ mulss xmm5, xmm2
+ orps xmm0, xmm3
+ addss xmm5, xmm1
+ mulss xmm0, xmm5
+
+ movss x, xmm0
+
+ }
+#elif POSIX
+
+ Assert( "Needs testing, verify impl!\n" );
+
+ v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, y;
+ v2si mm0, mm1, mm2, mm3;
+ /* take the absolute value */
+ v4sf xx = _mm_load_ss( &x );
+
+ xx = _mm_and_ps(xx, *(v4sf*)_ps_inv_sign_mask);
+
+ /* scale by 4/Pi */
+ y = _mm_mul_ps(xx, *(v4sf*)_ps_cephes_FOPI);
+
+	/* store the integer part of y in mm2:mm3 */
+ xmm2 = _mm_movehl_ps(xmm2, y);
+ mm2 = _mm_cvttps_pi32(y);
+ mm3 = _mm_cvttps_pi32(xmm2);
+
+ /* j=(j+1) & (~1) (see the cephes sources) */
+ mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
+ mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
+ mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
+ mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
+
+ y = _mm_cvtpi32x2_ps(mm2, mm3);
+
+
+ mm2 = _mm_sub_pi32(mm2, *(v2si*)_pi32_2);
+ mm3 = _mm_sub_pi32(mm3, *(v2si*)_pi32_2);
+
+	/* get the swap sign flag in mm0:mm1 and the
+	   polynomial selection mask in mm2:mm3 */
+
+ mm0 = _mm_andnot_si64(mm2, *(v2si*)_pi32_4);
+ mm1 = _mm_andnot_si64(mm3, *(v2si*)_pi32_4);
+ mm0 = _mm_slli_pi32(mm0, 29);
+ mm1 = _mm_slli_pi32(mm1, 29);
+
+ mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
+ mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
+
+ mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
+ mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
+
+ v4sf sign_bit, poly_mask;
+ COPY_MM_TO_XMM(mm0, mm1, sign_bit);
+ COPY_MM_TO_XMM(mm2, mm3, poly_mask);
+ _mm_empty(); /* good-bye mmx */
+
+ /* The magic pass: "Extended precision modular arithmetic"
+ x = ((x - y * DP1) - y * DP2) - y * DP3; */
+ xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
+ xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
+ xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
+ xmm1 = _mm_mul_ps(y, xmm1);
+ xmm2 = _mm_mul_ps(y, xmm2);
+ xmm3 = _mm_mul_ps(y, xmm3);
+ xx = _mm_add_ps(xx, xmm1);
+ xx = _mm_add_ps(xx, xmm2);
+ xx = _mm_add_ps(xx, xmm3);
+
+	/* Evaluate the first polynomial (the cosine approximation, valid for |x| <= Pi/4) */
+ y = *(v4sf*)_ps_coscof_p0;
+ v4sf z = _mm_mul_ps(xx,xx);
+
+ y = _mm_mul_ps(y, z);
+ y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
+ y = _mm_mul_ps(y, z);
+ y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
+ y = _mm_mul_ps(y, z);
+ y = _mm_mul_ps(y, z);
+ v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
+ y = _mm_sub_ps(y, tmp);
+ y = _mm_add_ps(y, *(v4sf*)_ps_1);
+
+	/* Evaluate the second polynomial (the sine approximation, valid for |x| <= Pi/4) */
+
+ v4sf y2 = *(v4sf*)_ps_sincof_p0;
+ y2 = _mm_mul_ps(y2, z);
+ y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
+ y2 = _mm_mul_ps(y2, z);
+ y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
+ y2 = _mm_mul_ps(y2, z);
+ y2 = _mm_mul_ps(y2, xx);
+ y2 = _mm_add_ps(y2, xx);
+
+	/* select the correct result from the two polynomials */
+ xmm3 = poly_mask;
+	y2 = _mm_and_ps(xmm3, y2);
+ y = _mm_andnot_ps(xmm3, y);
+ y = _mm_add_ps(y,y2);
+ /* update the sign */
+
+ _mm_store_ss( &x, _mm_xor_ps(y, sign_bit) );
+
+#else
+ #error "Not Implemented"
+#endif
+
+ return x;
+}
+
+//-----------------------------------------------------------------------------
+// SSE2 implementations of optimized routines:
+//-----------------------------------------------------------------------------
+void _SSE2_SinCos(float x, float* s, float* c) // any x
+{
+#ifdef _WIN32
+ __asm
+ {
+ movss xmm0, x
+ movaps xmm7, xmm0
+ movss xmm1, _ps_am_inv_sign_mask
+ movss xmm2, _ps_am_sign_mask
+ movss xmm3, _ps_am_2_o_pi
+ andps xmm0, xmm1
+ andps xmm7, xmm2
+ mulss xmm0, xmm3
+
+ pxor xmm3, xmm3
+ movd xmm5, _epi32_1
+ movss xmm4, _ps_am_1
+
+ cvttps2dq xmm2, xmm0
+ pand xmm5, xmm2
+ movd xmm1, _epi32_2
+ pcmpeqd xmm5, xmm3
+ movd xmm3, _epi32_1
+ cvtdq2ps xmm6, xmm2
+ paddd xmm3, xmm2
+ pand xmm2, xmm1
+ pand xmm3, xmm1
+ subss xmm0, xmm6
+ pslld xmm2, (31 - 1)
+ minss xmm0, xmm4
+
+ mov eax, s // mov eax, [esp + 4 + 16]
+ mov edx, c // mov edx, [esp + 4 + 16 + 4]
+
+ subss xmm4, xmm0
+ pslld xmm3, (31 - 1)
+
+ movaps xmm6, xmm4
+ xorps xmm2, xmm7
+ movaps xmm7, xmm5
+ andps xmm6, xmm7
+ andnps xmm7, xmm0
+ andps xmm0, xmm5
+ andnps xmm5, xmm4
+ movss xmm4, _ps_sincos_p3
+ orps xmm6, xmm7
+ orps xmm0, xmm5
+ movss xmm5, _ps_sincos_p2
+
+ movaps xmm1, xmm0
+ movaps xmm7, xmm6
+ mulss xmm0, xmm0
+ mulss xmm6, xmm6
+ orps xmm1, xmm2
+ orps xmm7, xmm3
+ movaps xmm2, xmm0
+ movaps xmm3, xmm6
+ mulss xmm0, xmm4
+ mulss xmm6, xmm4
+ movss xmm4, _ps_sincos_p1
+ addss xmm0, xmm5
+ addss xmm6, xmm5
+ movss xmm5, _ps_sincos_p0
+ mulss xmm0, xmm2
+ mulss xmm6, xmm3
+ addss xmm0, xmm4
+ addss xmm6, xmm4
+ mulss xmm0, xmm2
+ mulss xmm6, xmm3
+ addss xmm0, xmm5
+ addss xmm6, xmm5
+ mulss xmm0, xmm1
+ mulss xmm6, xmm7
+
+ // use full stores since caller might reload with full loads
+ movss [eax], xmm0
+ movss [edx], xmm6
+ }
+#elif POSIX
+ #warning "_SSE2_SinCos NOT implemented!"
+ Assert( 0 );
+#else
+ #error "Not Implemented"
+#endif
+}
+
+float _SSE2_cos(float x)
+{
+#ifdef _WIN32
+ __asm
+ {
+ movss xmm0, x
+ movss xmm1, _ps_am_inv_sign_mask
+ movss xmm2, _ps_am_pi_o_2
+ movss xmm3, _ps_am_2_o_pi
+ andps xmm0, xmm1
+ addss xmm0, xmm2
+ mulss xmm0, xmm3
+
+ pxor xmm3, xmm3
+ movd xmm5, _epi32_1
+ movss xmm4, _ps_am_1
+ cvttps2dq xmm2, xmm0
+ pand xmm5, xmm2
+ movd xmm1, _epi32_2
+ pcmpeqd xmm5, xmm3
+ cvtdq2ps xmm6, xmm2
+ pand xmm2, xmm1
+ pslld xmm2, (31 - 1)
+
+ subss xmm0, xmm6
+ movss xmm3, _ps_sincos_p3
+ minss xmm0, xmm4
+ subss xmm4, xmm0
+ andps xmm0, xmm5
+ andnps xmm5, xmm4
+ orps xmm0, xmm5
+
+ movaps xmm1, xmm0
+ movss xmm4, _ps_sincos_p2
+ mulss xmm0, xmm0
+ movss xmm5, _ps_sincos_p1
+ orps xmm1, xmm2
+ movaps xmm7, xmm0
+ mulss xmm0, xmm3
+ movss xmm6, _ps_sincos_p0
+ addss xmm0, xmm4
+ mulss xmm0, xmm7
+ addss xmm0, xmm5
+ mulss xmm0, xmm7
+ addss xmm0, xmm6
+ mulss xmm0, xmm1
+ movss x, xmm0
+ }
+#elif POSIX
+ #warning "_SSE2_cos NOT implemented!"
+ Assert( 0 );
+#else
+ #error "Not Implemented"
+#endif
+
+ return x;
+}
+
+// SSE Version of VectorTransform
+void VectorTransformSSE(const float *in1, const matrix3x4_t& in2, float *out1)
+{
+ Assert( s_bMathlibInitialized );
+ Assert( in1 != out1 );
+
+#ifdef _WIN32
+ __asm
+ {
+ mov eax, in1;
+ mov ecx, in2;
+ mov edx, out1;
+
+ movss xmm0, [eax];
+ mulss xmm0, [ecx];
+ movss xmm1, [eax+4];
+ mulss xmm1, [ecx+4];
+ movss xmm2, [eax+8];
+ mulss xmm2, [ecx+8];
+ addss xmm0, xmm1;
+ addss xmm0, xmm2;
+ addss xmm0, [ecx+12]
+ movss [edx], xmm0;
+ add ecx, 16;
+
+ movss xmm0, [eax];
+ mulss xmm0, [ecx];
+ movss xmm1, [eax+4];
+ mulss xmm1, [ecx+4];
+ movss xmm2, [eax+8];
+ mulss xmm2, [ecx+8];
+ addss xmm0, xmm1;
+ addss xmm0, xmm2;
+ addss xmm0, [ecx+12]
+ movss [edx+4], xmm0;
+ add ecx, 16;
+
+ movss xmm0, [eax];
+ mulss xmm0, [ecx];
+ movss xmm1, [eax+4];
+ mulss xmm1, [ecx+4];
+ movss xmm2, [eax+8];
+ mulss xmm2, [ecx+8];
+ addss xmm0, xmm1;
+ addss xmm0, xmm2;
+ addss xmm0, [ecx+12]
+ movss [edx+8], xmm0;
+ }
+#elif POSIX
+ #warning "VectorTransformSSE C implementation only"
+ out1[0] = DotProduct(in1, in2[0]) + in2[0][3];
+ out1[1] = DotProduct(in1, in2[1]) + in2[1][3];
+ out1[2] = DotProduct(in1, in2[2]) + in2[2][3];
+#else
+ #error "Not Implemented"
+#endif
+}
+
+void VectorRotateSSE( const float *in1, const matrix3x4_t& in2, float *out1 )
+{
+ Assert( s_bMathlibInitialized );
+ Assert( in1 != out1 );
+
+#ifdef _WIN32
+ __asm
+ {
+ mov eax, in1;
+ mov ecx, in2;
+ mov edx, out1;
+
+ movss xmm0, [eax];
+ mulss xmm0, [ecx];
+ movss xmm1, [eax+4];
+ mulss xmm1, [ecx+4];
+ movss xmm2, [eax+8];
+ mulss xmm2, [ecx+8];
+ addss xmm0, xmm1;
+ addss xmm0, xmm2;
+ movss [edx], xmm0;
+ add ecx, 16;
+
+ movss xmm0, [eax];
+ mulss xmm0, [ecx];
+ movss xmm1, [eax+4];
+ mulss xmm1, [ecx+4];
+ movss xmm2, [eax+8];
+ mulss xmm2, [ecx+8];
+ addss xmm0, xmm1;
+ addss xmm0, xmm2;
+ movss [edx+4], xmm0;
+ add ecx, 16;
+
+ movss xmm0, [eax];
+ mulss xmm0, [ecx];
+ movss xmm1, [eax+4];
+ mulss xmm1, [ecx+4];
+ movss xmm2, [eax+8];
+ mulss xmm2, [ecx+8];
+ addss xmm0, xmm1;
+ addss xmm0, xmm2;
+ movss [edx+8], xmm0;
+ }
+#elif POSIX
+ #warning "VectorRotateSSE C implementation only"
+ out1[0] = DotProduct( in1, in2[0] );
+ out1[1] = DotProduct( in1, in2[1] );
+ out1[2] = DotProduct( in1, in2[2] );
+#else
+ #error "Not Implemented"
+#endif
+}
+
+#ifdef _WIN32
+void _declspec(naked) _SSE_VectorMA( const float *start, float scale, const float *direction, float *dest )
+{
+	// FIXME: This doesn't work!! It will overwrite memory in the write to dest.
+ Assert(0);
+
+ Assert( s_bMathlibInitialized );
+ _asm { // Intel SSE only routine
+ mov eax, DWORD PTR [esp+0x04] ; *start, s0..s2
+ mov ecx, DWORD PTR [esp+0x0c] ; *direction, d0..d2
+ mov edx, DWORD PTR [esp+0x10] ; *dest
+ movss xmm2, [esp+0x08] ; x2 = scale, 0, 0, 0
+#ifdef ALIGNED_VECTOR
+ movaps xmm3, [ecx] ; x3 = dir0,dir1,dir2,X
+ pshufd xmm2, xmm2, 0 ; x2 = scale, scale, scale, scale
+ movaps xmm1, [eax] ; x1 = start1, start2, start3, X
+ mulps xmm3, xmm2 ; x3 *= x2
+ addps xmm3, xmm1 ; x3 += x1
+ movaps [edx], xmm3 ; *dest = x3
+#else
+ movups xmm3, [ecx] ; x3 = dir0,dir1,dir2,X
+ pshufd xmm2, xmm2, 0 ; x2 = scale, scale, scale, scale
+ movups xmm1, [eax] ; x1 = start1, start2, start3, X
+ mulps xmm3, xmm2 ; x3 *= x2
+ addps xmm3, xmm1 ; x3 += x1
+ movups [edx], xmm3 ; *dest = x3
+#endif
+ }
+}
+#endif
+
+#ifdef _WIN32
+#ifdef PFN_VECTORMA
+void _declspec(naked) __cdecl _SSE_VectorMA( const Vector &start, float scale, const Vector &direction, Vector &dest )
+{
+	// FIXME: This doesn't work!! It will overwrite memory in the write to dest.
+ Assert(0);
+
+ Assert( s_bMathlibInitialized );
+ _asm
+ {
+ // Intel SSE only routine
+ mov eax, DWORD PTR [esp+0x04] ; *start, s0..s2
+ mov ecx, DWORD PTR [esp+0x0c] ; *direction, d0..d2
+ mov edx, DWORD PTR [esp+0x10] ; *dest
+ movss xmm2, [esp+0x08] ; x2 = scale, 0, 0, 0
+#ifdef ALIGNED_VECTOR
+ movaps xmm3, [ecx] ; x3 = dir0,dir1,dir2,X
+ pshufd xmm2, xmm2, 0 ; x2 = scale, scale, scale, scale
+ movaps xmm1, [eax] ; x1 = start1, start2, start3, X
+ mulps xmm3, xmm2 ; x3 *= x2
+ addps xmm3, xmm1 ; x3 += x1
+ movaps [edx], xmm3 ; *dest = x3
+#else
+ movups xmm3, [ecx] ; x3 = dir0,dir1,dir2,X
+ pshufd xmm2, xmm2, 0 ; x2 = scale, scale, scale, scale
+ movups xmm1, [eax] ; x1 = start1, start2, start3, X
+ mulps xmm3, xmm2 ; x3 *= x2
+ addps xmm3, xmm1 ; x3 += x1
+ movups [edx], xmm3 ; *dest = x3
+#endif
+ }
+}
+float (__cdecl *pfVectorMA)(Vector& v) = _VectorMA;
+#endif
+#endif
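Both disabled _SSE_VectorMA variants above share the defect the FIXME describes: the final movaps/movups stores four floats through a pointer to a three-float Vector, clobbering whatever follows it in memory. The operation itself is just dest = start + scale * direction; a safe scalar form (sketch only; the name is illustrative, and mathlib's own C VectorMA already provides this) is:

// dest = start + scale * direction, written one component at a time so that
// only the three floats of the destination Vector are touched.
static void VectorMA_Sketch( const Vector &start, float scale, const Vector &direction, Vector &dest )
{
	dest.x = start.x + scale * direction.x;
	dest.y = start.y + scale * direction.y;
	dest.z = start.z + scale * direction.z;
}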
+
+
+// SSE DotProduct -- it's a smidgen faster than the asm DotProduct...
+// Should be validated too! :)
+// NJS: (Nov 1 2002) -NOT- faster. It may time a couple of cycles faster in a single function like
+// this, but when inlined and instruction-scheduled, the C version is faster.
+// Verified this via VTune.
+/*
+vec_t DotProduct (const vec_t *a, const vec_t *c)
+{
+ vec_t temp;
+
+ __asm
+ {
+ mov eax, a;
+ mov ecx, c;
+ mov edx, DWORD PTR [temp]
+ movss xmm0, [eax];
+ mulss xmm0, [ecx];
+ movss xmm1, [eax+4];
+ mulss xmm1, [ecx+4];
+ movss xmm2, [eax+8];
+ mulss xmm2, [ecx+8];
+ addss xmm0, xmm1;
+ addss xmm0, xmm2;
+ movss [edx], xmm0;
+ fld DWORD PTR [edx];
+ ret
+ }
+}
+*/
+
+#endif // COMPILER_MSVC64