From f56bb35301836e56582a575a75864392a0177875 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B8rgen=20P=2E=20Tjern=C3=B8?= Date: Mon, 2 Dec 2013 19:31:46 -0800 Subject: Fix line endings. WHAMMY. --- mp/src/mathlib/sse.cpp | 2190 ++++++++++++++++++++++++------------------------ 1 file changed, 1095 insertions(+), 1095 deletions(-) (limited to 'mp/src/mathlib/sse.cpp') diff --git a/mp/src/mathlib/sse.cpp b/mp/src/mathlib/sse.cpp index 6e41683f..a2445c79 100644 --- a/mp/src/mathlib/sse.cpp +++ b/mp/src/mathlib/sse.cpp @@ -1,1095 +1,1095 @@ -//========= Copyright Valve Corporation, All rights reserved. ============// -// -// Purpose: SSE Math primitives. -// -//=====================================================================================// - -#include -#include // Needed for FLT_EPSILON -#include "basetypes.h" -#include -#include "tier0/dbg.h" -#include "mathlib/mathlib.h" -#include "mathlib/vector.h" -#include "sse.h" - -// memdbgon must be the last include file in a .cpp file!!! -#include "tier0/memdbgon.h" - -#ifndef COMPILER_MSVC64 -// Implement for 64-bit Windows if needed. - -static const uint32 _sincos_masks[] = { (uint32)0x0, (uint32)~0x0 }; -static const uint32 _sincos_inv_masks[] = { (uint32)~0x0, (uint32)0x0 }; - -//----------------------------------------------------------------------------- -// Macros and constants required by some of the SSE assembly: -//----------------------------------------------------------------------------- - -#ifdef _WIN32 - #define _PS_EXTERN_CONST(Name, Val) \ - const __declspec(align(16)) float _ps_##Name[4] = { Val, Val, Val, Val } - - #define _PS_EXTERN_CONST_TYPE(Name, Type, Val) \ - const __declspec(align(16)) Type _ps_##Name[4] = { Val, Val, Val, Val }; \ - - #define _EPI32_CONST(Name, Val) \ - static const __declspec(align(16)) __int32 _epi32_##Name[4] = { Val, Val, Val, Val } - - #define _PS_CONST(Name, Val) \ - static const __declspec(align(16)) float _ps_##Name[4] = { Val, Val, Val, Val } -#elif POSIX - #define _PS_EXTERN_CONST(Name, Val) \ - const float _ps_##Name[4] __attribute__((aligned(16))) = { Val, Val, Val, Val } - - #define _PS_EXTERN_CONST_TYPE(Name, Type, Val) \ - const Type _ps_##Name[4] __attribute__((aligned(16))) = { Val, Val, Val, Val }; \ - - #define _EPI32_CONST(Name, Val) \ - static const int32 _epi32_##Name[4] __attribute__((aligned(16))) = { Val, Val, Val, Val } - - #define _PS_CONST(Name, Val) \ - static const float _ps_##Name[4] __attribute__((aligned(16))) = { Val, Val, Val, Val } -#endif - -_PS_EXTERN_CONST(am_0, 0.0f); -_PS_EXTERN_CONST(am_1, 1.0f); -_PS_EXTERN_CONST(am_m1, -1.0f); -_PS_EXTERN_CONST(am_0p5, 0.5f); -_PS_EXTERN_CONST(am_1p5, 1.5f); -_PS_EXTERN_CONST(am_pi, (float)M_PI); -_PS_EXTERN_CONST(am_pi_o_2, (float)(M_PI / 2.0)); -_PS_EXTERN_CONST(am_2_o_pi, (float)(2.0 / M_PI)); -_PS_EXTERN_CONST(am_pi_o_4, (float)(M_PI / 4.0)); -_PS_EXTERN_CONST(am_4_o_pi, (float)(4.0 / M_PI)); -_PS_EXTERN_CONST_TYPE(am_sign_mask, int32, 0x80000000); -_PS_EXTERN_CONST_TYPE(am_inv_sign_mask, int32, ~0x80000000); -_PS_EXTERN_CONST_TYPE(am_min_norm_pos,int32, 0x00800000); -_PS_EXTERN_CONST_TYPE(am_mant_mask, int32, 0x7f800000); -_PS_EXTERN_CONST_TYPE(am_inv_mant_mask, int32, ~0x7f800000); - -_EPI32_CONST(1, 1); -_EPI32_CONST(2, 2); - -_PS_CONST(sincos_p0, 0.15707963267948963959e1f); -_PS_CONST(sincos_p1, -0.64596409750621907082e0f); -_PS_CONST(sincos_p2, 0.7969262624561800806e-1f); -_PS_CONST(sincos_p3, -0.468175413106023168e-2f); - -#ifdef PFN_VECTORMA -void __cdecl _SSE_VectorMA( const float *start, float scale, const float *direction, float *dest ); -#endif - -//----------------------------------------------------------------------------- -// SSE implementations of optimized routines: -//----------------------------------------------------------------------------- -float _SSE_Sqrt(float x) -{ - Assert( s_bMathlibInitialized ); - float root = 0.f; -#ifdef _WIN32 - _asm - { - sqrtss xmm0, x - movss root, xmm0 - } -#elif POSIX - _mm_store_ss( &root, _mm_sqrt_ss( _mm_load_ss( &x ) ) ); -#endif - return root; -} - -// Single iteration NewtonRaphson reciprocal square root: -// 0.5 * rsqrtps * (3 - x * rsqrtps(x) * rsqrtps(x)) -// Very low error, and fine to use in place of 1.f / sqrtf(x). -#if 0 -float _SSE_RSqrtAccurate(float x) -{ - Assert( s_bMathlibInitialized ); - - float rroot; - _asm - { - rsqrtss xmm0, x - movss rroot, xmm0 - } - - return (0.5f * rroot) * (3.f - (x * rroot) * rroot); -} -#else - -#ifdef POSIX -const __m128 f3 = _mm_set_ss(3.0f); // 3 as SSE value -const __m128 f05 = _mm_set_ss(0.5f); // 0.5 as SSE value -#endif - -// Intel / Kipps SSE RSqrt. Significantly faster than above. -float _SSE_RSqrtAccurate(float a) -{ - -#ifdef _WIN32 - float x; - float half = 0.5f; - float three = 3.f; - - __asm - { - movss xmm3, a; - movss xmm1, half; - movss xmm2, three; - rsqrtss xmm0, xmm3; - - mulss xmm3, xmm0; - mulss xmm1, xmm0; - mulss xmm3, xmm0; - subss xmm2, xmm3; - mulss xmm1, xmm2; - - movss x, xmm1; - } - - return x; -#elif POSIX - __m128 xx = _mm_load_ss( &a ); - __m128 xr = _mm_rsqrt_ss( xx ); - __m128 xt; - - xt = _mm_mul_ss( xr, xr ); - xt = _mm_mul_ss( xt, xx ); - xt = _mm_sub_ss( f3, xt ); - xt = _mm_mul_ss( xt, f05 ); - xr = _mm_mul_ss( xr, xt ); - - _mm_store_ss( &a, xr ); - return a; -#else - #error "Not Implemented" -#endif - -} -#endif - -// Simple SSE rsqrt. Usually accurate to around 6 (relative) decimal places -// or so, so ok for closed transforms. (ie, computing lighting normals) -float _SSE_RSqrtFast(float x) -{ - Assert( s_bMathlibInitialized ); - - float rroot; -#ifdef _WIN32 - _asm - { - rsqrtss xmm0, x - movss rroot, xmm0 - } -#elif POSIX - __asm__ __volatile__( "rsqrtss %0, %1" : "=x" (rroot) : "x" (x) ); -#else -#error -#endif - - return rroot; -} - -float FASTCALL _SSE_VectorNormalize (Vector& vec) -{ - Assert( s_bMathlibInitialized ); - - // NOTE: This is necessary to prevent an memory overwrite... - // sice vec only has 3 floats, we can't "movaps" directly into it. -#ifdef _WIN32 - __declspec(align(16)) float result[4]; -#elif POSIX - float result[4] __attribute__((aligned(16))); -#endif - - float *v = &vec[0]; - float *r = &result[0]; - - float radius = 0.f; - // Blah, get rid of these comparisons ... in reality, if you have all 3 as zero, it shouldn't - // be much of a performance win, considering you will very likely miss 3 branch predicts in a row. - if ( v[0] || v[1] || v[2] ) - { -#ifdef _WIN32 - _asm - { - mov eax, v - mov edx, r -#ifdef ALIGNED_VECTOR - movaps xmm4, [eax] // r4 = vx, vy, vz, X - movaps xmm1, xmm4 // r1 = r4 -#else - movups xmm4, [eax] // r4 = vx, vy, vz, X - movaps xmm1, xmm4 // r1 = r4 -#endif - mulps xmm1, xmm4 // r1 = vx * vx, vy * vy, vz * vz, X - movhlps xmm3, xmm1 // r3 = vz * vz, X, X, X - movaps xmm2, xmm1 // r2 = r1 - shufps xmm2, xmm2, 1 // r2 = vy * vy, X, X, X - addss xmm1, xmm2 // r1 = (vx * vx) + (vy * vy), X, X, X - addss xmm1, xmm3 // r1 = (vx * vx) + (vy * vy) + (vz * vz), X, X, X - sqrtss xmm1, xmm1 // r1 = sqrt((vx * vx) + (vy * vy) + (vz * vz)), X, X, X - movss radius, xmm1 // radius = sqrt((vx * vx) + (vy * vy) + (vz * vz)) - rcpss xmm1, xmm1 // r1 = 1/radius, X, X, X - shufps xmm1, xmm1, 0 // r1 = 1/radius, 1/radius, 1/radius, X - mulps xmm4, xmm1 // r4 = vx * 1/radius, vy * 1/radius, vz * 1/radius, X - movaps [edx], xmm4 // v = vx * 1/radius, vy * 1/radius, vz * 1/radius, X - } -#elif POSIX - __asm__ __volatile__( -#ifdef ALIGNED_VECTOR - "movaps %2, %%xmm4 \n\t" - "movaps %%xmm4, %%xmm1 \n\t" -#else - "movups %2, %%xmm4 \n\t" - "movaps %%xmm4, %%xmm1 \n\t" -#endif - "mulps %%xmm4, %%xmm1 \n\t" - "movhlps %%xmm1, %%xmm3 \n\t" - "movaps %%xmm1, %%xmm2 \n\t" - "shufps $1, %%xmm2, %%xmm2 \n\t" - "addss %%xmm2, %%xmm1 \n\t" - "addss %%xmm3, %%xmm1 \n\t" - "sqrtss %%xmm1, %%xmm1 \n\t" - "movss %%xmm1, %0 \n\t" - "rcpss %%xmm1, %%xmm1 \n\t" - "shufps $0, %%xmm1, %%xmm1 \n\t" - "mulps %%xmm1, %%xmm4 \n\t" - "movaps %%xmm4, %1 \n\t" - : "=m" (radius), "=m" (result) - : "m" (*v) - ); -#else - #error "Not Implemented" -#endif - vec.x = result[0]; - vec.y = result[1]; - vec.z = result[2]; - - } - - return radius; -} - -void FASTCALL _SSE_VectorNormalizeFast (Vector& vec) -{ - float ool = _SSE_RSqrtAccurate( FLT_EPSILON + vec.x * vec.x + vec.y * vec.y + vec.z * vec.z ); - - vec.x *= ool; - vec.y *= ool; - vec.z *= ool; -} - -float _SSE_InvRSquared(const float* v) -{ - float inv_r2 = 1.f; -#ifdef _WIN32 - _asm { // Intel SSE only routine - mov eax, v - movss xmm5, inv_r2 // x5 = 1.0, 0, 0, 0 -#ifdef ALIGNED_VECTOR - movaps xmm4, [eax] // x4 = vx, vy, vz, X -#else - movups xmm4, [eax] // x4 = vx, vy, vz, X -#endif - movaps xmm1, xmm4 // x1 = x4 - mulps xmm1, xmm4 // x1 = vx * vx, vy * vy, vz * vz, X - movhlps xmm3, xmm1 // x3 = vz * vz, X, X, X - movaps xmm2, xmm1 // x2 = x1 - shufps xmm2, xmm2, 1 // x2 = vy * vy, X, X, X - addss xmm1, xmm2 // x1 = (vx * vx) + (vy * vy), X, X, X - addss xmm1, xmm3 // x1 = (vx * vx) + (vy * vy) + (vz * vz), X, X, X - maxss xmm1, xmm5 // x1 = max( 1.0, x1 ) - rcpss xmm0, xmm1 // x0 = 1 / max( 1.0, x1 ) - movss inv_r2, xmm0 // inv_r2 = x0 - } -#elif POSIX - __asm__ __volatile__( - "movss %0, %%xmm5 \n\t" -#ifdef ALIGNED_VECTOR - "movaps %1, %%xmm4 \n\t" -#else - "movups %1, %%xmm4 \n\t" -#endif - "movaps %%xmm4, %%xmm1 \n\t" - "mulps %%xmm4, %%xmm1 \n\t" - "movhlps %%xmm1, %%xmm3 \n\t" - "movaps %%xmm1, %%xmm2 \n\t" - "shufps $1, %%xmm2, %%xmm2 \n\t" - "addss %%xmm2, %%xmm1 \n\t" - "addss %%xmm3, %%xmm1 \n\t" - "maxss %%xmm5, %%xmm1 \n\t" - "rcpss %%xmm1, %%xmm0 \n\t" - "movss %%xmm0, %0 \n\t" - : "=m" (inv_r2) - : "m" (*v), "0" (inv_r2) - ); -#else - #error "Not Implemented" -#endif - - return inv_r2; -} - - -#ifdef POSIX -// #define _PS_CONST(Name, Val) static const ALIGN16 float _ps_##Name[4] ALIGN16_POST = { Val, Val, Val, Val } -#define _PS_CONST_TYPE(Name, Type, Val) static const ALIGN16 Type _ps_##Name[4] ALIGN16_POST = { Val, Val, Val, Val } - -_PS_CONST_TYPE(sign_mask, int, 0x80000000); -_PS_CONST_TYPE(inv_sign_mask, int, ~0x80000000); - - -#define _PI32_CONST(Name, Val) static const ALIGN16 int _pi32_##Name[4] ALIGN16_POST = { Val, Val, Val, Val } - -_PI32_CONST(1, 1); -_PI32_CONST(inv1, ~1); -_PI32_CONST(2, 2); -_PI32_CONST(4, 4); -_PI32_CONST(0x7f, 0x7f); -_PS_CONST(1 , 1.0f); -_PS_CONST(0p5, 0.5f); - -_PS_CONST(minus_cephes_DP1, -0.78515625); -_PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4); -_PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8); -_PS_CONST(sincof_p0, -1.9515295891E-4); -_PS_CONST(sincof_p1, 8.3321608736E-3); -_PS_CONST(sincof_p2, -1.6666654611E-1); -_PS_CONST(coscof_p0, 2.443315711809948E-005); -_PS_CONST(coscof_p1, -1.388731625493765E-003); -_PS_CONST(coscof_p2, 4.166664568298827E-002); -_PS_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI - -typedef union xmm_mm_union { - __m128 xmm; - __m64 mm[2]; -} xmm_mm_union; - -#define COPY_MM_TO_XMM(mm0_, mm1_, xmm_) { xmm_mm_union u; u.mm[0]=mm0_; u.mm[1]=mm1_; xmm_ = u.xmm; } - -typedef __m128 v4sf; // vector of 4 float (sse1) -typedef __m64 v2si; // vector of 2 int (mmx) - -#endif - -void _SSE_SinCos(float x, float* s, float* c) -{ -#ifdef _WIN32 - float t4, t8, t12; - - __asm - { - movss xmm0, x - movss t12, xmm0 - movss xmm1, _ps_am_inv_sign_mask - mov eax, t12 - mulss xmm0, _ps_am_2_o_pi - andps xmm0, xmm1 - and eax, 0x80000000 - - cvttss2si edx, xmm0 - mov ecx, edx - mov t12, esi - mov esi, edx - add edx, 0x1 - shl ecx, (31 - 1) - shl edx, (31 - 1) - - movss xmm4, _ps_am_1 - cvtsi2ss xmm3, esi - mov t8, eax - and esi, 0x1 - - subss xmm0, xmm3 - movss xmm3, _sincos_inv_masks[esi * 4] - minss xmm0, xmm4 - - subss xmm4, xmm0 - - movss xmm6, xmm4 - andps xmm4, xmm3 - and ecx, 0x80000000 - movss xmm2, xmm3 - andnps xmm3, xmm0 - and edx, 0x80000000 - movss xmm7, t8 - andps xmm0, xmm2 - mov t8, ecx - mov t4, edx - orps xmm4, xmm3 - - mov eax, s //mov eax, [esp + 4 + 16] - mov edx, c //mov edx, [esp + 4 + 16 + 4] - - andnps xmm2, xmm6 - orps xmm0, xmm2 - - movss xmm2, t8 - movss xmm1, xmm0 - movss xmm5, xmm4 - xorps xmm7, xmm2 - movss xmm3, _ps_sincos_p3 - mulss xmm0, xmm0 - mulss xmm4, xmm4 - movss xmm2, xmm0 - movss xmm6, xmm4 - orps xmm1, xmm7 - movss xmm7, _ps_sincos_p2 - mulss xmm0, xmm3 - mulss xmm4, xmm3 - movss xmm3, _ps_sincos_p1 - addss xmm0, xmm7 - addss xmm4, xmm7 - movss xmm7, _ps_sincos_p0 - mulss xmm0, xmm2 - mulss xmm4, xmm6 - addss xmm0, xmm3 - addss xmm4, xmm3 - movss xmm3, t4 - mulss xmm0, xmm2 - mulss xmm4, xmm6 - orps xmm5, xmm3 - mov esi, t12 - addss xmm0, xmm7 - addss xmm4, xmm7 - mulss xmm0, xmm1 - mulss xmm4, xmm5 - - // use full stores since caller might reload with full loads - movss [eax], xmm0 - movss [edx], xmm4 - } -#elif POSIX - - Assert( "Needs testing, verify impl!\n" ); - - v4sf xx = _mm_load_ss( &x ); - - v4sf xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y; - v2si mm0, mm1, mm2, mm3, mm4, mm5; - sign_bit_sin = xx; - /* take the absolute value */ - xx = _mm_and_ps(xx, *(v4sf*)_ps_inv_sign_mask); - /* extract the sign bit (upper one) */ - sign_bit_sin = _mm_and_ps(sign_bit_sin, *(v4sf*)_ps_sign_mask); - - /* scale by 4/Pi */ - y = _mm_mul_ps(xx, *(v4sf*)_ps_cephes_FOPI); - - /* store the integer part of y in mm2:mm3 */ - xmm3 = _mm_movehl_ps(xmm3, y); - mm2 = _mm_cvttps_pi32(y); - mm3 = _mm_cvttps_pi32(xmm3); - - /* j=(j+1) & (~1) (see the cephes sources) */ - mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1); - mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1); - mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1); - mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1); - - y = _mm_cvtpi32x2_ps(mm2, mm3); - - mm4 = mm2; - mm5 = mm3; - - /* get the swap sign flag for the sine */ - mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4); - mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4); - mm0 = _mm_slli_pi32(mm0, 29); - mm1 = _mm_slli_pi32(mm1, 29); - v4sf swap_sign_bit_sin; - COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit_sin); - - /* get the polynom selection mask for the sine */ - - mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2); - mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2); - mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64()); - mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64()); - v4sf poly_mask; - COPY_MM_TO_XMM(mm2, mm3, poly_mask); - - /* The magic pass: "Extended precision modular arithmetic" - x = ((x - y * DP1) - y * DP2) - y * DP3; */ - xmm1 = *(v4sf*)_ps_minus_cephes_DP1; - xmm2 = *(v4sf*)_ps_minus_cephes_DP2; - xmm3 = *(v4sf*)_ps_minus_cephes_DP3; - xmm1 = _mm_mul_ps(y, xmm1); - xmm2 = _mm_mul_ps(y, xmm2); - xmm3 = _mm_mul_ps(y, xmm3); - xx = _mm_add_ps(xx, xmm1); - xx = _mm_add_ps(xx, xmm2); - xx = _mm_add_ps(xx, xmm3); - - /* get the sign flag for the cosine */ - mm4 = _mm_sub_pi32(mm4, *(v2si*)_pi32_2); - mm5 = _mm_sub_pi32(mm5, *(v2si*)_pi32_2); - mm4 = _mm_andnot_si64(mm4, *(v2si*)_pi32_4); - mm5 = _mm_andnot_si64(mm5, *(v2si*)_pi32_4); - mm4 = _mm_slli_pi32(mm4, 29); - mm5 = _mm_slli_pi32(mm5, 29); - v4sf sign_bit_cos; - COPY_MM_TO_XMM(mm4, mm5, sign_bit_cos); - _mm_empty(); /* good-bye mmx */ - - sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin); - - - /* Evaluate the first polynom (0 <= x <= Pi/4) */ - v4sf z = _mm_mul_ps(xx,xx); - y = *(v4sf*)_ps_coscof_p0; - - y = _mm_mul_ps(y, z); - y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1); - y = _mm_mul_ps(y, z); - y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2); - y = _mm_mul_ps(y, z); - y = _mm_mul_ps(y, z); - v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5); - y = _mm_sub_ps(y, tmp); - y = _mm_add_ps(y, *(v4sf*)_ps_1); - - /* Evaluate the second polynom (Pi/4 <= x <= 0) */ - - v4sf y2 = *(v4sf*)_ps_sincof_p0; - y2 = _mm_mul_ps(y2, z); - y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1); - y2 = _mm_mul_ps(y2, z); - y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2); - y2 = _mm_mul_ps(y2, z); - y2 = _mm_mul_ps(y2, xx); - y2 = _mm_add_ps(y2, xx); - - /* select the correct result from the two polynoms */ - xmm3 = poly_mask; - v4sf ysin2 = _mm_and_ps(xmm3, y2); - v4sf ysin1 = _mm_andnot_ps(xmm3, y); - y2 = _mm_sub_ps(y2,ysin2); - y = _mm_sub_ps(y, ysin1); - - xmm1 = _mm_add_ps(ysin1,ysin2); - xmm2 = _mm_add_ps(y,y2); - - /* update the sign */ - _mm_store_ss( s, _mm_xor_ps(xmm1, sign_bit_sin) ); - _mm_store_ss( c, _mm_xor_ps(xmm2, sign_bit_cos) ); - -#else - #error "Not Implemented" -#endif -} - -float _SSE_cos( float x ) -{ -#ifdef _WIN32 - float temp; - __asm - { - movss xmm0, x - movss xmm1, _ps_am_inv_sign_mask - andps xmm0, xmm1 - addss xmm0, _ps_am_pi_o_2 - mulss xmm0, _ps_am_2_o_pi - - cvttss2si ecx, xmm0 - movss xmm5, _ps_am_1 - mov edx, ecx - shl edx, (31 - 1) - cvtsi2ss xmm1, ecx - and edx, 0x80000000 - and ecx, 0x1 - - subss xmm0, xmm1 - movss xmm6, _sincos_masks[ecx * 4] - minss xmm0, xmm5 - - movss xmm1, _ps_sincos_p3 - subss xmm5, xmm0 - - andps xmm5, xmm6 - movss xmm7, _ps_sincos_p2 - andnps xmm6, xmm0 - mov temp, edx - orps xmm5, xmm6 - movss xmm0, xmm5 - - mulss xmm5, xmm5 - movss xmm4, _ps_sincos_p1 - movss xmm2, xmm5 - mulss xmm5, xmm1 - movss xmm1, _ps_sincos_p0 - addss xmm5, xmm7 - mulss xmm5, xmm2 - movss xmm3, temp - addss xmm5, xmm4 - mulss xmm5, xmm2 - orps xmm0, xmm3 - addss xmm5, xmm1 - mulss xmm0, xmm5 - - movss x, xmm0 - - } -#elif POSIX - - Assert( "Needs testing, verify impl!\n" ); - - v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, y; - v2si mm0, mm1, mm2, mm3; - /* take the absolute value */ - v4sf xx = _mm_load_ss( &x ); - - xx = _mm_and_ps(xx, *(v4sf*)_ps_inv_sign_mask); - - /* scale by 4/Pi */ - y = _mm_mul_ps(xx, *(v4sf*)_ps_cephes_FOPI); - - /* store the integer part of y in mm0:mm1 */ - xmm2 = _mm_movehl_ps(xmm2, y); - mm2 = _mm_cvttps_pi32(y); - mm3 = _mm_cvttps_pi32(xmm2); - - /* j=(j+1) & (~1) (see the cephes sources) */ - mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1); - mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1); - mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1); - mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1); - - y = _mm_cvtpi32x2_ps(mm2, mm3); - - - mm2 = _mm_sub_pi32(mm2, *(v2si*)_pi32_2); - mm3 = _mm_sub_pi32(mm3, *(v2si*)_pi32_2); - - /* get the swap sign flag in mm0:mm1 and the - polynom selection mask in mm2:mm3 */ - - mm0 = _mm_andnot_si64(mm2, *(v2si*)_pi32_4); - mm1 = _mm_andnot_si64(mm3, *(v2si*)_pi32_4); - mm0 = _mm_slli_pi32(mm0, 29); - mm1 = _mm_slli_pi32(mm1, 29); - - mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2); - mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2); - - mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64()); - mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64()); - - v4sf sign_bit, poly_mask; - COPY_MM_TO_XMM(mm0, mm1, sign_bit); - COPY_MM_TO_XMM(mm2, mm3, poly_mask); - _mm_empty(); /* good-bye mmx */ - - /* The magic pass: "Extended precision modular arithmetic" - x = ((x - y * DP1) - y * DP2) - y * DP3; */ - xmm1 = *(v4sf*)_ps_minus_cephes_DP1; - xmm2 = *(v4sf*)_ps_minus_cephes_DP2; - xmm3 = *(v4sf*)_ps_minus_cephes_DP3; - xmm1 = _mm_mul_ps(y, xmm1); - xmm2 = _mm_mul_ps(y, xmm2); - xmm3 = _mm_mul_ps(y, xmm3); - xx = _mm_add_ps(xx, xmm1); - xx = _mm_add_ps(xx, xmm2); - xx = _mm_add_ps(xx, xmm3); - - /* Evaluate the first polynom (0 <= x <= Pi/4) */ - y = *(v4sf*)_ps_coscof_p0; - v4sf z = _mm_mul_ps(xx,xx); - - y = _mm_mul_ps(y, z); - y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1); - y = _mm_mul_ps(y, z); - y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2); - y = _mm_mul_ps(y, z); - y = _mm_mul_ps(y, z); - v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5); - y = _mm_sub_ps(y, tmp); - y = _mm_add_ps(y, *(v4sf*)_ps_1); - - /* Evaluate the second polynom (Pi/4 <= x <= 0) */ - - v4sf y2 = *(v4sf*)_ps_sincof_p0; - y2 = _mm_mul_ps(y2, z); - y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1); - y2 = _mm_mul_ps(y2, z); - y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2); - y2 = _mm_mul_ps(y2, z); - y2 = _mm_mul_ps(y2, xx); - y2 = _mm_add_ps(y2, xx); - - /* select the correct result from the two polynoms */ - xmm3 = poly_mask; - y2 = _mm_and_ps(xmm3, y2); //, xmm3); - y = _mm_andnot_ps(xmm3, y); - y = _mm_add_ps(y,y2); - /* update the sign */ - - _mm_store_ss( &x, _mm_xor_ps(y, sign_bit) ); - -#else - #error "Not Implemented" -#endif - - return x; -} - -//----------------------------------------------------------------------------- -// SSE2 implementations of optimized routines: -//----------------------------------------------------------------------------- -void _SSE2_SinCos(float x, float* s, float* c) // any x -{ -#ifdef _WIN32 - __asm - { - movss xmm0, x - movaps xmm7, xmm0 - movss xmm1, _ps_am_inv_sign_mask - movss xmm2, _ps_am_sign_mask - movss xmm3, _ps_am_2_o_pi - andps xmm0, xmm1 - andps xmm7, xmm2 - mulss xmm0, xmm3 - - pxor xmm3, xmm3 - movd xmm5, _epi32_1 - movss xmm4, _ps_am_1 - - cvttps2dq xmm2, xmm0 - pand xmm5, xmm2 - movd xmm1, _epi32_2 - pcmpeqd xmm5, xmm3 - movd xmm3, _epi32_1 - cvtdq2ps xmm6, xmm2 - paddd xmm3, xmm2 - pand xmm2, xmm1 - pand xmm3, xmm1 - subss xmm0, xmm6 - pslld xmm2, (31 - 1) - minss xmm0, xmm4 - - mov eax, s // mov eax, [esp + 4 + 16] - mov edx, c // mov edx, [esp + 4 + 16 + 4] - - subss xmm4, xmm0 - pslld xmm3, (31 - 1) - - movaps xmm6, xmm4 - xorps xmm2, xmm7 - movaps xmm7, xmm5 - andps xmm6, xmm7 - andnps xmm7, xmm0 - andps xmm0, xmm5 - andnps xmm5, xmm4 - movss xmm4, _ps_sincos_p3 - orps xmm6, xmm7 - orps xmm0, xmm5 - movss xmm5, _ps_sincos_p2 - - movaps xmm1, xmm0 - movaps xmm7, xmm6 - mulss xmm0, xmm0 - mulss xmm6, xmm6 - orps xmm1, xmm2 - orps xmm7, xmm3 - movaps xmm2, xmm0 - movaps xmm3, xmm6 - mulss xmm0, xmm4 - mulss xmm6, xmm4 - movss xmm4, _ps_sincos_p1 - addss xmm0, xmm5 - addss xmm6, xmm5 - movss xmm5, _ps_sincos_p0 - mulss xmm0, xmm2 - mulss xmm6, xmm3 - addss xmm0, xmm4 - addss xmm6, xmm4 - mulss xmm0, xmm2 - mulss xmm6, xmm3 - addss xmm0, xmm5 - addss xmm6, xmm5 - mulss xmm0, xmm1 - mulss xmm6, xmm7 - - // use full stores since caller might reload with full loads - movss [eax], xmm0 - movss [edx], xmm6 - } -#elif POSIX - #warning "_SSE2_SinCos NOT implemented!" - Assert( 0 ); -#else - #error "Not Implemented" -#endif -} - -float _SSE2_cos(float x) -{ -#ifdef _WIN32 - __asm - { - movss xmm0, x - movss xmm1, _ps_am_inv_sign_mask - movss xmm2, _ps_am_pi_o_2 - movss xmm3, _ps_am_2_o_pi - andps xmm0, xmm1 - addss xmm0, xmm2 - mulss xmm0, xmm3 - - pxor xmm3, xmm3 - movd xmm5, _epi32_1 - movss xmm4, _ps_am_1 - cvttps2dq xmm2, xmm0 - pand xmm5, xmm2 - movd xmm1, _epi32_2 - pcmpeqd xmm5, xmm3 - cvtdq2ps xmm6, xmm2 - pand xmm2, xmm1 - pslld xmm2, (31 - 1) - - subss xmm0, xmm6 - movss xmm3, _ps_sincos_p3 - minss xmm0, xmm4 - subss xmm4, xmm0 - andps xmm0, xmm5 - andnps xmm5, xmm4 - orps xmm0, xmm5 - - movaps xmm1, xmm0 - movss xmm4, _ps_sincos_p2 - mulss xmm0, xmm0 - movss xmm5, _ps_sincos_p1 - orps xmm1, xmm2 - movaps xmm7, xmm0 - mulss xmm0, xmm3 - movss xmm6, _ps_sincos_p0 - addss xmm0, xmm4 - mulss xmm0, xmm7 - addss xmm0, xmm5 - mulss xmm0, xmm7 - addss xmm0, xmm6 - mulss xmm0, xmm1 - movss x, xmm0 - } -#elif POSIX - #warning "_SSE2_cos NOT implemented!" - Assert( 0 ); -#else - #error "Not Implemented" -#endif - - return x; -} - -// SSE Version of VectorTransform -void VectorTransformSSE(const float *in1, const matrix3x4_t& in2, float *out1) -{ - Assert( s_bMathlibInitialized ); - Assert( in1 != out1 ); - -#ifdef _WIN32 - __asm - { - mov eax, in1; - mov ecx, in2; - mov edx, out1; - - movss xmm0, [eax]; - mulss xmm0, [ecx]; - movss xmm1, [eax+4]; - mulss xmm1, [ecx+4]; - movss xmm2, [eax+8]; - mulss xmm2, [ecx+8]; - addss xmm0, xmm1; - addss xmm0, xmm2; - addss xmm0, [ecx+12] - movss [edx], xmm0; - add ecx, 16; - - movss xmm0, [eax]; - mulss xmm0, [ecx]; - movss xmm1, [eax+4]; - mulss xmm1, [ecx+4]; - movss xmm2, [eax+8]; - mulss xmm2, [ecx+8]; - addss xmm0, xmm1; - addss xmm0, xmm2; - addss xmm0, [ecx+12] - movss [edx+4], xmm0; - add ecx, 16; - - movss xmm0, [eax]; - mulss xmm0, [ecx]; - movss xmm1, [eax+4]; - mulss xmm1, [ecx+4]; - movss xmm2, [eax+8]; - mulss xmm2, [ecx+8]; - addss xmm0, xmm1; - addss xmm0, xmm2; - addss xmm0, [ecx+12] - movss [edx+8], xmm0; - } -#elif POSIX - #warning "VectorTransformSSE C implementation only" - out1[0] = DotProduct(in1, in2[0]) + in2[0][3]; - out1[1] = DotProduct(in1, in2[1]) + in2[1][3]; - out1[2] = DotProduct(in1, in2[2]) + in2[2][3]; -#else - #error "Not Implemented" -#endif -} - -void VectorRotateSSE( const float *in1, const matrix3x4_t& in2, float *out1 ) -{ - Assert( s_bMathlibInitialized ); - Assert( in1 != out1 ); - -#ifdef _WIN32 - __asm - { - mov eax, in1; - mov ecx, in2; - mov edx, out1; - - movss xmm0, [eax]; - mulss xmm0, [ecx]; - movss xmm1, [eax+4]; - mulss xmm1, [ecx+4]; - movss xmm2, [eax+8]; - mulss xmm2, [ecx+8]; - addss xmm0, xmm1; - addss xmm0, xmm2; - movss [edx], xmm0; - add ecx, 16; - - movss xmm0, [eax]; - mulss xmm0, [ecx]; - movss xmm1, [eax+4]; - mulss xmm1, [ecx+4]; - movss xmm2, [eax+8]; - mulss xmm2, [ecx+8]; - addss xmm0, xmm1; - addss xmm0, xmm2; - movss [edx+4], xmm0; - add ecx, 16; - - movss xmm0, [eax]; - mulss xmm0, [ecx]; - movss xmm1, [eax+4]; - mulss xmm1, [ecx+4]; - movss xmm2, [eax+8]; - mulss xmm2, [ecx+8]; - addss xmm0, xmm1; - addss xmm0, xmm2; - movss [edx+8], xmm0; - } -#elif POSIX - #warning "VectorRotateSSE C implementation only" - out1[0] = DotProduct( in1, in2[0] ); - out1[1] = DotProduct( in1, in2[1] ); - out1[2] = DotProduct( in1, in2[2] ); -#else - #error "Not Implemented" -#endif -} - -#ifdef _WIN32 -void _declspec(naked) _SSE_VectorMA( const float *start, float scale, const float *direction, float *dest ) -{ - // FIXME: This don't work!! It will overwrite memory in the write to dest - Assert(0); - - Assert( s_bMathlibInitialized ); - _asm { // Intel SSE only routine - mov eax, DWORD PTR [esp+0x04] ; *start, s0..s2 - mov ecx, DWORD PTR [esp+0x0c] ; *direction, d0..d2 - mov edx, DWORD PTR [esp+0x10] ; *dest - movss xmm2, [esp+0x08] ; x2 = scale, 0, 0, 0 -#ifdef ALIGNED_VECTOR - movaps xmm3, [ecx] ; x3 = dir0,dir1,dir2,X - pshufd xmm2, xmm2, 0 ; x2 = scale, scale, scale, scale - movaps xmm1, [eax] ; x1 = start1, start2, start3, X - mulps xmm3, xmm2 ; x3 *= x2 - addps xmm3, xmm1 ; x3 += x1 - movaps [edx], xmm3 ; *dest = x3 -#else - movups xmm3, [ecx] ; x3 = dir0,dir1,dir2,X - pshufd xmm2, xmm2, 0 ; x2 = scale, scale, scale, scale - movups xmm1, [eax] ; x1 = start1, start2, start3, X - mulps xmm3, xmm2 ; x3 *= x2 - addps xmm3, xmm1 ; x3 += x1 - movups [edx], xmm3 ; *dest = x3 -#endif - } -} -#endif - -#ifdef _WIN32 -#ifdef PFN_VECTORMA -void _declspec(naked) __cdecl _SSE_VectorMA( const Vector &start, float scale, const Vector &direction, Vector &dest ) -{ - // FIXME: This don't work!! It will overwrite memory in the write to dest - Assert(0); - - Assert( s_bMathlibInitialized ); - _asm - { - // Intel SSE only routine - mov eax, DWORD PTR [esp+0x04] ; *start, s0..s2 - mov ecx, DWORD PTR [esp+0x0c] ; *direction, d0..d2 - mov edx, DWORD PTR [esp+0x10] ; *dest - movss xmm2, [esp+0x08] ; x2 = scale, 0, 0, 0 -#ifdef ALIGNED_VECTOR - movaps xmm3, [ecx] ; x3 = dir0,dir1,dir2,X - pshufd xmm2, xmm2, 0 ; x2 = scale, scale, scale, scale - movaps xmm1, [eax] ; x1 = start1, start2, start3, X - mulps xmm3, xmm2 ; x3 *= x2 - addps xmm3, xmm1 ; x3 += x1 - movaps [edx], xmm3 ; *dest = x3 -#else - movups xmm3, [ecx] ; x3 = dir0,dir1,dir2,X - pshufd xmm2, xmm2, 0 ; x2 = scale, scale, scale, scale - movups xmm1, [eax] ; x1 = start1, start2, start3, X - mulps xmm3, xmm2 ; x3 *= x2 - addps xmm3, xmm1 ; x3 += x1 - movups [edx], xmm3 ; *dest = x3 -#endif - } -} -float (__cdecl *pfVectorMA)(Vector& v) = _VectorMA; -#endif -#endif - - -// SSE DotProduct -- it's a smidgen faster than the asm DotProduct... -// Should be validated too! :) -// NJS: (Nov 1 2002) -NOT- faster. may time a couple cycles faster in a single function like -// this, but when inlined, and instruction scheduled, the C version is faster. -// Verified this via VTune -/* -vec_t DotProduct (const vec_t *a, const vec_t *c) -{ - vec_t temp; - - __asm - { - mov eax, a; - mov ecx, c; - mov edx, DWORD PTR [temp] - movss xmm0, [eax]; - mulss xmm0, [ecx]; - movss xmm1, [eax+4]; - mulss xmm1, [ecx+4]; - movss xmm2, [eax+8]; - mulss xmm2, [ecx+8]; - addss xmm0, xmm1; - addss xmm0, xmm2; - movss [edx], xmm0; - fld DWORD PTR [edx]; - ret - } -} -*/ - -#endif // COMPILER_MSVC64 +//========= Copyright Valve Corporation, All rights reserved. ============// +// +// Purpose: SSE Math primitives. +// +//=====================================================================================// + +#include +#include // Needed for FLT_EPSILON +#include "basetypes.h" +#include +#include "tier0/dbg.h" +#include "mathlib/mathlib.h" +#include "mathlib/vector.h" +#include "sse.h" + +// memdbgon must be the last include file in a .cpp file!!! +#include "tier0/memdbgon.h" + +#ifndef COMPILER_MSVC64 +// Implement for 64-bit Windows if needed. + +static const uint32 _sincos_masks[] = { (uint32)0x0, (uint32)~0x0 }; +static const uint32 _sincos_inv_masks[] = { (uint32)~0x0, (uint32)0x0 }; + +//----------------------------------------------------------------------------- +// Macros and constants required by some of the SSE assembly: +//----------------------------------------------------------------------------- + +#ifdef _WIN32 + #define _PS_EXTERN_CONST(Name, Val) \ + const __declspec(align(16)) float _ps_##Name[4] = { Val, Val, Val, Val } + + #define _PS_EXTERN_CONST_TYPE(Name, Type, Val) \ + const __declspec(align(16)) Type _ps_##Name[4] = { Val, Val, Val, Val }; \ + + #define _EPI32_CONST(Name, Val) \ + static const __declspec(align(16)) __int32 _epi32_##Name[4] = { Val, Val, Val, Val } + + #define _PS_CONST(Name, Val) \ + static const __declspec(align(16)) float _ps_##Name[4] = { Val, Val, Val, Val } +#elif POSIX + #define _PS_EXTERN_CONST(Name, Val) \ + const float _ps_##Name[4] __attribute__((aligned(16))) = { Val, Val, Val, Val } + + #define _PS_EXTERN_CONST_TYPE(Name, Type, Val) \ + const Type _ps_##Name[4] __attribute__((aligned(16))) = { Val, Val, Val, Val }; \ + + #define _EPI32_CONST(Name, Val) \ + static const int32 _epi32_##Name[4] __attribute__((aligned(16))) = { Val, Val, Val, Val } + + #define _PS_CONST(Name, Val) \ + static const float _ps_##Name[4] __attribute__((aligned(16))) = { Val, Val, Val, Val } +#endif + +_PS_EXTERN_CONST(am_0, 0.0f); +_PS_EXTERN_CONST(am_1, 1.0f); +_PS_EXTERN_CONST(am_m1, -1.0f); +_PS_EXTERN_CONST(am_0p5, 0.5f); +_PS_EXTERN_CONST(am_1p5, 1.5f); +_PS_EXTERN_CONST(am_pi, (float)M_PI); +_PS_EXTERN_CONST(am_pi_o_2, (float)(M_PI / 2.0)); +_PS_EXTERN_CONST(am_2_o_pi, (float)(2.0 / M_PI)); +_PS_EXTERN_CONST(am_pi_o_4, (float)(M_PI / 4.0)); +_PS_EXTERN_CONST(am_4_o_pi, (float)(4.0 / M_PI)); +_PS_EXTERN_CONST_TYPE(am_sign_mask, int32, 0x80000000); +_PS_EXTERN_CONST_TYPE(am_inv_sign_mask, int32, ~0x80000000); +_PS_EXTERN_CONST_TYPE(am_min_norm_pos,int32, 0x00800000); +_PS_EXTERN_CONST_TYPE(am_mant_mask, int32, 0x7f800000); +_PS_EXTERN_CONST_TYPE(am_inv_mant_mask, int32, ~0x7f800000); + +_EPI32_CONST(1, 1); +_EPI32_CONST(2, 2); + +_PS_CONST(sincos_p0, 0.15707963267948963959e1f); +_PS_CONST(sincos_p1, -0.64596409750621907082e0f); +_PS_CONST(sincos_p2, 0.7969262624561800806e-1f); +_PS_CONST(sincos_p3, -0.468175413106023168e-2f); + +#ifdef PFN_VECTORMA +void __cdecl _SSE_VectorMA( const float *start, float scale, const float *direction, float *dest ); +#endif + +//----------------------------------------------------------------------------- +// SSE implementations of optimized routines: +//----------------------------------------------------------------------------- +float _SSE_Sqrt(float x) +{ + Assert( s_bMathlibInitialized ); + float root = 0.f; +#ifdef _WIN32 + _asm + { + sqrtss xmm0, x + movss root, xmm0 + } +#elif POSIX + _mm_store_ss( &root, _mm_sqrt_ss( _mm_load_ss( &x ) ) ); +#endif + return root; +} + +// Single iteration NewtonRaphson reciprocal square root: +// 0.5 * rsqrtps * (3 - x * rsqrtps(x) * rsqrtps(x)) +// Very low error, and fine to use in place of 1.f / sqrtf(x). +#if 0 +float _SSE_RSqrtAccurate(float x) +{ + Assert( s_bMathlibInitialized ); + + float rroot; + _asm + { + rsqrtss xmm0, x + movss rroot, xmm0 + } + + return (0.5f * rroot) * (3.f - (x * rroot) * rroot); +} +#else + +#ifdef POSIX +const __m128 f3 = _mm_set_ss(3.0f); // 3 as SSE value +const __m128 f05 = _mm_set_ss(0.5f); // 0.5 as SSE value +#endif + +// Intel / Kipps SSE RSqrt. Significantly faster than above. +float _SSE_RSqrtAccurate(float a) +{ + +#ifdef _WIN32 + float x; + float half = 0.5f; + float three = 3.f; + + __asm + { + movss xmm3, a; + movss xmm1, half; + movss xmm2, three; + rsqrtss xmm0, xmm3; + + mulss xmm3, xmm0; + mulss xmm1, xmm0; + mulss xmm3, xmm0; + subss xmm2, xmm3; + mulss xmm1, xmm2; + + movss x, xmm1; + } + + return x; +#elif POSIX + __m128 xx = _mm_load_ss( &a ); + __m128 xr = _mm_rsqrt_ss( xx ); + __m128 xt; + + xt = _mm_mul_ss( xr, xr ); + xt = _mm_mul_ss( xt, xx ); + xt = _mm_sub_ss( f3, xt ); + xt = _mm_mul_ss( xt, f05 ); + xr = _mm_mul_ss( xr, xt ); + + _mm_store_ss( &a, xr ); + return a; +#else + #error "Not Implemented" +#endif + +} +#endif + +// Simple SSE rsqrt. Usually accurate to around 6 (relative) decimal places +// or so, so ok for closed transforms. (ie, computing lighting normals) +float _SSE_RSqrtFast(float x) +{ + Assert( s_bMathlibInitialized ); + + float rroot; +#ifdef _WIN32 + _asm + { + rsqrtss xmm0, x + movss rroot, xmm0 + } +#elif POSIX + __asm__ __volatile__( "rsqrtss %0, %1" : "=x" (rroot) : "x" (x) ); +#else +#error +#endif + + return rroot; +} + +float FASTCALL _SSE_VectorNormalize (Vector& vec) +{ + Assert( s_bMathlibInitialized ); + + // NOTE: This is necessary to prevent an memory overwrite... + // sice vec only has 3 floats, we can't "movaps" directly into it. +#ifdef _WIN32 + __declspec(align(16)) float result[4]; +#elif POSIX + float result[4] __attribute__((aligned(16))); +#endif + + float *v = &vec[0]; + float *r = &result[0]; + + float radius = 0.f; + // Blah, get rid of these comparisons ... in reality, if you have all 3 as zero, it shouldn't + // be much of a performance win, considering you will very likely miss 3 branch predicts in a row. + if ( v[0] || v[1] || v[2] ) + { +#ifdef _WIN32 + _asm + { + mov eax, v + mov edx, r +#ifdef ALIGNED_VECTOR + movaps xmm4, [eax] // r4 = vx, vy, vz, X + movaps xmm1, xmm4 // r1 = r4 +#else + movups xmm4, [eax] // r4 = vx, vy, vz, X + movaps xmm1, xmm4 // r1 = r4 +#endif + mulps xmm1, xmm4 // r1 = vx * vx, vy * vy, vz * vz, X + movhlps xmm3, xmm1 // r3 = vz * vz, X, X, X + movaps xmm2, xmm1 // r2 = r1 + shufps xmm2, xmm2, 1 // r2 = vy * vy, X, X, X + addss xmm1, xmm2 // r1 = (vx * vx) + (vy * vy), X, X, X + addss xmm1, xmm3 // r1 = (vx * vx) + (vy * vy) + (vz * vz), X, X, X + sqrtss xmm1, xmm1 // r1 = sqrt((vx * vx) + (vy * vy) + (vz * vz)), X, X, X + movss radius, xmm1 // radius = sqrt((vx * vx) + (vy * vy) + (vz * vz)) + rcpss xmm1, xmm1 // r1 = 1/radius, X, X, X + shufps xmm1, xmm1, 0 // r1 = 1/radius, 1/radius, 1/radius, X + mulps xmm4, xmm1 // r4 = vx * 1/radius, vy * 1/radius, vz * 1/radius, X + movaps [edx], xmm4 // v = vx * 1/radius, vy * 1/radius, vz * 1/radius, X + } +#elif POSIX + __asm__ __volatile__( +#ifdef ALIGNED_VECTOR + "movaps %2, %%xmm4 \n\t" + "movaps %%xmm4, %%xmm1 \n\t" +#else + "movups %2, %%xmm4 \n\t" + "movaps %%xmm4, %%xmm1 \n\t" +#endif + "mulps %%xmm4, %%xmm1 \n\t" + "movhlps %%xmm1, %%xmm3 \n\t" + "movaps %%xmm1, %%xmm2 \n\t" + "shufps $1, %%xmm2, %%xmm2 \n\t" + "addss %%xmm2, %%xmm1 \n\t" + "addss %%xmm3, %%xmm1 \n\t" + "sqrtss %%xmm1, %%xmm1 \n\t" + "movss %%xmm1, %0 \n\t" + "rcpss %%xmm1, %%xmm1 \n\t" + "shufps $0, %%xmm1, %%xmm1 \n\t" + "mulps %%xmm1, %%xmm4 \n\t" + "movaps %%xmm4, %1 \n\t" + : "=m" (radius), "=m" (result) + : "m" (*v) + ); +#else + #error "Not Implemented" +#endif + vec.x = result[0]; + vec.y = result[1]; + vec.z = result[2]; + + } + + return radius; +} + +void FASTCALL _SSE_VectorNormalizeFast (Vector& vec) +{ + float ool = _SSE_RSqrtAccurate( FLT_EPSILON + vec.x * vec.x + vec.y * vec.y + vec.z * vec.z ); + + vec.x *= ool; + vec.y *= ool; + vec.z *= ool; +} + +float _SSE_InvRSquared(const float* v) +{ + float inv_r2 = 1.f; +#ifdef _WIN32 + _asm { // Intel SSE only routine + mov eax, v + movss xmm5, inv_r2 // x5 = 1.0, 0, 0, 0 +#ifdef ALIGNED_VECTOR + movaps xmm4, [eax] // x4 = vx, vy, vz, X +#else + movups xmm4, [eax] // x4 = vx, vy, vz, X +#endif + movaps xmm1, xmm4 // x1 = x4 + mulps xmm1, xmm4 // x1 = vx * vx, vy * vy, vz * vz, X + movhlps xmm3, xmm1 // x3 = vz * vz, X, X, X + movaps xmm2, xmm1 // x2 = x1 + shufps xmm2, xmm2, 1 // x2 = vy * vy, X, X, X + addss xmm1, xmm2 // x1 = (vx * vx) + (vy * vy), X, X, X + addss xmm1, xmm3 // x1 = (vx * vx) + (vy * vy) + (vz * vz), X, X, X + maxss xmm1, xmm5 // x1 = max( 1.0, x1 ) + rcpss xmm0, xmm1 // x0 = 1 / max( 1.0, x1 ) + movss inv_r2, xmm0 // inv_r2 = x0 + } +#elif POSIX + __asm__ __volatile__( + "movss %0, %%xmm5 \n\t" +#ifdef ALIGNED_VECTOR + "movaps %1, %%xmm4 \n\t" +#else + "movups %1, %%xmm4 \n\t" +#endif + "movaps %%xmm4, %%xmm1 \n\t" + "mulps %%xmm4, %%xmm1 \n\t" + "movhlps %%xmm1, %%xmm3 \n\t" + "movaps %%xmm1, %%xmm2 \n\t" + "shufps $1, %%xmm2, %%xmm2 \n\t" + "addss %%xmm2, %%xmm1 \n\t" + "addss %%xmm3, %%xmm1 \n\t" + "maxss %%xmm5, %%xmm1 \n\t" + "rcpss %%xmm1, %%xmm0 \n\t" + "movss %%xmm0, %0 \n\t" + : "=m" (inv_r2) + : "m" (*v), "0" (inv_r2) + ); +#else + #error "Not Implemented" +#endif + + return inv_r2; +} + + +#ifdef POSIX +// #define _PS_CONST(Name, Val) static const ALIGN16 float _ps_##Name[4] ALIGN16_POST = { Val, Val, Val, Val } +#define _PS_CONST_TYPE(Name, Type, Val) static const ALIGN16 Type _ps_##Name[4] ALIGN16_POST = { Val, Val, Val, Val } + +_PS_CONST_TYPE(sign_mask, int, 0x80000000); +_PS_CONST_TYPE(inv_sign_mask, int, ~0x80000000); + + +#define _PI32_CONST(Name, Val) static const ALIGN16 int _pi32_##Name[4] ALIGN16_POST = { Val, Val, Val, Val } + +_PI32_CONST(1, 1); +_PI32_CONST(inv1, ~1); +_PI32_CONST(2, 2); +_PI32_CONST(4, 4); +_PI32_CONST(0x7f, 0x7f); +_PS_CONST(1 , 1.0f); +_PS_CONST(0p5, 0.5f); + +_PS_CONST(minus_cephes_DP1, -0.78515625); +_PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4); +_PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8); +_PS_CONST(sincof_p0, -1.9515295891E-4); +_PS_CONST(sincof_p1, 8.3321608736E-3); +_PS_CONST(sincof_p2, -1.6666654611E-1); +_PS_CONST(coscof_p0, 2.443315711809948E-005); +_PS_CONST(coscof_p1, -1.388731625493765E-003); +_PS_CONST(coscof_p2, 4.166664568298827E-002); +_PS_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI + +typedef union xmm_mm_union { + __m128 xmm; + __m64 mm[2]; +} xmm_mm_union; + +#define COPY_MM_TO_XMM(mm0_, mm1_, xmm_) { xmm_mm_union u; u.mm[0]=mm0_; u.mm[1]=mm1_; xmm_ = u.xmm; } + +typedef __m128 v4sf; // vector of 4 float (sse1) +typedef __m64 v2si; // vector of 2 int (mmx) + +#endif + +void _SSE_SinCos(float x, float* s, float* c) +{ +#ifdef _WIN32 + float t4, t8, t12; + + __asm + { + movss xmm0, x + movss t12, xmm0 + movss xmm1, _ps_am_inv_sign_mask + mov eax, t12 + mulss xmm0, _ps_am_2_o_pi + andps xmm0, xmm1 + and eax, 0x80000000 + + cvttss2si edx, xmm0 + mov ecx, edx + mov t12, esi + mov esi, edx + add edx, 0x1 + shl ecx, (31 - 1) + shl edx, (31 - 1) + + movss xmm4, _ps_am_1 + cvtsi2ss xmm3, esi + mov t8, eax + and esi, 0x1 + + subss xmm0, xmm3 + movss xmm3, _sincos_inv_masks[esi * 4] + minss xmm0, xmm4 + + subss xmm4, xmm0 + + movss xmm6, xmm4 + andps xmm4, xmm3 + and ecx, 0x80000000 + movss xmm2, xmm3 + andnps xmm3, xmm0 + and edx, 0x80000000 + movss xmm7, t8 + andps xmm0, xmm2 + mov t8, ecx + mov t4, edx + orps xmm4, xmm3 + + mov eax, s //mov eax, [esp + 4 + 16] + mov edx, c //mov edx, [esp + 4 + 16 + 4] + + andnps xmm2, xmm6 + orps xmm0, xmm2 + + movss xmm2, t8 + movss xmm1, xmm0 + movss xmm5, xmm4 + xorps xmm7, xmm2 + movss xmm3, _ps_sincos_p3 + mulss xmm0, xmm0 + mulss xmm4, xmm4 + movss xmm2, xmm0 + movss xmm6, xmm4 + orps xmm1, xmm7 + movss xmm7, _ps_sincos_p2 + mulss xmm0, xmm3 + mulss xmm4, xmm3 + movss xmm3, _ps_sincos_p1 + addss xmm0, xmm7 + addss xmm4, xmm7 + movss xmm7, _ps_sincos_p0 + mulss xmm0, xmm2 + mulss xmm4, xmm6 + addss xmm0, xmm3 + addss xmm4, xmm3 + movss xmm3, t4 + mulss xmm0, xmm2 + mulss xmm4, xmm6 + orps xmm5, xmm3 + mov esi, t12 + addss xmm0, xmm7 + addss xmm4, xmm7 + mulss xmm0, xmm1 + mulss xmm4, xmm5 + + // use full stores since caller might reload with full loads + movss [eax], xmm0 + movss [edx], xmm4 + } +#elif POSIX + + Assert( "Needs testing, verify impl!\n" ); + + v4sf xx = _mm_load_ss( &x ); + + v4sf xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y; + v2si mm0, mm1, mm2, mm3, mm4, mm5; + sign_bit_sin = xx; + /* take the absolute value */ + xx = _mm_and_ps(xx, *(v4sf*)_ps_inv_sign_mask); + /* extract the sign bit (upper one) */ + sign_bit_sin = _mm_and_ps(sign_bit_sin, *(v4sf*)_ps_sign_mask); + + /* scale by 4/Pi */ + y = _mm_mul_ps(xx, *(v4sf*)_ps_cephes_FOPI); + + /* store the integer part of y in mm2:mm3 */ + xmm3 = _mm_movehl_ps(xmm3, y); + mm2 = _mm_cvttps_pi32(y); + mm3 = _mm_cvttps_pi32(xmm3); + + /* j=(j+1) & (~1) (see the cephes sources) */ + mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1); + mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1); + mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1); + mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1); + + y = _mm_cvtpi32x2_ps(mm2, mm3); + + mm4 = mm2; + mm5 = mm3; + + /* get the swap sign flag for the sine */ + mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4); + mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4); + mm0 = _mm_slli_pi32(mm0, 29); + mm1 = _mm_slli_pi32(mm1, 29); + v4sf swap_sign_bit_sin; + COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit_sin); + + /* get the polynom selection mask for the sine */ + + mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2); + mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2); + mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64()); + mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64()); + v4sf poly_mask; + COPY_MM_TO_XMM(mm2, mm3, poly_mask); + + /* The magic pass: "Extended precision modular arithmetic" + x = ((x - y * DP1) - y * DP2) - y * DP3; */ + xmm1 = *(v4sf*)_ps_minus_cephes_DP1; + xmm2 = *(v4sf*)_ps_minus_cephes_DP2; + xmm3 = *(v4sf*)_ps_minus_cephes_DP3; + xmm1 = _mm_mul_ps(y, xmm1); + xmm2 = _mm_mul_ps(y, xmm2); + xmm3 = _mm_mul_ps(y, xmm3); + xx = _mm_add_ps(xx, xmm1); + xx = _mm_add_ps(xx, xmm2); + xx = _mm_add_ps(xx, xmm3); + + /* get the sign flag for the cosine */ + mm4 = _mm_sub_pi32(mm4, *(v2si*)_pi32_2); + mm5 = _mm_sub_pi32(mm5, *(v2si*)_pi32_2); + mm4 = _mm_andnot_si64(mm4, *(v2si*)_pi32_4); + mm5 = _mm_andnot_si64(mm5, *(v2si*)_pi32_4); + mm4 = _mm_slli_pi32(mm4, 29); + mm5 = _mm_slli_pi32(mm5, 29); + v4sf sign_bit_cos; + COPY_MM_TO_XMM(mm4, mm5, sign_bit_cos); + _mm_empty(); /* good-bye mmx */ + + sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin); + + + /* Evaluate the first polynom (0 <= x <= Pi/4) */ + v4sf z = _mm_mul_ps(xx,xx); + y = *(v4sf*)_ps_coscof_p0; + + y = _mm_mul_ps(y, z); + y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1); + y = _mm_mul_ps(y, z); + y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2); + y = _mm_mul_ps(y, z); + y = _mm_mul_ps(y, z); + v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5); + y = _mm_sub_ps(y, tmp); + y = _mm_add_ps(y, *(v4sf*)_ps_1); + + /* Evaluate the second polynom (Pi/4 <= x <= 0) */ + + v4sf y2 = *(v4sf*)_ps_sincof_p0; + y2 = _mm_mul_ps(y2, z); + y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1); + y2 = _mm_mul_ps(y2, z); + y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2); + y2 = _mm_mul_ps(y2, z); + y2 = _mm_mul_ps(y2, xx); + y2 = _mm_add_ps(y2, xx); + + /* select the correct result from the two polynoms */ + xmm3 = poly_mask; + v4sf ysin2 = _mm_and_ps(xmm3, y2); + v4sf ysin1 = _mm_andnot_ps(xmm3, y); + y2 = _mm_sub_ps(y2,ysin2); + y = _mm_sub_ps(y, ysin1); + + xmm1 = _mm_add_ps(ysin1,ysin2); + xmm2 = _mm_add_ps(y,y2); + + /* update the sign */ + _mm_store_ss( s, _mm_xor_ps(xmm1, sign_bit_sin) ); + _mm_store_ss( c, _mm_xor_ps(xmm2, sign_bit_cos) ); + +#else + #error "Not Implemented" +#endif +} + +float _SSE_cos( float x ) +{ +#ifdef _WIN32 + float temp; + __asm + { + movss xmm0, x + movss xmm1, _ps_am_inv_sign_mask + andps xmm0, xmm1 + addss xmm0, _ps_am_pi_o_2 + mulss xmm0, _ps_am_2_o_pi + + cvttss2si ecx, xmm0 + movss xmm5, _ps_am_1 + mov edx, ecx + shl edx, (31 - 1) + cvtsi2ss xmm1, ecx + and edx, 0x80000000 + and ecx, 0x1 + + subss xmm0, xmm1 + movss xmm6, _sincos_masks[ecx * 4] + minss xmm0, xmm5 + + movss xmm1, _ps_sincos_p3 + subss xmm5, xmm0 + + andps xmm5, xmm6 + movss xmm7, _ps_sincos_p2 + andnps xmm6, xmm0 + mov temp, edx + orps xmm5, xmm6 + movss xmm0, xmm5 + + mulss xmm5, xmm5 + movss xmm4, _ps_sincos_p1 + movss xmm2, xmm5 + mulss xmm5, xmm1 + movss xmm1, _ps_sincos_p0 + addss xmm5, xmm7 + mulss xmm5, xmm2 + movss xmm3, temp + addss xmm5, xmm4 + mulss xmm5, xmm2 + orps xmm0, xmm3 + addss xmm5, xmm1 + mulss xmm0, xmm5 + + movss x, xmm0 + + } +#elif POSIX + + Assert( "Needs testing, verify impl!\n" ); + + v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, y; + v2si mm0, mm1, mm2, mm3; + /* take the absolute value */ + v4sf xx = _mm_load_ss( &x ); + + xx = _mm_and_ps(xx, *(v4sf*)_ps_inv_sign_mask); + + /* scale by 4/Pi */ + y = _mm_mul_ps(xx, *(v4sf*)_ps_cephes_FOPI); + + /* store the integer part of y in mm0:mm1 */ + xmm2 = _mm_movehl_ps(xmm2, y); + mm2 = _mm_cvttps_pi32(y); + mm3 = _mm_cvttps_pi32(xmm2); + + /* j=(j+1) & (~1) (see the cephes sources) */ + mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1); + mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1); + mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1); + mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1); + + y = _mm_cvtpi32x2_ps(mm2, mm3); + + + mm2 = _mm_sub_pi32(mm2, *(v2si*)_pi32_2); + mm3 = _mm_sub_pi32(mm3, *(v2si*)_pi32_2); + + /* get the swap sign flag in mm0:mm1 and the + polynom selection mask in mm2:mm3 */ + + mm0 = _mm_andnot_si64(mm2, *(v2si*)_pi32_4); + mm1 = _mm_andnot_si64(mm3, *(v2si*)_pi32_4); + mm0 = _mm_slli_pi32(mm0, 29); + mm1 = _mm_slli_pi32(mm1, 29); + + mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2); + mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2); + + mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64()); + mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64()); + + v4sf sign_bit, poly_mask; + COPY_MM_TO_XMM(mm0, mm1, sign_bit); + COPY_MM_TO_XMM(mm2, mm3, poly_mask); + _mm_empty(); /* good-bye mmx */ + + /* The magic pass: "Extended precision modular arithmetic" + x = ((x - y * DP1) - y * DP2) - y * DP3; */ + xmm1 = *(v4sf*)_ps_minus_cephes_DP1; + xmm2 = *(v4sf*)_ps_minus_cephes_DP2; + xmm3 = *(v4sf*)_ps_minus_cephes_DP3; + xmm1 = _mm_mul_ps(y, xmm1); + xmm2 = _mm_mul_ps(y, xmm2); + xmm3 = _mm_mul_ps(y, xmm3); + xx = _mm_add_ps(xx, xmm1); + xx = _mm_add_ps(xx, xmm2); + xx = _mm_add_ps(xx, xmm3); + + /* Evaluate the first polynom (0 <= x <= Pi/4) */ + y = *(v4sf*)_ps_coscof_p0; + v4sf z = _mm_mul_ps(xx,xx); + + y = _mm_mul_ps(y, z); + y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1); + y = _mm_mul_ps(y, z); + y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2); + y = _mm_mul_ps(y, z); + y = _mm_mul_ps(y, z); + v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5); + y = _mm_sub_ps(y, tmp); + y = _mm_add_ps(y, *(v4sf*)_ps_1); + + /* Evaluate the second polynom (Pi/4 <= x <= 0) */ + + v4sf y2 = *(v4sf*)_ps_sincof_p0; + y2 = _mm_mul_ps(y2, z); + y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1); + y2 = _mm_mul_ps(y2, z); + y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2); + y2 = _mm_mul_ps(y2, z); + y2 = _mm_mul_ps(y2, xx); + y2 = _mm_add_ps(y2, xx); + + /* select the correct result from the two polynoms */ + xmm3 = poly_mask; + y2 = _mm_and_ps(xmm3, y2); //, xmm3); + y = _mm_andnot_ps(xmm3, y); + y = _mm_add_ps(y,y2); + /* update the sign */ + + _mm_store_ss( &x, _mm_xor_ps(y, sign_bit) ); + +#else + #error "Not Implemented" +#endif + + return x; +} + +//----------------------------------------------------------------------------- +// SSE2 implementations of optimized routines: +//----------------------------------------------------------------------------- +void _SSE2_SinCos(float x, float* s, float* c) // any x +{ +#ifdef _WIN32 + __asm + { + movss xmm0, x + movaps xmm7, xmm0 + movss xmm1, _ps_am_inv_sign_mask + movss xmm2, _ps_am_sign_mask + movss xmm3, _ps_am_2_o_pi + andps xmm0, xmm1 + andps xmm7, xmm2 + mulss xmm0, xmm3 + + pxor xmm3, xmm3 + movd xmm5, _epi32_1 + movss xmm4, _ps_am_1 + + cvttps2dq xmm2, xmm0 + pand xmm5, xmm2 + movd xmm1, _epi32_2 + pcmpeqd xmm5, xmm3 + movd xmm3, _epi32_1 + cvtdq2ps xmm6, xmm2 + paddd xmm3, xmm2 + pand xmm2, xmm1 + pand xmm3, xmm1 + subss xmm0, xmm6 + pslld xmm2, (31 - 1) + minss xmm0, xmm4 + + mov eax, s // mov eax, [esp + 4 + 16] + mov edx, c // mov edx, [esp + 4 + 16 + 4] + + subss xmm4, xmm0 + pslld xmm3, (31 - 1) + + movaps xmm6, xmm4 + xorps xmm2, xmm7 + movaps xmm7, xmm5 + andps xmm6, xmm7 + andnps xmm7, xmm0 + andps xmm0, xmm5 + andnps xmm5, xmm4 + movss xmm4, _ps_sincos_p3 + orps xmm6, xmm7 + orps xmm0, xmm5 + movss xmm5, _ps_sincos_p2 + + movaps xmm1, xmm0 + movaps xmm7, xmm6 + mulss xmm0, xmm0 + mulss xmm6, xmm6 + orps xmm1, xmm2 + orps xmm7, xmm3 + movaps xmm2, xmm0 + movaps xmm3, xmm6 + mulss xmm0, xmm4 + mulss xmm6, xmm4 + movss xmm4, _ps_sincos_p1 + addss xmm0, xmm5 + addss xmm6, xmm5 + movss xmm5, _ps_sincos_p0 + mulss xmm0, xmm2 + mulss xmm6, xmm3 + addss xmm0, xmm4 + addss xmm6, xmm4 + mulss xmm0, xmm2 + mulss xmm6, xmm3 + addss xmm0, xmm5 + addss xmm6, xmm5 + mulss xmm0, xmm1 + mulss xmm6, xmm7 + + // use full stores since caller might reload with full loads + movss [eax], xmm0 + movss [edx], xmm6 + } +#elif POSIX + #warning "_SSE2_SinCos NOT implemented!" + Assert( 0 ); +#else + #error "Not Implemented" +#endif +} + +float _SSE2_cos(float x) +{ +#ifdef _WIN32 + __asm + { + movss xmm0, x + movss xmm1, _ps_am_inv_sign_mask + movss xmm2, _ps_am_pi_o_2 + movss xmm3, _ps_am_2_o_pi + andps xmm0, xmm1 + addss xmm0, xmm2 + mulss xmm0, xmm3 + + pxor xmm3, xmm3 + movd xmm5, _epi32_1 + movss xmm4, _ps_am_1 + cvttps2dq xmm2, xmm0 + pand xmm5, xmm2 + movd xmm1, _epi32_2 + pcmpeqd xmm5, xmm3 + cvtdq2ps xmm6, xmm2 + pand xmm2, xmm1 + pslld xmm2, (31 - 1) + + subss xmm0, xmm6 + movss xmm3, _ps_sincos_p3 + minss xmm0, xmm4 + subss xmm4, xmm0 + andps xmm0, xmm5 + andnps xmm5, xmm4 + orps xmm0, xmm5 + + movaps xmm1, xmm0 + movss xmm4, _ps_sincos_p2 + mulss xmm0, xmm0 + movss xmm5, _ps_sincos_p1 + orps xmm1, xmm2 + movaps xmm7, xmm0 + mulss xmm0, xmm3 + movss xmm6, _ps_sincos_p0 + addss xmm0, xmm4 + mulss xmm0, xmm7 + addss xmm0, xmm5 + mulss xmm0, xmm7 + addss xmm0, xmm6 + mulss xmm0, xmm1 + movss x, xmm0 + } +#elif POSIX + #warning "_SSE2_cos NOT implemented!" + Assert( 0 ); +#else + #error "Not Implemented" +#endif + + return x; +} + +// SSE Version of VectorTransform +void VectorTransformSSE(const float *in1, const matrix3x4_t& in2, float *out1) +{ + Assert( s_bMathlibInitialized ); + Assert( in1 != out1 ); + +#ifdef _WIN32 + __asm + { + mov eax, in1; + mov ecx, in2; + mov edx, out1; + + movss xmm0, [eax]; + mulss xmm0, [ecx]; + movss xmm1, [eax+4]; + mulss xmm1, [ecx+4]; + movss xmm2, [eax+8]; + mulss xmm2, [ecx+8]; + addss xmm0, xmm1; + addss xmm0, xmm2; + addss xmm0, [ecx+12] + movss [edx], xmm0; + add ecx, 16; + + movss xmm0, [eax]; + mulss xmm0, [ecx]; + movss xmm1, [eax+4]; + mulss xmm1, [ecx+4]; + movss xmm2, [eax+8]; + mulss xmm2, [ecx+8]; + addss xmm0, xmm1; + addss xmm0, xmm2; + addss xmm0, [ecx+12] + movss [edx+4], xmm0; + add ecx, 16; + + movss xmm0, [eax]; + mulss xmm0, [ecx]; + movss xmm1, [eax+4]; + mulss xmm1, [ecx+4]; + movss xmm2, [eax+8]; + mulss xmm2, [ecx+8]; + addss xmm0, xmm1; + addss xmm0, xmm2; + addss xmm0, [ecx+12] + movss [edx+8], xmm0; + } +#elif POSIX + #warning "VectorTransformSSE C implementation only" + out1[0] = DotProduct(in1, in2[0]) + in2[0][3]; + out1[1] = DotProduct(in1, in2[1]) + in2[1][3]; + out1[2] = DotProduct(in1, in2[2]) + in2[2][3]; +#else + #error "Not Implemented" +#endif +} + +void VectorRotateSSE( const float *in1, const matrix3x4_t& in2, float *out1 ) +{ + Assert( s_bMathlibInitialized ); + Assert( in1 != out1 ); + +#ifdef _WIN32 + __asm + { + mov eax, in1; + mov ecx, in2; + mov edx, out1; + + movss xmm0, [eax]; + mulss xmm0, [ecx]; + movss xmm1, [eax+4]; + mulss xmm1, [ecx+4]; + movss xmm2, [eax+8]; + mulss xmm2, [ecx+8]; + addss xmm0, xmm1; + addss xmm0, xmm2; + movss [edx], xmm0; + add ecx, 16; + + movss xmm0, [eax]; + mulss xmm0, [ecx]; + movss xmm1, [eax+4]; + mulss xmm1, [ecx+4]; + movss xmm2, [eax+8]; + mulss xmm2, [ecx+8]; + addss xmm0, xmm1; + addss xmm0, xmm2; + movss [edx+4], xmm0; + add ecx, 16; + + movss xmm0, [eax]; + mulss xmm0, [ecx]; + movss xmm1, [eax+4]; + mulss xmm1, [ecx+4]; + movss xmm2, [eax+8]; + mulss xmm2, [ecx+8]; + addss xmm0, xmm1; + addss xmm0, xmm2; + movss [edx+8], xmm0; + } +#elif POSIX + #warning "VectorRotateSSE C implementation only" + out1[0] = DotProduct( in1, in2[0] ); + out1[1] = DotProduct( in1, in2[1] ); + out1[2] = DotProduct( in1, in2[2] ); +#else + #error "Not Implemented" +#endif +} + +#ifdef _WIN32 +void _declspec(naked) _SSE_VectorMA( const float *start, float scale, const float *direction, float *dest ) +{ + // FIXME: This don't work!! It will overwrite memory in the write to dest + Assert(0); + + Assert( s_bMathlibInitialized ); + _asm { // Intel SSE only routine + mov eax, DWORD PTR [esp+0x04] ; *start, s0..s2 + mov ecx, DWORD PTR [esp+0x0c] ; *direction, d0..d2 + mov edx, DWORD PTR [esp+0x10] ; *dest + movss xmm2, [esp+0x08] ; x2 = scale, 0, 0, 0 +#ifdef ALIGNED_VECTOR + movaps xmm3, [ecx] ; x3 = dir0,dir1,dir2,X + pshufd xmm2, xmm2, 0 ; x2 = scale, scale, scale, scale + movaps xmm1, [eax] ; x1 = start1, start2, start3, X + mulps xmm3, xmm2 ; x3 *= x2 + addps xmm3, xmm1 ; x3 += x1 + movaps [edx], xmm3 ; *dest = x3 +#else + movups xmm3, [ecx] ; x3 = dir0,dir1,dir2,X + pshufd xmm2, xmm2, 0 ; x2 = scale, scale, scale, scale + movups xmm1, [eax] ; x1 = start1, start2, start3, X + mulps xmm3, xmm2 ; x3 *= x2 + addps xmm3, xmm1 ; x3 += x1 + movups [edx], xmm3 ; *dest = x3 +#endif + } +} +#endif + +#ifdef _WIN32 +#ifdef PFN_VECTORMA +void _declspec(naked) __cdecl _SSE_VectorMA( const Vector &start, float scale, const Vector &direction, Vector &dest ) +{ + // FIXME: This don't work!! It will overwrite memory in the write to dest + Assert(0); + + Assert( s_bMathlibInitialized ); + _asm + { + // Intel SSE only routine + mov eax, DWORD PTR [esp+0x04] ; *start, s0..s2 + mov ecx, DWORD PTR [esp+0x0c] ; *direction, d0..d2 + mov edx, DWORD PTR [esp+0x10] ; *dest + movss xmm2, [esp+0x08] ; x2 = scale, 0, 0, 0 +#ifdef ALIGNED_VECTOR + movaps xmm3, [ecx] ; x3 = dir0,dir1,dir2,X + pshufd xmm2, xmm2, 0 ; x2 = scale, scale, scale, scale + movaps xmm1, [eax] ; x1 = start1, start2, start3, X + mulps xmm3, xmm2 ; x3 *= x2 + addps xmm3, xmm1 ; x3 += x1 + movaps [edx], xmm3 ; *dest = x3 +#else + movups xmm3, [ecx] ; x3 = dir0,dir1,dir2,X + pshufd xmm2, xmm2, 0 ; x2 = scale, scale, scale, scale + movups xmm1, [eax] ; x1 = start1, start2, start3, X + mulps xmm3, xmm2 ; x3 *= x2 + addps xmm3, xmm1 ; x3 += x1 + movups [edx], xmm3 ; *dest = x3 +#endif + } +} +float (__cdecl *pfVectorMA)(Vector& v) = _VectorMA; +#endif +#endif + + +// SSE DotProduct -- it's a smidgen faster than the asm DotProduct... +// Should be validated too! :) +// NJS: (Nov 1 2002) -NOT- faster. may time a couple cycles faster in a single function like +// this, but when inlined, and instruction scheduled, the C version is faster. +// Verified this via VTune +/* +vec_t DotProduct (const vec_t *a, const vec_t *c) +{ + vec_t temp; + + __asm + { + mov eax, a; + mov ecx, c; + mov edx, DWORD PTR [temp] + movss xmm0, [eax]; + mulss xmm0, [ecx]; + movss xmm1, [eax+4]; + mulss xmm1, [ecx+4]; + movss xmm2, [eax+8]; + mulss xmm2, [ecx+8]; + addss xmm0, xmm1; + addss xmm0, xmm2; + movss [edx], xmm0; + fld DWORD PTR [edx]; + ret + } +} +*/ + +#endif // COMPILER_MSVC64 -- cgit v1.2.3